updated download.sh for google drive header changes
lhhunghimself committed Jan 23, 2024
1 parent 19b636c commit 6338b1a
Showing 7 changed files with 628 additions and 265 deletions.
346 changes: 245 additions & 101 deletions widgets/Utilities/S3_download/Dockerfiles/download.sh
@@ -1,113 +1,257 @@
#!/bin/bash

awsdir=$1
bucket=$2
outputDir=$3

error=0
mkdir -p $outputDir
cp -r $awsdir/* /root/.aws || exit 1

copy_wildcard(){
echo "parsing wildcards in $1"
local my_str=$1
local max_attempts=4
#if there is no / then we search from the base bucket
local my_glob=""
local wildcard=$my_str
if [[ $my_str == */* ]]; then
#split into a glob (directory) and wildcard string
no_wc="${my_str%%['!'@#\$%^\&*()+]*}"
my_glob="${no_wc%/*}"/
wildcard="${my_str#$my_glob}"
fi
local command=(nice aws s3 cp --exclude "*" --include="$wildcard" --recursive s3://$bucket/$my_glob $outputDir)
local attempts
for attempts in $(seq 1 $max_attempts); do
echo "${command[@]}"
if "${command[@]}" ; then
return
fi
done
echo "error in ${command[@]}"
exit 1
}

function checkFilename(){
echo "check zero length"
[[ -n "$filename" ]] || return 1
echo "check too long"
[[ ${#filename} -lt 255 ]] || return 1
echo "check character"
[[ $filename =~ ^[0-9a-zA-Z._-]+$ ]] || return 1
echo "check first char"
[[ ${filename:0:1} =~ ^[0-9a-zA-Z]$ ]] || return 1
return 0
}

copy_directory(){
echo "copying directory object $1"
local attempts
local max_attempts=2
local command=(nice aws s3 cp --recursive s3://$bucket/$1 $outputDir/$dest)
for attempts in $(seq 1 $max_attempts); do
echo "${command[@]}"
if "${command[@]}" ; then
return
fi
done
echo "error in ${command[@]}"
exit 1
}

function getFilename(){
curlret=0
unset filename
echo "finding filename for url $url"
tempDir="$(mktemp -d /tmp/XXXXXXXXX)"
#make a temporary directory without write permissions to force wget/curl to quit right after reporting the filename
chmod -w $tempDir
filename=$(cd $tempDir; su user -c "wget --content-disposition $url |& grep denied | sed 's/.*denied //; s/:.*//'")
checkFilename && return
filename=$(su user -c "curl -JLO $url |& grep -m 1 Warning | sed 's/.* file //; s/:.*//'")
checkFilename && return
filename=$(basename "$url")
checkFilename && return
filename="${url##*/}"
}
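#editor's note: getFilename tries three fallbacks to learn the server-side name:
#(1) run wget --content-disposition in the read-only temp dir so it aborts
#immediately and leaks the name in its "Permission denied" error, (2) parse the
#warning curl -JLO prints when it cannot write the file, (3) fall back to the
#last path component of the URL. checkFilename validates each candidate.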
function getRequestURL(){
local content_url="https://drive.google.com/uc?export=download&id=$1"
local html_content=$(curl -c ./cookies.txt -s -L "$content_url")
# Extract values using grep and sed
local id=$(echo "$html_content" | grep -o 'name="id" value="[^"]*' | sed 's/name="id" value="//')
local export=$(echo "$html_content" | grep -o 'name="export" value="[^"]*' | sed 's/name="export" value="//')
local confirm=$(echo "$html_content" | grep -o 'name="confirm" value="[^"]*' | sed 's/name="confirm" value="//')
local uuid=$(echo "$html_content" | grep -o 'name="uuid" value="[^"]*' | sed 's/name="uuid" value="//')

copy_file(){
echo "copying file object $1"
dest=$(basename "$1")
local attempts
local max_attempts=2
local command=(nice aws s3 cp s3://$bucket/$1 $outputDir/$dest)
for attempts in $(seq 1 $max_attempts); do
echo "${command[@]}"
if "${command[@]}" ; then
return
fi
done
echo "error in ${command[@]}"
exit 1
}
# Construct the request URL
request_url="https://drive.usercontent.google.com/download?id=${id}&export=${export}&confirm=${confirm}&uuid=${uuid}"

}
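#editor's note: for files Google cannot virus-scan, the uc?export=download URL
#returns an HTML warning form instead of the file. getRequestURL scrapes the
#hidden id/export/confirm/uuid inputs from that form and rebuilds the direct
#link on drive.usercontent.google.com, e.g. (values hypothetical):
#https://drive.usercontent.google.com/download?id=<FILEID>&export=download&confirm=t&uuid=<UUID>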

copy(){
local my_glob=$1
echo "$my_glob"
if [[ $my_glob == *['!'@#\$%^\&*()+]* ]]; then
copy_wildcard $my_glob || error=1
elif [ "${my_glob: -1}" == "/" ]; then
copy_directory $my_glob || error=1
else
copy_file $my_glob || error=1
fi
}
function findGoogleFilename(){
if [[ $1 == *drive.google.com/file/d/* ]]; then
fileID=$(echo "$1" | sed -n -e 's/.*drive\.google\.com\/file\/d\///p' | sed 's:/.*::')
else
fileID=$(echo "$1" | sed -n -e 's/.*\?id\=//p')
fileID=${fileID%%/*}
fi
echo "fileID is ${fileID}"
filename=$(curl -s -L "$1" | sed -n -e 's/.*<meta property\="og\:title" content\="//p' | sed -n -e 's/">.*//p')
echo curl -s -L "$1"
[ -z "$filename" ] && echo "Unable to find filename - cannot download from google" && exit 1
#first see if the file can be downloaded directly by checking the header when downloading to a non-writable directory
tempDir="$(mktemp -d /tmp/XXXXXXXXX)"
#make a temporary directory without write permissions to force curl to quit after obtaining filename
chmod -w $tempDir
cookie=$(cd $tempDir; su user bash -c "curl -I -L 'https://docs.google.com/uc?export=download&id=$fileID'" | grep -o -P '(?<=set-cookie: ).*' | sed 's/;.*//')
}
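#editor's note: the HEAD request above only probes for a download-warning
#cookie. If Google serves the file directly no cookie is set and $cookie stays
#empty; if the "can't scan for viruses" page comes back, the main loop calls
#getRequestURL to build a confirmed download URL and replays cookies.txt.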

multiCopy(){
lasti=$((${#globs[@]} - 1))
for i in $(seq 0 ${lasti}); do
if ( mkdir $lockDir/lock$i 2> /dev/null ); then
glob=${globs[i]}
echo "thread $1 copying $glob"
copy $glob
fi
done
}
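#editor's note: each multiCopy worker walks the full glob list and claims item
#i by atomically creating $lockDir/lock$i; the mkdir fails for indices another
#worker already claimed, so every glob is copied exactly once across threads.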
function decompString(){
unset dcmd
unset zipFlag
if [ -n "$decompress" ]; then
case $1 in
*.tar.bz2 | *.tbz2 )
if [ -n "$concatenateFile" ]; then
dcmd="| tar -xjOf - >> $concatenateFile"
else
dcmd='| tar -xjf -'
fi
return
;;
*.tar.gz | *.tgz)
if [ -n "$concatenateFile" ]; then
dcmd="| tar -xzOf - >> $concatenateFile"
else
dcmd='| tar -xzf -'
fi
return
;;
*.tar)
if [ -n "$concatenateFile" ]; then
dcmd="| tar -xOf - >> $concatenateFile"
else
dcmd='| tar -xf -'
fi
return
;;
*.gz)
if [ -n "$concatenateFile" ]; then
dcmd="| gzip -d >> $concatenateFile"
else
local outputname=$(basename "$1" .gz)
dcmd="| gzip -d > $outputname"
fi
return
;;
*.bz2)
if [ -n "$concatenateFile" ]; then
dcmd="| bzip2 -d >> $concatenateFile"
else
local outputname=$(basename "$1" .bz2)
dcmd="| bzip2 -d > $outputname"
fi
return
;;
*.zip)
zipFlag=1
if [ -n "$concatenateFile" ]; then
dcmd="&& unzip -p >> $concatenateFile"
else
dcmd="&& unzip -o '$filename' && rm '$filename'"
fi
return
;;
esac
fi
if [ -n "$concatenateFile" ]; then
dcmd=">> $concatenateFile"
else
dcmd="-o $filename"
fi
}
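#editor's note (illustrative): with --decompress and no --concatenateFile,
#decompString turns "data.tar.gz" into dcmd='| tar -xzf -' so the archive is
#streamed straight from curl into tar; "data.zip" instead sets zipFlag because
#unzip cannot read from a pipe, so the zip is saved first and extracted after.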
if [ -z "$DIRS" ] || [ "$DIRS" == "[]" ]; then
echo "no bucket object given to download"
exit 1
fi
globs=( $(echo $DIRS | jq -r '.[]') )

if [ -z $nThreads ] || (( $nThreads == 1 )) || (( $nThreads == 0 )); then
#use single thread
echo "Using single thread"
for glob in "${globs[@]}"; do
copy $glob
done
else
lockDir=/tmp/locks.$$
mkdir -p $lockDir
for i in $(seq 2 $nThreads); do
multiCopy $i &
done
multiCopy 1 &
wait
rm -rf $lockDir
fi
while [[ $# -gt 0 ]] ; do
case $1 in
--decompress)
decompress=1
;;
--directory)
mkdir -p $2
cd $2
shift
;;
--concatenateFile)
concatenateFile=$2
shift
;;
--noClobber)
noClobber=1
;;
*)
urls+=("$1")
;;
esac
shift
done
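#hypothetical invocation (not from the repo), showing the options parsed above:
#  ./download.sh --directory /data --decompress --noClobber \
#      'https://drive.google.com/file/d/<FILEID>/view' \
#      'https://example.com/archive.tar.gz'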

#empty the concatenateFile if it exists
#do it here instead of in the parse loop in case the --directory change comes after --concatenateFile
if [ -n "$concatenateFile" ]; then
bash -c "> $concatenateFile"
fi
exit $error

#loop through the urls
status=0
for url in "${urls[@]}" ; do
#if execution falls through every branch below, treat it as an error
curlret=1
if [[ $url == *drive.google.com* ]]; then
#find filename and fileID and keep cookie
findGoogleFilename $url
decompString "$filename"
echo "google drive url is $url filename is $filename fileID is $fileID dcmd is $dcmd"
if [[ -n "$filename" ]]; then
if [ -n "$noClobber" ] && [ -f "$filename" ]; then
echo "File $filename is already present, skipping download"
shift
continue
fi
if [ -z "$cookie" ]; then
echo "No virus-scan warning page, no confirmation step needed"
cmd="curl -L 'https://docs.google.com/uc?export=download&id=${fileID}' "
else
echo "We need to pass the virus check"
getRequestURL $fileID
echo "request url is $request_url"
cmd="curl -Lb ./cookies.txt '${request_url}' "
fi
if [[ -n $zipFlag ]]; then
cmd+="-o '$filename' "
fi
cmd+="$dcmd"
echo "$cmd"
bash -c "$cmd"
curlret=$?
rm -f ./cookies.txt
else
echo "did not download $url - can't find filename - authentication may be required"
fi
else
echo "url $url is not from google drive"
getFilename
if [ -n "$noClobber" ] && [ -f "$filename" ]; then
echo "File $filename is already present, skipping download"
shift
continue
fi
echo "$filename"
if [ -n "$decompress" ]; then
# check for a log that should contain all extracted objects
if [ -s "$filename.log" ] && [ -n "$noClobber" ]; then
skipDownload=true
while read f; do
if [ ! -e "$f" ]; then
# if we are here then one of the extracted objects does not exist
skipDownload=false
break
fi
done < $filename.log
if $skipDownload; then
shift
continue
fi
fi
decompString "$filename"
# make a temp directory to store file content then move to permanent location after
tmpdir=$(mktemp -d -p $PWD)
pushd $tmpdir > /dev/null
if [[ -n $zipFlag ]]; then
cmd="curl -JLO $url $dcmd"
else
cmd="curl -L $url $dcmd"
fi
echo "$cmd"
bash -c "$cmd"
curlret=$?
if [ $curlret -eq 0 ]; then
# store a log file to prevent script from downloading again
find -not -name . > ../$filename.log
# set dot glob to move hidden files and directories
shopt -s dotglob
mv * ../
# unset dot glob to avoid trouble
shopt -u dotglob
fi
popd > /dev/null
rmdir $tmpdir
else
if [ -n "$concatenateFile" ]; then
cmd="curl $url >> $concatenateFile"
else
cmd="curl -JLO $url"
fi
echo "$cmd"
bash -c "$cmd"
#curlret=$?
fi
fi
if [ $curlret -ne 0 ]; then
status=1
fi
shift
done

exit $status