updated download.sh for google drive header changes
Commit 6338b1a (parent 19b636c)
Showing 7 changed files with 628 additions and 265 deletions.
346 changes: 245 additions & 101 deletions
widgets/Utilities/S3_download/Dockerfiles/download.sh
@@ -1,113 +1,257 @@
#!/bin/bash

awsdir=$1
bucket=$2
outputDir=$3

error=0
mkdir -p $outputDir
cp -r $awsdir/* /root/.aws || exit 1

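Presumably $awsdir holds a standard AWS CLI configuration tree, since it is copied wholesale into /root/.aws (an assumption; the commit does not show its contents):

# assumed layout of $awsdir (not shown in the commit):
#   credentials   # [default] aws_access_key_id / aws_secret_access_key
#   config        # [default] region = ...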
copy_wildcard(){
    echo "parsing wildcards in $1"
    local my_str=$1
    local max_attempts=4
    #if there is no / then we search from the base bucket
    local my_glob=""
    local wildcard=$my_str
    if [[ $my_str == */* ]]; then
        #split into a glob (directory) and wildcard string
        no_wc="${my_str%%['!'@#\$%^\&*()+]*}"
        my_glob="${no_wc%/*}"/
        wildcard="${my_str#$my_glob}"
    fi
    local command=(nice aws s3 cp --exclude "*" --include="$wildcard" --recursive s3://$bucket/$my_glob $outputDir)
    local attempts
    for attempts in $(seq 1 $max_attempts); do
        echo "${command[@]}"
        if "${command[@]}" ; then
            return
        fi
    done
    echo "error in ${command[@]}"
    exit 1
}

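For example (a hypothetical key), the parameter expansions above split a wildcard path like so:

# my_str='results/sample*.bam'   (illustration only)
#   no_wc='results/sample'       # cut at the first wildcard character
#   my_glob='results/'           # directory part
#   wildcard='sample*.bam'       # pattern passed to --include
# -> nice aws s3 cp --exclude "*" --include="sample*.bam" --recursive s3://$bucket/results/ $outputDir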
function checkFilename(){
    echo "check zero length"
    [[ -n "$filename" ]] || return 1
    echo "check too long"
    [[ ${#filename} -lt 255 ]] || return 1
    echo "check character"
    [[ $filename =~ ^[0-9a-zA-Z._-]+$ ]] || return 1
    echo "check first char"
    [[ $(echo $filename | cut -c1-1) =~ ^[0-9a-zA-Z]+$ ]] || return 1
    return 0
}

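A quick illustration of what checkFilename accepts (hypothetical names; the function reads the global $filename and also echoes its progress lines):

# illustration only, not part of the commit
for filename in "data.tar.gz" ".hidden" "bad name.txt" "ok_file-1"; do
    checkFilename && echo "accept: $filename" || echo "reject: $filename"
done
# accepts data.tar.gz and ok_file-1; rejects .hidden (leading dot) and "bad name.txt" (space)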
copy_directory(){
    echo "copying directory object $1"
    local attempts
    local max_attempts=2
    local command=(nice aws s3 cp --recursive s3://$bucket/$1 $outputDir/$dest)
    for attempts in $(seq 1 $max_attempts); do
        echo "${command[@]}"
        if "${command[@]}" ; then
            return
        fi
    done
    echo "error in ${command[@]}"
    exit 1
}

function getFilename(){
    curlret=0
    unset filename
    echo "finding filename for url $url"
    tempDir="$(mktemp -d /tmp/XXXXXXXXX)"
    #make a temporary directory without write permissions to force curl to quit after obtaining filename
    chmod -w $tempDir
    filename=$(cd $tempDir; su user -c "wget --content-disposition $url |& grep denied | sed 's/.*denied //; s/:.*//'")
    checkFilename && return
    filename=$(su user -c "curl -JLO $url |& grep -m 1 Warning | sed 's/.* file //; s/:.*//'")
    checkFilename && return
    filename=$(basename "$url")
    checkFilename && return
    filename="${url##*/}"
}
function getRequestURL(){
    local content_url="https://drive.google.com/uc?export=download&id=$1"
    local html_content=$(curl -c ./cookies.txt -s -L "$content_url")
    # Extract values using grep and sed
    local id=$(echo "$html_content" | grep -o 'name="id" value="[^"]*' | sed 's/name="id" value="//')
    local export=$(echo "$html_content" | grep -o 'name="export" value="[^"]*' | sed 's/name="export" value="//')
    local confirm=$(echo "$html_content" | grep -o 'name="confirm" value="[^"]*' | sed 's/name="confirm" value="//')
    local uuid=$(echo "$html_content" | grep -o 'name="uuid" value="[^"]*' | sed 's/name="uuid" value="//')

copy_file(){
    echo "copying file object $1"
    dest=$(basename $1)
    local attempts
    local max_attempts=2
    local command=(nice aws s3 cp s3://$bucket/$1 $outputDir/$dest)
    for attempts in $(seq 1 $max_attempts); do
        echo "${command[@]}"
        if "${command[@]}" ; then
            return
        fi
    done
    echo "error in ${command[@]}"
    exit 1
}

    # Construct the request URL
    request_url="https://drive.usercontent.google.com/download?id=${id}&export=${export}&confirm=${confirm}&uuid=${uuid}"
}

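For reference, the confirmation page that getRequestURL scrapes carries a form roughly like this (a hypothetical example; field values vary per file):

# <form action="https://drive.usercontent.google.com/download" method="get">
#   <input type="hidden" name="id"      value="FILE_ID">
#   <input type="hidden" name="export"  value="download">
#   <input type="hidden" name="confirm" value="t">
#   <input type="hidden" name="uuid"    value="SOME-UUID">
# </form>
# -> request_url="https://drive.usercontent.google.com/download?id=FILE_ID&export=download&confirm=t&uuid=SOME-UUID"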
copy(){
    local my_glob=$1
    echo "$my_glob"
    if [[ $my_glob == *['!'@#\$%^\&*()+]* ]]; then
        copy_wildcard $my_glob || error=1
    elif [ "${my_glob: -1}" == "/" ]; then
        copy_directory $my_glob || error=1
    else
        copy_file $my_glob || error=1
    fi
}
function findGoogleFilename(){
    if [[ $1 == *drive.google.com/file/d/* ]]; then
        fileID=$(echo "$1" | sed -n -e 's/.*drive\.google\.com\/file\/d\///p' | sed 's:/.*::')
    else
        fileID=$(echo "$1" | sed -n -e 's/.*\?id\=//p')
        fileID=${fileID%%/*}
    fi
    echo "fileID is ${fileID}"
    filename=$(curl -s -L "$1" | sed -n -e 's/.*<meta property\="og\:title" content\="//p' | sed -n -e 's/">.*//p')
    echo curl -s -L "$1"
    [ -z "$filename" ] && echo "Unable to find filename - cannot download from google" && exit 1
    #first see if the file can be downloaded directly by checking the header when downloading to a non-writable directory
    tempDir="$(mktemp -d /tmp/XXXXXXXXX)"
    #make a temporary directory without write permissions to force curl to quit after obtaining filename
    chmod -w $tempDir
    cookie=$(cd $tempDir; su user bash -c "curl -I -L 'https://docs.google.com/uc?export=download&id=$fileID'" | grep -o -P '(?<=set-cookie: ).*' | sed 's/;.*//')
}

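The two URL shapes findGoogleFilename understands, and the fileID each yields (hypothetical IDs):

# https://drive.google.com/file/d/1A2b3C4d/view             -> fileID=1A2b3C4d
# https://drive.google.com/uc?export=download&id=1A2b3C4d   -> fileID=1A2b3C4d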
multiCopy(){
    lasti=$((${#globs[@]} - 1))
    for i in $(seq 0 ${lasti}); do
        if ( mkdir $lockDir/lock$i 2> /dev/null ); then
            glob=${globs[i]}
            echo "thread $1 copying $glob"
            copy $glob
        fi
    done
}
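multiCopy leans on mkdir being atomic: exactly one thread succeeds in creating lock$i, so that thread claims glob i. The same pattern in isolation (an illustration, not part of the commit):

# mkdir-as-mutex: succeeds for exactly one caller per index
claim() { mkdir "$lockDir/lock$1" 2>/dev/null; }
# claim 3 && echo "this thread processes ${globs[3]}"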
function decompString(){
    unset dcmd
    unset zipFlag
    if [ -n "$decompress" ]; then
        case $1 in
            *.tar.bz2 | *.tbz2 )
                if [ -n "$concatenateFile" ]; then
                    dcmd="| tar -xjOf - >> $concatenateFile"
                else
                    dcmd='| tar -xjf -'
                fi
                return
                ;;
            *.tar.gz | *.tgz)
                if [ -n "$concatenateFile" ]; then
                    dcmd="| tar -xzOf - >> $concatenateFile"
                else
                    dcmd='| tar -xzf -'
                fi
                return
                ;;
            *.tar)
                if [ -n "$concatenateFile" ]; then
                    dcmd="| tar -xOf - >> $concatenateFile"
                else
                    dcmd='| tar -xf -'
                fi
                return
                ;;
            *.gz)
                if [ -n "$concatenateFile" ]; then
                    dcmd="| gzip -d >> $concatenateFile"
                else
                    local outputname=$(basename "$1" .gz)
                    dcmd="| gzip -d > $outputname"
                fi
                return
                ;;
            *.bz2)
                if [ -n "$concatenateFile" ]; then
                    dcmd="| bzip2 -d >> $concatenateFile"
                else
                    local outputname=$(basename "$1" .bz2)
                    dcmd="| bzip2 -d > $outputname"
                fi
                return
                ;;
            *.zip)
                zipFlag=1
                if [ -n "$concatenateFile" ]; then
                    dcmd="&& unzip -p '$1' >> $concatenateFile"
                else
                    dcmd="&& unzip -o '$filename' && rm '$filename'"
                fi
                return
                ;;
        esac
    fi
    if [ -n "$concatenateFile" ]; then
        dcmd=">> $concatenateFile"
    else
        dcmd="-o '$filename'"
    fi
}
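Summarizing what decompString yields when --decompress is set and no --concatenateFile is given:

# archive.tar.gz / .tgz    -> dcmd='| tar -xzf -'
# archive.tar.bz2 / .tbz2  -> dcmd='| tar -xjf -'
# archive.tar              -> dcmd='| tar -xf -'
# data.gz                  -> dcmd='| gzip -d > data'
# data.bz2                 -> dcmd='| bzip2 -d > data'
# bundle.zip               -> zipFlag=1, dcmd="&& unzip -o 'bundle.zip' && rm 'bundle.zip'"
# anything else            -> dcmd="-o '$filename'"   (plain curl output)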
if [ -z "$DIRS" ] || [ "$DIRS" == "[]" ]; then
    echo "no bucket object given to download"
    exit 1
fi
globs=( $(echo $DIRS | jq -r '.[]') )

if [ -z "$nThreads" ] || (( $nThreads == 1 )) || (( $nThreads == 0 )); then
    #use single thread
    echo "Using single thread"
    for glob in "${globs[@]}"; do
        copy $glob
    done
else
    lockDir=/tmp/locks.$$
    mkdir -p $lockDir
    for i in $(seq 2 $nThreads); do
        multiCopy $i &
    done
    multiCopy 1 &
    wait
    rm -rf $lockDir
fi
while [[ $# -gt 0 ]] ; do
    case $1 in
        --decompress)
            decompress=1
            ;;
        --directory)
            mkdir -p $2
            cd $2
            shift
            ;;
        --concatenateFile)
            concatenateFile=$2
            shift
            ;;
        --noClobber)
            noClobber=1
            ;;
        *)
            urls+=("$1")
            ;;
    esac
    shift
done

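Putting the flags together, an invocation might look like this (hypothetical paths and URLs):

# ./download.sh --directory /data --decompress --noClobber \
#     'https://drive.google.com/file/d/FILE_ID/view' \
#     'https://example.com/reads.tar.gz'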
#empty the concatenateFile if it exists
#do it here instead of in the parse loop because the directory change may come after the concatenate flag
if [ -n "$concatenateFile" ]; then
    bash -c "> $concatenateFile"
fi
exit $error

#loop through the urls
status=0
for url in "${urls[@]}" ; do
    #if it falls through all code - then there is an error.
    curlret=1
    if [[ $url == *drive.google.com* ]]; then
        #find filename and fileID and keep cookie
        findGoogleFilename $url
        decompString "$filename"
        echo "google drive url is $url filename is $filename fileID is $fileID dcmd is $dcmd"
        if [[ -n "$filename" ]]; then
            if [ -n "$noClobber" ] && [ -f "$filename" ]; then
                echo "File $filename is already present, skipping download"
                continue
            fi
            if [ -z "$cookie" ]; then
                echo "No virus check page - no verification needed"
                cmd="curl -L 'https://docs.google.com/uc?export=download&id=${fileID}' "
            else
                echo "We need to pass the virus check"
                getRequestURL $fileID
                echo "request url is $request_url"
                cmd="curl -Lb ./cookies.txt '${request_url}' "
            fi
            if [[ -n $zipFlag ]]; then
                cmd+="-o '$filename' "
            fi
            cmd+="$dcmd"
            echo "$cmd"
            bash -c "$cmd"
            curlret=$?
            rm -f ./cookies.txt
        else
            echo "did not download $url - can't find filename - authentication may be required"
        fi
    else
        echo "url $url is not from google drive"
        getFilename
        if [ -n "$noClobber" ] && [ -f "$filename" ]; then
            echo "File $filename is already present, skipping download"
            continue
        fi
        echo "$filename"
        if [ -n "$decompress" ]; then
            # check for a log that should contain all extracted objects
            if [ -s "$filename.log" ] && [ -n "$noClobber" ]; then
                skipDownload=true
                while read f; do
                    if [ ! -e "$f" ]; then
                        # if we are here then one of the extracted objects does not exist
                        skipDownload=false
                        break
                    fi
                done < $filename.log
                if $skipDownload; then
                    continue
                fi
            fi
            decompString "$filename"
            # make a temp directory to store file content then move to permanent location after
            tmpdir=$(mktemp -d -p $PWD)
            pushd $tmpdir > /dev/null
            if [[ -n $zipFlag ]]; then
                cmd="curl -JLO $url $dcmd"
            else
                cmd="curl -L $url $dcmd"
            fi
            echo "$cmd"
            bash -c "$cmd"
            curlret=$?
            if [ $curlret -eq 0 ]; then
                # store a log file to prevent script from downloading again
                find -not -name . > ../$filename.log
                # set dot glob to move hidden files and directories
                shopt -s dotglob
                mv * ../
                # unset dot glob to avoid trouble
                shopt -u dotglob
            fi
            popd > /dev/null
            rmdir $tmpdir
        else
            if [ -n "$concatenateFile" ]; then
                cmd="curl $url >> $concatenateFile"
            else
                cmd="curl -JLO $url"
            fi
            echo "$cmd"
            bash -c "$cmd"
            curlret=$?
        fi
    fi
    if [ $curlret -ne 0 ]; then
        status=1
    fi
done

exit $status