-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbatch_link.py
82 lines (59 loc) · 1.75 KB
/
batch_link.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import re
import shutil
import sys
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def slugify(value):
    """Turn an arbitrary string into a lowercase, hyphen-separated name.

    Adapted from https://stackoverflow.com/a/295466/10039085 (originally
    from the Django framework).

    Every character that is not a word character, whitespace, a hyphen or
    a dot is dropped (dots are kept so a file extension survives). The
    result is stripped and lowercased, and each run of whitespace and/or
    hyphens is collapsed into a single hyphen.

    Args:
        value: The text to normalize, e.g. the last segment of a URL.

    Returns:
        The normalized string. It is empty when no character survives.
    """
    # Both patterns are raw strings: "\s" inside a plain literal is an
    # invalid escape sequence and triggers a warning on modern Python.
    value = re.sub(r'[^\w\s\-\.]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def download_file(url, *, dest_root=None, verify=False):
    """Download *url* into a directory tree derived from the URL path.

    The two URL segments preceding the file name become sub-directories of
    ``dest_root`` and the file name itself is normalized with ``slugify``.
    For example ``https://host/a/b/My File.pdf`` is stored as
    ``<dest_root>/a/b/my-file.pdf``. The target path is printed before the
    download starts.

    Args:
        url: Absolute URL of the file to fetch.
        dest_root: Directory under which the sub-directories are created.
            Defaults to ``~/Desktop/test/seisho``.
        verify: Forwarded to ``requests.get``. The default ``False`` keeps
            the historical behaviour of skipping TLS certificate checks.

    Returns:
        The path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    parts = url.split('/')
    path_segments = parts[-3:-1]
    fname = slugify(parts[-1])

    if dest_root is None:
        # https://stackoverflow.com/a/4979569/10039085
        dest_root = os.path.join(
            os.path.expanduser('~'), 'Desktop', 'test', 'seisho'
        )
    dir_path = os.path.join(dest_root, *path_segments)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_path, exist_ok=True)

    fpath = os.path.join(dir_path, fname)
    print(fpath)

    # NOTE(security): verify=False disables certificate validation and
    # allows man-in-the-middle tampering; pass verify=True whenever the
    # server presents a valid certificate.
    with requests.get(url, stream=True, verify=verify) as r:
        # Fail before opening the file so an error page is never saved
        # under the name of the real document.
        r.raise_for_status()
        with open(fpath, 'wb') as f:
            # iter_content undoes gzip/deflate transfer encoding, which
            # copying from r.raw directly would leave in place.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    return fpath
def main(url, target):
    """Download every file linked from the page at *url* ending in *target*.

    The page is fetched and parsed, every ``<a>`` element whose ``href``
    ends with ``target`` is selected, and each link is passed to
    ``download_file``. A download that fails with a requests error is
    reported and skipped so the rest of the batch still runs. ``done`` is
    printed once all links have been processed.

    Args:
        url: Address of the page that lists the files.
        target: Suffix the ``href`` must end with, e.g. ``".pdf"``.

    Raises:
        requests.HTTPError: If the listing page itself answers with a
            4xx/5xx status.
    """
    # Function-scope import keeps this block usable even if the file's
    # import header has not been extended with urljoin.
    from urllib.parse import urljoin

    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # href=True skips anchors without an href attribute (e.g. named
    # anchors), which would otherwise raise KeyError on a['href'].
    hrefs = [
        a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].endswith(target)
    ]

    for href in hrefs:
        # Resolve against the final URL (after redirects), as a browser
        # would: handles relative, root-relative and absolute hrefs alike.
        link = urljoin(response.url, href)
        try:
            download_file(link)
        except requests.RequestException as exc:
            print('failed to download {}: {}'.format(link, exc))
    print('done')
# Command-line entry point: expects the page URL and the link suffix.
# The __main__ guard keeps an import of this module free of side effects.
if __name__ == '__main__':
    # Compare by value; "is" tests object identity and only happens to
    # work for small integers on CPython.
    if len(sys.argv) == 3:
        main(sys.argv[1], sys.argv[2])
    else:
        print('pass exactly two arguments')
        print('first argument is the url')
        print('second argument is the file ending that the links should contain (e.g. - ".pdf")')