-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
executable file
·123 lines (102 loc) · 3.33 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
import os
import re
import sys
from multiprocessing import Pool
import requests
from pyquery import PyQuery as pq
from tqdm import trange, tqdm
def extract_books_from_url(url):
    """Collect the books linked from one series listing page.

    :param url: str address of the listing page to scrape
    :return: [{'title': xxx, 'url': xxx}], one entry per book link found,
        in the order the links appear on the page
    :raises requests.exceptions.Timeout: if the server does not answer in time
    """
    print('Processing', url)
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=30)
    document = pq(response.text)
    links = document('h3.product-hdg a')
    if not links:
        # A series that contains only one book has no <h3> headings; its link
        # sits in a <span> instead, e.g. https://bookwalker.jp/series/120052/
        links = document('span.product-hdg a')
    # Build the result without rebinding the ``url`` parameter.
    return [
        {'title': pq(link).text().strip(), 'url': pq(link).attr('href')}
        for link in links
    ]
def generate_urls_by_series_page(series, max_page):
    """Yield the listing-page URL of a series for pages 1 through ``max_page``.

    :param series: str/int series number placed in the URL
    :param max_page: int number of the last page (inclusive)
    :return: generator of str URLs, in ascending page order
    """
    template = 'https://bookwalker.jp/series/{}/page{}/'
    yield from (
        template.format(series, page_number)
        for page_number in range(1, max_page + 1)
    )
def extract_books_from_series(series):
    """Extract every book of a series, e.g. https://bookwalker.jp/series/4206/

    The first series page is fetched to read the series title and the number
    of listing pages; each listing page is then scraped in turn.

    :param series: str/int series number as used in the series URL
    :return: (series_title, [{'title': xxx, 'url': xxx}])
    :raises requests.exceptions.Timeout: if the server does not answer in time
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(
        'https://bookwalker.jp/series/{}/'.format(series), timeout=30)
    document = pq(response.text)
    series_title = document('span.overview-hdg-txt').text()
    print(series_title)
    # The link in the last pager entry is read as the highest page number;
    # when the page has no pager, a single page is assumed.
    last_page_link = document('ul.pager-num li:last a')
    max_page = int(last_page_link.text()) if last_page_link else 1
    print('Total page number', max_page)
    books = []
    for url in generate_urls_by_series_page(series, max_page):
        books.extend(extract_books_from_url(url))
    return series_title, books
def decode_cover_number(number):
    """Turn the number found in a thumbnail URL into the cover image number.

    The decimal digits are reversed, read back as an integer, and one is
    subtracted.

    :param number: str/int digits taken from the thumbnail URL
    :return: int decoded cover number
    """
    digits = str(number)
    mirrored = ''.join(reversed(digits))
    return int(mirrored) - 1
def download_cover(folder, book):
    """Download the cover image of one book into ``folder``.

    The book page is fetched to read the number embedded in its ``og:image``
    thumbnail address. That number is decoded with ``decode_cover_number`` and
    used to build the ``coverImage_<n>.jpg`` address. The image is saved as
    ``coverImage_<n> <title>.jpg``; a file that already exists is left alone.

    :param folder: str folder_name
    :param book: {'title':xxx,'url':xxx}
    :return: None
    :raises ValueError: if the book page carries no matching og:image tag
    :raises requests.exceptions.RequestException: on a timeout, or when the
        image request is answered with an HTTP error status
    """
    page = requests.get(book['url'], timeout=30)
    match = re.search(
        r'<meta property="og:image" content="https://c.bookwalker.jp/(\d+)/t_700x780.jpg">',
        page.text)
    if match is None:
        # Report which page failed instead of an AttributeError on None.
        raise ValueError('No cover image found on {}'.format(book['url']))
    ori_number = decode_cover_number(match.group(1))

    filename = 'coverImage_{}'.format(ori_number)
    ext = 'jpg'
    url = 'https://c.bookwalker.jp/{}.{}'.format(filename, ext)

    # A path separator inside the title would be read as a sub-directory.
    title = book['title']
    for separator in (os.sep, os.altsep):
        if separator:
            title = title.replace(separator, '_')
    filepath = os.path.join(folder, '{} {}.{}'.format(filename, title, ext))
    if os.path.exists(filepath):
        return

    image = requests.get(url, timeout=30)
    # Fail before the file is created: a saved error page would look like a
    # finished download and be skipped by the existence check from then on.
    image.raise_for_status()
    with open(filepath, 'wb') as f:
        f.write(image.content)
def update():
    """Re-run the download for every series that has a folder in ./covers.

    Folder names of the form ``<digits> <anything>`` are treated as series
    folders; the leading number is the series number. The numbers are
    printed and then processed in ascending order.
    """
    series_folder = re.compile(r'\d+ .+')
    series_numbers = sorted(
        int(name.split()[0])
        for name in os.listdir('./covers')
        if series_folder.match(name)
    )
    print(series_numbers)
    for number in series_numbers:
        download_by_series(number)
def download_by_series(series):
    """Download the covers of every book of a series.

    The covers are stored in ``./covers/<series> <series title>`` and fetched
    by a pool of 4 worker processes while a progress bar counts the books
    that have been processed.

    :param series: str/int
    """
    series_title, books = extract_books_from_series(series)
    # A path separator inside the title would be read as a sub-directory.
    for separator in (os.sep, os.altsep):
        if separator:
            series_title = series_title.replace(separator, '_')
    folder = './covers/{} {}'.format(series, series_title)
    # makedirs also creates ./covers itself when it does not exist yet.
    os.makedirs(folder, exist_ok=True)

    bar = trange(len(books))

    def on_done(_result):
        bar.update(1)

    def on_error(error):
        # apply_async discards worker exceptions unless an error_callback is
        # given; report the failure and still advance the bar.
        tqdm.write('Download failed: {!r}'.format(error))
        bar.update(1)

    pool = Pool(4)
    try:
        for book in books:
            pool.apply_async(download_cover, [folder, book],
                             callback=on_done, error_callback=on_error)
        pool.close()
        pool.join()
    finally:
        # Harmless after a normal join; stops the workers on interruption.
        pool.terminate()
        bar.close()
def main():
    """Command-line entry point.

    The request is taken from the first command-line argument or, when none
    is given, read from a prompt. The word ``update`` calls ``update()``;
    anything else is treated as a comma-separated list of series numbers,
    each of which is passed to ``download_by_series``.
    """
    if len(sys.argv) > 1:
        user_input = sys.argv[1]
    else:
        user_input = input('Input series/update:')
    user_input = user_input.strip()
    if user_input == 'update':
        update()
        return
    for series in user_input.split(','):
        # "4206, 120052" or a trailing comma must not put blanks or empty
        # ids into the series URL and the folder name.
        series = series.strip()
        if series:
            download_by_series(series)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()