-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_box_office.py
69 lines (61 loc) · 2.1 KB
/
get_box_office.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""get the All Time Worldwide Box Office
only top 600, as of Aug 2017."""
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import locale
from help import logging, set_logging, write_json
# URL template for one chart page. The placeholder receives the page's start
# index (1, 101, 201, ... given the step below).
base_url = (
    'http://www.the-numbers.com/box-office-records/worldwide/'
    'all-movies/cumulative/all-time/{}'
)

# Default file the scraped records are written to.
output_p = Path('data/movie_box.json')

# Step between the start indices of consecutive pages.
num_per_page = 100

# Exclusive upper bound for the start index (original note: max 19461).
total_num = 19461

# Selector for the chart rows. The path copied from the browser was
#   #page_filling_chart > center > table > tbody > tr:nth-child(1)
# and the "center" step has to be removed for it to match.
main_sele = '#page_filling_chart table tbody tr'

# Per-row selectors, keyed by the field name used in the output records.
# NOTE: the 'internatonal_box' spelling is kept as-is because the key is
# written to the output file.
dict_sele = {
    'rank': 'td:nth-of-type(1)',
    'year': 'td:nth-of-type(2) > a',
    'name': 'td:nth-of-type(3) > b > a',
    'world_box': 'td:nth-of-type(4)',
    'domestic_box': 'td:nth-of-type(5)',
    'internatonal_box': 'td:nth-of-type(6)',
}
def _parse_int(text):
    """Convert chart text such as '1,234' or '$1,234,567' to an int."""
    cleaned = text.strip().lstrip('$').replace(',', '')
    try:
        return int(cleaned)
    except ValueError:
        # Tolerate a fractional amount by truncating it to whole units.
        return int(float(cleaned))


def _parse_row(movie):
    """Turn one chart row into a dict keyed like ``dict_sele``.

    Returns None when the row has no rank cell at all (e.g. a header row),
    so the caller can skip it. Any other missing or blank cell becomes None
    in the returned dict.
    """
    if movie.select_one(dict_sele['rank']) is None:
        return None
    record = {}
    for key, selector in dict_sele.items():
        cell = movie.select_one(selector)
        raw = cell.get_text() if cell is not None else ''
        text = raw.strip()
        if not text:
            value = None
        elif key in ('rank', 'year') or key.endswith('_box'):
            value = _parse_int(text)
        else:
            # Non-numeric fields (the name) keep their text unchanged.
            value = raw
        record[key] = value
    return record


def get_movie_box(out_p=output_p, *, timeout=30):
    """Scrape every chart page and write the collected records as JSON.

    Pages are requested for start indices ``1, 1 + num_per_page, ...`` below
    ``total_num``. Each data row becomes a dict with the keys of
    ``dict_sele``; rank, year and the ``*_box`` fields are ints, blank or
    missing cells are None. Numbers are parsed without changing the process
    locale.

    Args:
        out_p: Destination passed to ``write_json``.
        timeout: Seconds to wait for each HTTP request, so that a stalled
            connection cannot hang the run forever.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a numeric cell contains text that is not a number.
    """
    ret = []
    for idx in range(1, total_num, num_per_page):
        resp = requests.get(base_url.format(idx), timeout=timeout)
        if not resp.ok:
            # An error page has no chart rows; skip it instead of parsing it.
            logging.warning(f'skip page starting at {idx}: HTTP {resp.status_code}')
            continue
        soup = BeautifulSoup(resp.content, 'html.parser')
        movies = soup.select(main_sele)
        logging.info(f'num of movies on page: {len(movies)}')
        for movie in movies:
            m = _parse_row(movie)
            if m is None:
                continue
            ret.append(m)
            logging.info(f'add rank:{m["rank"]}, name:{m["name"]}')
    write_json(out_p, ret)
def main(force=True):
    """Set up logging and run the box office scrape.

    Args:
        force: True by default, so the chart is always re-scraped. Pass
            False to scrape only when ``output_p`` does not exist yet.
    """
    set_logging(stream=True)
    if force or not output_p.is_file():
        get_movie_box()
#%%
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()