randommv.py
import json
import random
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
import billboard

from scraper import get_data

HEADERS = {
    'User-Agent': 'yt.py'
}


def downloadHTML(url, timeout=25):
    """Download and return the webpage at the given URL.

    Returns an empty string on failure.
    """
    assert url.startswith('http')
    try:
        req = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as err:
        print(err)
        return ''
    req.encoding = 'utf-8'
    if req.status_code == 200:
        return req.text
    print(req.status_code)
    return ''
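
# Example (hypothetical URL): downloadHTML('http://imvdb.com/calendar/2015?page=1')
# returns the page's HTML as a string, or '' if the request errors out or the
# server responds with a non-200 status code.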


def scrape_vids(params):
    """Return a list of (artist, title) tuples scraped from the given
    year and page number.
    """
    year, page = params
    print("Scraping year {} page {}".format(year, page))
    url = 'http://imvdb.com/calendar/{}?page={}'.format(year, page)
    html = downloadHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'class': 'imvdbTable'})
    rows = table.find_all('tr')
    out = []
    for row in rows:
        title_tag = row.find('h3')
        artist_tag = row.find('h4')
        if title_tag is None or artist_tag is None:
            continue  # skip header or spacer rows with no video entry
        # Drop the trailing 6-character suffix (likely a "(YYYY)" year tag) from the title.
        title = title_tag.find('a').contents[0].strip()[:-6].strip()
        artist = artist_tag.find('a').contents[0].strip()
        out.append((artist, title))
    return out


def get_mvs_by_year(year, limit=120):
    """Return a list of (artist, title) tuples for music videos released
    in the given year, shuffled and truncated to `limit` entries.
    """
    url = 'http://imvdb.com/calendar/{}?page=1'.format(year)
    html = downloadHTML(url)
    soup = BeautifulSoup(html, 'html.parser')
    # The rack title appears to end with the total video count in parentheses, e.g. "(1,234)".
    num_vids = soup.find('h3', {'class': 'rack_title'}).contents[0].split(' ')[-1]
    num_vids = int(num_vids[1:-1].replace(',', ''))
    table = soup.find('table', {'class': 'imvdbTable'})
    rows = table.find_all('tr')
    # num_pages = num_vids // len(rows) + 1
    num_pages = 10  # hard-coded cap; the computed page count above is left unused
    params = [(year, i) for i in range(1, num_pages + 1)]
    with Pool() as p:
        batches = p.map(scrape_vids, params)
    vids = []
    for batch in batches:
        vids.extend(batch)
    random.shuffle(vids)
    return vids[:limit]
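
# Example (hypothetical output): get_mvs_by_year(2015, limit=2) might return
# [('Some Artist', 'Some Title'), ('Another Artist', 'Another Title')], i.e. a
# random sample of at most `limit` videos drawn from the first 10 calendar pages.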


def fetch_mvs_data(vids, year):
    """Given a list of (artist, title) tuples, fetch the needed data for each
    video, skipping songs that appear on Billboard's year-end Hot 100 chart.
    """
    print(vids)
    print("Fetching video data")
    chart = billboard.ChartData(name='hot-100-songs', date=str(year), yearEnd=True)
    billboard_titles = {e.title.lower() for e in chart}
    # params is a list of (artist, title, rank, year) tuples.
    params = [(vid[0], vid[1], 0, year) for vid in vids
              if vid[1].lower() not in billboard_titles]
    with Pool() as p:
        return p.map(get_data, params)
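
# Example (hypothetical input; scraper.get_data is called with one
# (artist, title, rank, year) tuple per video): fetch_mvs_data([('Some Artist',
# 'Some Title')], 2015) returns one get_data result per non-Billboard video.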


if __name__ == '__main__':
    years = range(2010, 2018)  # 2010 through 2017 inclusive
    for year in years:
        print("Processing year {}".format(year))
        vids = get_mvs_by_year(year)
        data = fetch_mvs_data(vids, year)
        with open('{}_non_billboard_data.json'.format(year), 'w') as f:
            json.dump(data, f)
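
# Running `python randommv.py` writes one JSON file per year, e.g.
# 2010_non_billboard_data.json through 2017_non_billboard_data.json.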