main.py
import requests
from bs4 import BeautifulSoup
import os
import re
# GENIUS_API_TOKEN lives in a local secrets.py next to this script
from secrets import GENIUS_API_TOKEN


# Get artist object from Genius API
def request_artist_info(artist_name, page):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    # Send the search term as a query parameter, not as a request body
    data = {'q': artist_name}
    response = requests.get(search_url, params=data, headers=headers)
    return response
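
# For reference, each hit in response.json()['response']['hits'] carries the
# fields the functions below rely on; a trimmed sketch of one hit (the real
# payload has many more keys, and the URL below is only a placeholder):
#
#   {
#       'result': {
#           'url': 'https://genius.com/Some-artist-some-song-lyrics',
#           'primary_artist': {'name': 'Some Artist'}
#       }
#   }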

# Get Genius.com song URLs from artist object
def request_song_url(artist_name, song_cap):
    page = 1
    songs = []
    while True:
        response = request_artist_info(artist_name, page)
        json = response.json()
        # Collect up to song_cap song objects from artist
        song_info = []
        for hit in json['response']['hits']:
            if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
                song_info.append(hit)
        # Collect song URLs from song objects
        for song in song_info:
            if len(songs) < song_cap:
                url = song['result']['url']
                songs.append(url)
        if len(songs) == song_cap:
            break
        elif not json['response']['hits']:
            # No more search results; stop even with fewer than song_cap songs
            break
        else:
            page += 1
    print('Found {} songs by {}'.format(len(songs), artist_name))
    return songs
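
# Example of the expected console output for a small cap (the URLs shown are
# illustrative placeholders, not real search results):
#
#   >>> request_song_url('Ariana Grande', 2)
#   Found 2 songs by Ariana Grande
#   ['https://genius.com/Ariana-grande-song-one-lyrics',
#    'https://genius.com/Ariana-grande-song-two-lyrics']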

# Scrape lyrics from a Genius.com song URL
def scrape_song_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    # Older Genius pages keep all lyrics in a single div with class 'lyrics'
    div = html.find('div', class_='lyrics')
    if div is None:
        # Newer pages use generated class names; fall back to that container
        div = html.find('div', class_='SongPageGrid-sc-1vi6xda-0 DGVcp Lyrics__Root-sc-1ynbvzw-0 jvlKWy')
    if div is None:
        print('Divs not found for {}'.format(url))
        return ''
    lyrics = div.get_text(separator='\n')
    # Strip [Verse]/[Chorus] markers and parenthesised ad-libs
    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
    # Drop empty lines
    lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])
    return lyrics
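
# Genius changes its markup from time to time, so the hard-coded class names
# above are brittle. A quick, purely diagnostic way to see which divs a song
# page currently uses (this snippet is illustrative only):
#
#   soup = BeautifulSoup(requests.get(url).text, 'html.parser')
#   print([d.get('class') for d in soup.find_all('div') if d.get('class')])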

def write_lyrics_to_file(artist_name, song_count):
    # Create the output directory if it does not exist yet
    os.makedirs('lyrics', exist_ok=True)
    file_path = 'lyrics/' + artist_name.lower() + '.txt'
    urls = request_song_url(artist_name, song_count)
    with open(file_path, 'wb') as f:
        for url in urls:
            lyrics = scrape_song_lyrics(url)
            f.write(lyrics.encode('utf8'))
    num_lines = sum(1 for line in open(file_path, 'rb'))
    print('Wrote {} lines to file from {} songs'.format(num_lines, song_count))

if __name__ == '__main__':
    # write_lyrics_to_file('Kendrick Lamar', 100)
    write_lyrics_to_file('Ariana Grande', 2)