-
Notifications
You must be signed in to change notification settings - Fork 3
/
Lyrics.py
120 lines (104 loc) · 3.92 KB
/
Lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Modified version of Lyrics.py from this repo: https://github.com/bhrigu123/Instant-Lyrics
# TODO: submit a patch?
import requests
from bs4 import BeautifulSoup
import os
import sys
import re
try:
from urllib.parse import quote_plus
except ImportError:
from urllib import quote_plus
class LyricsNotFoundException(Exception):
    """Raised by the lookup functions when lyrics cannot be retrieved."""
def get_metrolyrics(url):
    """Download a metrolyrics page and extract its verses as text.

    Args:
        url: Address of the song page to fetch.

    Returns:
        A ``(lyrics, url)`` tuple. ``lyrics`` is built from every
        ``<p class="verse">`` element on the page, joined by newlines, with
        the opening tag replaced by a newline and ``<br/>`` / ``</p>``
        replaced by a space. ``url`` is the address that was requested.

    Raises:
        LyricsNotFoundException: If the server responds with HTTP 404.
    """
    resp = requests.get(url, headers={
        # Adjacent string literals are concatenated verbatim, so the space
        # after "Intel" has to be written explicitly ("Intel Mac OS X").
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel '
                      'Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    })
    if resp.status_code == 404:
        raise LyricsNotFoundException

    soup = BeautifulSoup(resp.text, "lxml")
    verses = soup.findAll('p', attrs={'class': 'verse'})

    # Python 2 needs ``unicode`` to stringify the tags; Python 3 has no such
    # name, so fall back to ``str`` there.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str
    final_lyrics = u'\n'.join(text_type(verse) for verse in verses)

    # Strip the remaining verse markup by plain string substitution.
    final_lyrics = final_lyrics.replace('<p class="verse">', '\n')
    final_lyrics = final_lyrics.replace('<br/>', ' ')
    final_lyrics = final_lyrics.replace('</p>', ' ')
    return (final_lyrics, url)
def get_lyrics2(song):
    """Guess the metrolyrics URL for a song and fetch its lyrics.

    The URL is derived from the song's title and artist instead of being
    looked up through a search engine.

    Args:
        song: Object exposing string attributes ``artist`` and ``title``.

    Returns:
        The result of ``get_metrolyrics`` for the guessed URL.

    Raises:
        LyricsNotFoundException: If formatting the URL raises
            ``UnicodeEncodeError``, or if ``get_metrolyrics`` raises it.
    """
    # Using google isn't really scalable. Looks like they're pretty serious
    # about detecting and blocking scrapers.
    # Have to just guess the URL for now :/
    artist = song.artist.lower()

    # metrolyrics quirk: if the artist is "foo ft bar", the URL seems to
    # always contain just "foo".
    cleaved = False
    for feat in [' featuring', ' &', ' feat.']:
        feati = artist.find(feat)
        if feati != -1:
            artist = artist[:feati]
            cleaved = True
    # Only when a featured-artist marker was cut off, also keep just the
    # first name of a comma-separated artist list.
    if cleaved and ',' in artist:
        artist = artist.split(',')[0].strip()

    # Hard-coded spellings for artists whose names do not map cleanly.
    if artist == 'n sync':
        artist = 'nsync'
    if artist == 'p!nk':
        artist = 'pink'

    title = song.title.lower().replace(' & ', ' and ')
    fragment = title + ' lyrics ' + artist
    # Lowercase islands seem to come up a lot in song titles like
    # "It Wasn t Me", or "I ll Be There"
    fragment = fragment\
        .replace("'", "")\
        .replace(' s ', 's ')\
        .replace(' t ', 't ')\
        .replace(' ll ', 'll ')\
        .replace('-', '')\
        .replace('#', '')\
        .replace(".", "")\
        .replace("& ", "")\
        .replace('?', '')\
        .replace('f**k', 'fuck')
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    fragment = re.sub(r'\s+', ' ', fragment)
    fragment = fragment.replace(' ', '-')
    try:
        url = 'http://www.metrolyrics.com/{}.html'.format(fragment)
    except UnicodeEncodeError:
        # Formatting a non-ASCII fragment into a byte string can fail on
        # Python 2; treat that as "no lyrics".
        raise LyricsNotFoundException
    return get_metrolyrics(url)
def get_lyrics(song_name):
    """Find a song's metrolyrics page through a Google search and fetch it.

    Args:
        song_name: Free-text search query identifying the song.

    Returns:
        The result of ``get_metrolyrics`` for the first metrolyrics link in
        the search response whose path contains ``'lyrics'``.

    Raises:
        LyricsNotFoundException: If the search response contains no such
            link. The raw response is written to ``err.html`` in the current
            directory before raising.
    """
    song_name += ' site:metrolyrics.com'
    name = quote_plus(song_name)
    # Adjacent string literals are concatenated verbatim, so the space
    # between the AppleWebKit token and "(KHTML" is written explicitly.
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                         '(KHTML, like Gecko) Chrome/23.0.1271.64 '
                         'Safari/537.11',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    url = 'http://www.google.com/search?q=' + name
    result = requests.get(url, headers=hdr).text

    domain = 'http://www.metrolyrics.com'
    offset = 0
    while True:
        link_start = result.find(domain, offset)
        if link_start == -1:
            # Keep the response for debugging. Binary mode is required
            # because the payload is explicitly encoded to UTF-8 bytes.
            with open('err.html', 'wb') as f:
                f.write(result.encode('utf-8'))
            raise LyricsNotFoundException
        # A candidate link runs from the domain up to and including the
        # next occurrence of "html".
        link_end = result.find('html', link_start + 1)
        offset = link_start + 1
        link = result[link_start:link_end + 4]
        if 'lyrics' in link[len(domain):]:
            return get_metrolyrics(link)
if __name__ == '__main__':
    # Treat all command-line arguments as one space-separated search query.
    song = ' '.join(sys.argv[1:])
    lyrics = get_lyrics(song)
    # Parenthesised form is valid on both Python 2 and Python 3; the bare
    # ``print lyrics`` statement is a syntax error on Python 3.
    print(lyrics)