-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathlyrics_fetcher.py
121 lines (110 loc) · 5.98 KB
/
lyrics_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import scrapy
import os
import re
import unicodedata
from mutagen.flac import FLAC
from mutagen.mp4 import MP4
from mutagen.easyid3 import EasyID3
from mutagen.mp3 import MP3
import mutagen
# --- Configuration flags read by the library scan and by the spider ---

# When False, an audio file that already has a sibling .lrc file is not
# queued again for LRC generation.
OVERWRITE_EXISTING_LYRICS = False
# When False, an audio file that already has a sibling .txt file is not
# queued again for TXT generation.
OVERWRITE_EXISTING_TXT = False
GENERATE_LRC = False # Creates a .lrc file using the same name of the original file
GENERATE_TXT = True # Creates a .txt file using the same name of the original file
# The NUKE_* flags remove matching files encountered while walking the
# current working directory tree (the scan uses os.walk('.')).
NUKE_TXT = False # Deletes all .txt files
NUKE_LRC = True # Deletes all .lrc files
def sanitize_values_for_url(value):
    """Turn an artist or track name into a lowercase, dash-separated slug.

    Steps, in order: drop parenthesis characters (keeping their content),
    drop bracketed ``[...]`` segments entirely, remove or substitute a few
    punctuation characters, strip combining accents, collapse whitespace,
    replace every remaining non-alphanumeric run with a separator, and
    finally squeeze repeated dashes into one.
    """
    text = re.sub(r'[()]', "", value)
    text = re.sub(r'\[.*?\]', "", text)

    # Applied sequentially, in this exact order.
    substitutions = (
        ("’", ""),
        ("'", ""),
        (",", ""),
        ("&", "and"),
        (".", ""),
        ("+", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # NFD splits accented letters into base letter + combining mark ('Mn');
    # dropping the marks leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    text = " ".join(text.split())
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    slug = text.strip(' ').strip('-').replace(' ', '-').lower()
    # Replace multiple dashes with a single dash
    return re.sub(r'[-]+', '-', slug)
def add_extension(value, extension = ".lrc"):
    """Return *value* with known audio suffixes removed and *extension* appended.

    The suffixes are tried one after another in a fixed order, so a name
    that does not end in any of them is returned unchanged apart from the
    appended extension.
    """
    stem = value
    for audio_suffix in (".flac", ".m4a", ".mp3", ".opus", ".ogg"):
        stem = stem.removesuffix(audio_suffix)
    return stem + extension
def add_txt_extension(path, artist, track):
    """Build the path of an ``<artist>-<track>.txt`` file next to *path*.

    The final component of *path* (everything after the last ``os.sep``)
    is swapped for ``artist + "-" + track + ".txt"``; the directory part,
    including its trailing separator, is kept as-is. A *path* without any
    separator yields just the new file name.
    """
    # rpartition cuts at the LAST separator only. Using str.replace on the
    # file name would also rewrite directory names that happen to contain
    # the same text.
    directory, separator, _filename = path.rpartition(os.sep)
    return directory + separator + artist + "-" + track + ".txt"
def _first_tag(tags, *keys):
    """Return the first value stored under the first of *keys* found in *tags*.

    Returns "" when none of the keys is present or the stored list is empty.
    """
    for key in keys:
        if key in tags:
            found = tags[key]
            if found:
                return found[0]
    return ""


def _read_tags(path):
    """Read ``(artistName, trackName, album, duration)`` from the file at *path*.

    The album-artist tag is preferred over the plain artist tag. Missing
    tags come back as "". Files whose extension is not handled, or that
    mutagen cannot identify, yield ``("", "", "", 0.0)``.
    """
    if path.endswith(".flac"):
        fileInfo = FLAC(path)
        artist_keys, title_key, album_key = ("albumartist", "artist"), "title", "album"
        duration = fileInfo.info.length
    elif path.endswith(".m4a"):
        fileInfo = MP4(path)
        artist_keys, title_key, album_key = ('aART', '\xa9ART'), '\xa9nam', '\xa9alb'
        duration = fileInfo.info.length
    elif path.endswith(".mp3"):
        fileInfo = EasyID3(path)
        artist_keys, title_key, album_key = ('albumartist', 'artist'), 'title', 'album'
        # EasyID3 only exposes tags; the stream length comes from MP3.
        duration = MP3(path).info.length
    elif path.endswith((".opus", ".ogg")):
        fileInfo = mutagen.File(path)
        if fileInfo is None:
            # mutagen could not work out the container type.
            return "", "", "", 0.0
        artist_keys, title_key, album_key = ('albumartist', 'artist'), 'title', 'album'
        duration = fileInfo.info.length
    else:
        return "", "", "", 0.0

    return (
        _first_tag(fileInfo, *artist_keys),
        _first_tag(fileInfo, title_key),
        _first_tag(fileInfo, album_key),
        duration,
    )


def _needs_lyrics(path):
    """Tell whether *path* still needs a .txt and/or .lrc file generated."""
    wants_txt = GENERATE_TXT and (
        OVERWRITE_EXISTING_TXT or not os.path.exists(add_extension(path, ".txt")))
    wants_lrc = GENERATE_LRC and (
        OVERWRITE_EXISTING_LYRICS or not os.path.exists(add_extension(path)))
    return wants_txt or wants_lrc


def _scan_library(root):
    """Walk *root* and collect the lyrics pages to fetch.

    Returns ``(urls, values)``: ``urls`` is the list of genius.com URLs and
    ``values`` maps each lower-cased URL to
    ``(duration, path, artistName, trackName, album)``.
    Files matching the NUKE_* flags are deleted along the way.
    """
    urls = []
    values = {}
    for subdir, _dirs, files in os.walk(root):
        for filename in files:
            path = os.path.join(subdir, filename)
            #Only pull files that are needed
            if _needs_lyrics(path):
                try:
                    artistName, trackName, album, duration = _read_tags(path)
                except mutagen.MutagenError as err:
                    # One unreadable file must not abort the whole scan.
                    print(f'Skipping unreadable file {path}: {err}')
                    artistName, trackName, album, duration = "", "", "", 0.0
                #Added some output to see what info is being grabbed from the meta data inside each music file.
                print(f'{path=}')
                print(f'{artistName=} {trackName=} {album=}')
                if artistName != "" and trackName != "":
                    parsedArtistName = sanitize_values_for_url(artistName)
                    parsedTrack = sanitize_values_for_url(trackName)
                    url = "https://genius.com/" + parsedArtistName + "-" + parsedTrack + "-lyrics"
                    urls.append(url)
                    values[url.lower()] = (duration, path, artistName, trackName, album)
            if (NUKE_TXT and path.endswith(".txt")) or (NUKE_LRC and path.endswith(".lrc")):
                os.remove(path)
    return urls, values


def _to_lrc(lyrics, duration):
    """Prefix every lyric line with an evenly spaced ``[MM:SS.00]`` stamp.

    *duration* is divided equally between the lines of *lyrics*; the
    stamps are therefore an approximation, not real synchronisation.
    """
    lyric_lines = lyrics.split("\n")
    time_per_line = duration / len(lyric_lines)
    stamped = []
    elapsed = 0
    for line in lyric_lines:
        minutes = int(elapsed // 60)
        seconds = int(elapsed % 60)
        stamped.append(f"[{minutes:02d}:{seconds:02d}.00]{line}")
        elapsed = elapsed + time_per_line
    return "\n".join(stamped)


class BlogSpider(scrapy.Spider):
    """Spider that downloads lyrics from genius.com for local audio files.

    The library scan runs while the class body executes (i.e. when this
    module is imported): it walks the current working directory, builds one
    genius.com URL per tagged track and stores the track details in
    ``values`` so that ``parse`` can find the file a response belongs to.
    """
    print("Scaning library...")
    name = 'genius spider'
    urls, values = _scan_library('.')
    start_urls = urls

    def parse(self, response):
        """Extract the lyrics from *response* and write the .lrc/.txt files.

        The track is looked up by the response URL; if the request was
        redirected, the first URL of the redirect chain is used instead.
        Responses that cannot be matched to a scanned track are logged and
        ignored.
        """
        key = response.url
        if key not in self.values:
            redirect_chain = response.request.meta.get('redirect_urls') or []
            key = redirect_chain[0] if redirect_chain else None
        if key not in self.values:
            self.logger.warning("No scanned track matches %s", response.url)
            return

        duration, file_path, artistName, trackName, album = self.values[key]
        lyrics = "\n".join(response.xpath(
            '//div[@data-lyrics-container="true"]/text() | //div[@data-lyrics-container="true"]/a/span/text() | //div[@data-lyrics-container="true"]/i/text()').getall())

        if GENERATE_LRC:
            with open(add_extension(file_path), "w", encoding='utf-8') as f:
                f.write(_to_lrc(lyrics, duration))
        if GENERATE_TXT:
            with open(add_extension(file_path, ".txt"), "w", encoding='utf-8') as f:
                f.write("Artist: " + artistName + "\nAlbum: " + album +
                        "\nTrack: " + trackName + "\n\n" + lyrics)