-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlastFMScrape.py
99 lines (77 loc) · 3.01 KB
/
lastFMScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python
from networkScrape import NetworkScraper
import string
class ArtistNetwork(NetworkScraper):
    """Network scraper whose nodes are Last.fm artist pages.

    A node id is the path component that follows ``/music/`` in an artist
    URL. The ``data`` argument taken by the ``get*`` methods is whatever
    ``getDataSource`` returned, i.e. the result of ``self.url_to_soup``
    (presumably a parsed BeautifulSoup document -- that helper is not defined
    in this class, so confirm against ``NetworkScraper``).
    """

    def __init__(self):
        super(ArtistNetwork, self).__init__()

    def getDataSource(self, nodeId):
        """Return the parsed artist page for ``nodeId``.

        The id is echoed to stdout before the page is requested.
        """
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print(nodeId)
        return self.url_to_soup("https://www.last.fm/music/" + nodeId)

    def getEdgeData(self, data):
        """Return one edge object per artist linked from the last item grid.

        Returns an empty list when the page has no ``ol.grid-items`` element
        at all, or when the last such grid contains album items.
        """
        gridLists = data.find_all("ol", class_="grid-items")
        if not gridLists:
            # No grid on the page (previously an IndexError on [-1]).
            return []
        relatedList = gridLists[-1]
        # A last grid holding album entries is not a list of artists, so it
        # contributes no edges.
        if relatedList.find(itemprop="album") is not None:
            return []
        links = relatedList.find_all("a", class_="link-block-cover-link")
        # Collect every id before building edges so that a malformed link
        # fails before any edge object has been created.
        ids = [link["href"].replace("/music/", "") for link in links]
        return [self.makeEdgeObject(artistId) for artistId in ids]

    def getNodeName(self, data):
        """Return the stripped text of the page's ``h1.header-title``."""
        return data.find("h1", class_="header-title").text.strip()

    def getNodeProperties(self, data):
        """Return a dict of properties scraped from the artist page.

        Keys: ``"top 100"``, ``"rank"``, ``"scrobbles"``, ``"listeners"``,
        ``"top tag"`` and ``"tag list"``.

        Raises whatever the lookups raise (e.g. ``AttributeError``) if the
        scrobble or listener header items are missing from the page.
        """
        propertiesObj = {}

        # rank: the popularity button is only looked up by class; when it is
        # absent the artist is recorded as unranked.
        rankButtons = data.find_all("button", class_="header-popularity-rank")
        if rankButtons:
            propertiesObj["top 100"] = "top 100"
            propertiesObj["rank"] = rankButtons[0].text.strip()
        else:
            propertiesObj["top 100"] = "not top 100"
            propertiesObj["rank"] = "Unranked"

        # scrobbles / listeners: both counts are read from the ``title``
        # attribute of an <abbr> inside the matching header item, with the
        # thousands separators removed.
        for key, cssClass in (
            ("scrobbles", "header-metadata-item--scrobbles"),
            ("listeners", "header-metadata-item--listeners"),
        ):
            item = data.find("li", class_=cssClass)
            propertiesObj[key] = int(item.find("abbr")["title"].replace(",", ""))

        # tags: the first tag on the page is treated as the top tag, with
        # hyphens shown as spaces. An artist without any tags gets a
        # placeholder (mirroring "Unranked") instead of an IndexError.
        tags = data.find_all("li", class_="tag", itemprop="genre")
        tagNames = [tag.text.strip() for tag in tags]
        if tagNames:
            propertiesObj["top tag"] = tagNames[0].replace("-", " ")
        else:
            propertiesObj["top tag"] = "Untagged"
        propertiesObj["tag list"] = tagNames
        return propertiesObj
class TagNetwork(NetworkScraper):
    """Network scraper whose nodes are Last.fm tag pages.

    A node id is the path component that follows ``/tag/`` in a tag URL.
    The ``data`` argument taken by the ``get*`` methods is whatever
    ``getDataSource`` returned, i.e. the result of ``self.url_to_soup``
    (presumably a parsed BeautifulSoup document -- that helper is not defined
    in this class, so confirm against ``NetworkScraper``).
    """

    def __init__(self):
        super(TagNetwork, self).__init__()

    def getDataSource(self, nodeId):
        """Return the parsed tag page for ``nodeId``.

        The id is echoed to stdout before the page is requested.
        """
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print(nodeId)
        return self.url_to_soup("https://www.last.fm/tag/" + nodeId)

    def getEdgeData(self, data):
        """Return one edge object per link inside ``section.tag-section``.

        Returns an empty list when the page has no such section.
        """
        tagSection = data.find("section", class_="tag-section")
        if tagSection is None:
            return []
        # Collect every id before building edges so that a malformed link
        # fails before any edge object has been created.
        tags = [
            anchor["href"].replace("/tag/", "")
            for anchor in tagSection.find_all("a")
        ]
        return [self.makeEdgeObject(tag) for tag in tags]

    def getNodeName(self, data):
        """Return the stripped text of the page's ``h1.header-title``."""
        return data.find("h1", class_="header-title").text.strip()

    def getNodeProperties(self, data):
        """Return ``{"top 5 listener sum": <int>}`` for the tag page.

        The value is the sum of the leading integer of every non-empty
        ``grid-items-item-aux-text`` element inside the first
        ``div.selectable-range`` whose text contains "More artists".
        Returns an empty dict when no such div exists.

        NOTE(review): the key says "top 5", but nothing here limits the sum
        to five entries -- it relies on the page listing that many.
        """
        artistSections = [
            div
            for div in data.find_all("div", class_="selectable-range")
            if "More artists" in div.text
        ]
        if not artistSections:
            return {}

        listenerNums = []
        for div in artistSections[0].find_all(class_="grid-items-item-aux-text"):
            text = div.text.strip()
            if not text:
                continue
            # The count is the first space-separated token, with thousands
            # separators removed.
            listenerNums.append(int(text.replace(",", "").split(" ")[0]))

        return {"top 5 listener sum": sum(listenerNums)}