-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
105 lines (95 loc) · 3.31 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests
import urllib
from bs4 import BeautifulSoup
import http.cookiejar
import re
class SteamStoreScrapper():
    """Scrape a game's name, tags, score and release year from its Steam page.

    The public entry point is :meth:`scrap`; the other methods are the
    individual steps (download, age-gate handling, field extraction) and can
    be called on their own with an already parsed page.
    """

    STORE_HOMEPAGE = 'https://store.steampowered.com/'
    STORE_URL = STORE_HOMEPAGE + 'app/{0}'
    # Seconds to wait on any single HTTP request. Without a timeout a stalled
    # connection would block the caller forever.
    REQUEST_TIMEOUT = 30
    # Trailing text removed from the page <title> to obtain the game name
    # (nine characters, which is what the previous fixed-width slice removed).
    TITLE_SUFFIX = ' on Steam'
    # Placeholder returned when a textual field cannot be determined.
    UNKNOWN = 'Unknown'

    _SESSIONID_RE = re.compile(r'sessionid=(?P<sessionid>[0-9a-f]+)')
    _YEAR_RE = re.compile(r'\b\d{4}\b')

    def scrap(self, appid):
        """Download and parse the store page of ``appid``.

        Args:
            appid: Steam application id (anything ``str.format`` can render).

        Returns:
            A dict with the keys ``'name'``, ``'tags'``, ``'score'`` and
            ``'year'``, or ``None`` when :meth:`get_soup` could not obtain
            the application's page.
        """
        soup = self.get_soup(appid)
        if soup is None:
            return None
        return {
            'name': self.get_name(soup),
            'tags': self.get_tags(soup),
            'score': self.get_score(soup),
            'year': self.get_year(soup),
        }

    def get_past_age_check(self, url):
        """Submit the birth-date form shown in front of ``url``.

        Args:
            url: Address of the store page guarded by the age form.

        Returns:
            The open urllib response of the form POST. The caller is
            responsible for reading and closing it.

        Raises:
            ValueError: if the first response carries no ``sessionid`` value,
                which the form submission requires.
        """
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj)
        )
        # GET the original url, it should return the page with the age form.
        # Only the headers and the final (redirected) URL are needed, so the
        # response is closed as soon as both have been captured.
        with opener.open(url, timeout=self.REQUEST_TIMEOUT) as response:
            headers = str(response.headers)
            form_url = response.url

        match = self._SESSIONID_RE.search(headers)
        if match is None:
            raise ValueError(
                'No sessionid found in the response headers of {0}'.format(url)
            )

        # Creating POST data
        data = urllib.parse.urlencode({
            'ageDay': '1',
            'ageMonth': 'January',
            'ageYear': '1954',
            'sessionid': match.group('sessionid'),
            'snr': '1_agecheck_agecheck__age-gate',
        }).encode('utf-8')

        # POST on the page with the age form; the cookie jar attached to the
        # opener carries the session cookies over from the GET above.
        # TODO check form post success
        return opener.open(form_url, data, timeout=self.REQUEST_TIMEOUT)

    def get_soup(self, appid):
        """Fetch the store page of ``appid`` and parse it.

        Args:
            appid: Steam application id.

        Returns:
            A ``BeautifulSoup`` tree of the page, or ``None`` when the final
            URL no longer contains ``appid`` (the request was redirected
            away from the application's page).
        """
        url = self.STORE_URL.format(appid)
        cookies = dict(mature_content='1')
        response = requests.get(
            url, cookies=cookies, timeout=self.REQUEST_TIMEOUT
        )
        if str(appid) not in response.url:
            return None
        if 'agecheck' in response.url:
            # There is another type of age check that is more difficult to
            # pass; it is handled with urllib, so the body is read from a
            # urllib response here instead of a requests one.
            # TODO : find a solution to use either urllib or requests
            # but not both
            with self.get_past_age_check(url) as age_response:
                html = age_response.read()
            return BeautifulSoup(html, 'lxml')
        return BeautifulSoup(response.text, 'lxml')

    @staticmethod
    def _strip_title_suffix(title):
        """Return ``title`` stripped of whitespace and of TITLE_SUFFIX."""
        title = title.strip()
        suffix = SteamStoreScrapper.TITLE_SUFFIX
        # Only cut when the suffix is really there, so that a title in any
        # other format is returned whole instead of losing its last letters.
        if title.endswith(suffix):
            title = title[:-len(suffix)]
        return title

    @staticmethod
    def _extract_year(text):
        """Return the last 4-digit number in ``text``, or UNKNOWN if none."""
        years = SteamStoreScrapper._YEAR_RE.findall(text)
        return years[-1] if years else SteamStoreScrapper.UNKNOWN

    def get_name(self, soup):
        """Return the game name taken from the page ``<title>``.

        Returns ``'Unknown'`` when the page has no ``<title>`` element.
        """
        title = soup.find('title')
        if title is None:
            return self.UNKNOWN
        return self._strip_title_suffix(title.get_text())

    def get_score(self, soup):
        """Return the text of the first ``score`` element, or 'Not rated'."""
        score = soup.find(class_='score')
        if score is None:
            return 'Not rated'
        return score.get_text().strip()

    def get_year(self, soup):
        """Return the release year as a 4-digit string, or 'Unknown'.

        The year is the last 4-digit number found in the first
        ``release_date`` element, so texts without a year (for example
        "Coming soon") yield ``'Unknown'`` rather than their last four
        characters.
        """
        date = soup.find(class_='release_date')
        if date is None:
            return self.UNKNOWN
        return self._extract_year(date.get_text())

    def get_tags(self, soup):
        """Return the stripped text of every ``app_tag`` element."""
        tags = [
            tag.get_text().strip()
            for tag in soup.find_all(class_='app_tag')
        ]
        # Sometimes there's a '+' in the tags list (presumably the "add a
        # tag" button, TODO confirm); it is not a real tag, so drop it.
        return [tag for tag in tags if tag != '+']