-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgoogle.py
95 lines (75 loc) · 2.58 KB
/
google.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from urllib.request import urlopen
import json
import urllib
import lxml.html as lh
def GetUrl(url, post = None):
    """Fetch *url* and return the response parsed by ``lxml.html``.

    Args:
        url: Address to request.
        post: Optional mapping of form fields. When truthy it is URL-encoded,
            UTF-8 encoded and sent as the request body; otherwise the request
            carries no body.

    Returns:
        The document returned by ``lh.parse`` for the response stream.
    """
    print("GetUrl :: ", url)
    headers = {}  # no custom headers (such as a User-Agent) are sent

    encoded_post = None
    if post:
        encoded_post = urllib.parse.urlencode(post).encode('utf-8')

    request = urllib.request.Request(url, encoded_post, headers)
    opener = urllib.request.build_opener()
    # Parse while the connection is open, then release it deterministically
    # instead of leaving the socket for the garbage collector.
    with opener.open(request) as response:
        return lh.parse(response)
# Hrefs of every title get_movies has already emitted; it consults and appends
# to this list so the same title is not returned twice.
all_titles = []
def get_movies(category, showtype, existing_shows):
    """Scrape every page of a category's "movers_shakers" collection.

    Pages are requested 100 entries at a time until a page is empty or
    repeats the previous one (the site serves the last page again once the
    end has been passed).

    Args:
        category: Value substituted into the category segment of the URL.
        showtype: ``"movie"`` marks results with type ``"movie"``; any other
            value marks them with type ``"tv"``.
        existing_shows: Previously collected show dicts. A scraped title whose
            episode URI equals the first episode URI of one of these is not
            returned.

    Returns:
        A list of show dicts with the keys ``title``, ``image``, ``type`` and
        ``episodes`` (a one-element list describing the title itself).

    Side effects:
        Appends each newly seen href to the module-level ``all_titles`` list
        and prints progress information.

    Raises:
        IndexError: If a card lacks the expected title link, cover image or
            price element.
    """
    start = 0
    num = 100
    previous_pagehash = ""
    shows = []
    url = "https://play.google.com/store/movies/category/%s/collection/movers_shakers" % category
    # Build the lookup once instead of rescanning existing_shows for every card.
    existing_uris = {s["episodes"][0]["uri"] for s in existing_shows}

    while True:
        print("page", start)
        doc = GetUrl(url, {'start': start, 'num': num, 'numChildren': 0, 'ipf' : 1, 'xhr': 1 })
        cards = doc.xpath("//div[contains(@class, 'card-list')]//div[contains(@class, 'card-content')]")

        # the last page repeats, so fingerprint each page by its titles
        pagehash = "".join(
            card.xpath(".//h2//a")[0].text.strip() for card in cards
        )
        print(pagehash)
        # An empty page also means there is nothing further to fetch.
        if not cards or pagehash == previous_pagehash:
            break
        previous_pagehash = pagehash

        for card in cards:
            link = card.xpath(".//h2//a")[0]
            img = card.xpath(".//img[contains(@class, 'cover-image')]")[0]
            price_node = card.xpath(".//span[contains(@class, 'display-price')]")[0]

            # remove duplicates
            href = link.get("href")
            if href in all_titles:
                continue
            all_titles.append(href)

            title = link.text.strip()
            # Keep the price as a plain string without the dollar sign. The
            # original re-read ``.text`` from this string for non-movie types,
            # which raised AttributeError.
            price = price_node.text.strip().replace("$", "")
            uri = "https://play.google.com" + href
            is_movie = showtype == "movie"

            show = {
                "title": title,
                "image": img.get("src"),
                "type": "movie" if is_movie else "tv",
                # every show has an episode even movies
                "episodes": [{"show" : title, "uri" : uri, "date" : "", "s" : 0, "e" : 0, "price" : price}],
            }
            if is_movie:
                print(title, price)

            # check does not already exist
            if uri not in existing_uris:
                shows.append(show)

        start = start + num

    return shows
def get_listings():
    """Collect the movie listings of every configured category.

    Returns:
        A single list holding the show dicts returned by ``get_movies`` for
        each category id, in category order.
    """
    # Reset the module-level duplicate tracker in place. Assigning
    # ``all_titles = []`` here would only bind an unused local name and leave
    # titles from an earlier run marked as already seen.
    all_titles.clear()

    categories = [1,2,3,4,5,6,7,8,10,27,18,25,13]
    shows = []
    for category in categories:
        # Pass what has been gathered so far so get_movies can skip it.
        shows.extend(get_movies(category, "movie", shows))
    return shows
if __name__ == "__main__":
    # Script entry point: gather all listings and report how many were found.
    all_shows = get_listings()
    show_count = len(all_shows)
    print("Count ", show_count)