-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathberkeley_pizza_parser.py
143 lines (117 loc) · 4.34 KB
/
berkeley_pizza_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from BeautifulSoup import BeautifulSoup as bs
import BeautifulSoup
import urllib2
import dateutil.parser
import json
from datetime import date
from datetime import datetime
import sys
class Pizza:
    """Base class for a pizzeria menu scraper.

    Subclasses assign ``url`` and implement ``parse``; ``getMePizza`` runs
    the fetch-then-parse sequence.
    """

    # Address of the menu page; subclasses assign it in __init__.
    url = None
    # Response object from the last successful fetch, or None.
    page = None

    def fetch(self):
        """Download ``self.url`` and keep the response in ``self.page``.

        Fetching is best-effort: a failure is reported on stderr and
        ``self.page`` is left as it was instead of raising.
        """
        try:
            self.page = urllib2.urlopen(self.url)
        except Exception as err:
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate, and the failure is no longer silent.
            sys.stderr.write("Could not fetch %s: %s\n" % (self.url, err))

    def parse(self):
        """Turn ``self.page`` into a menu; subclasses must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # ``assert False`` would be stripped when running under ``python -O``.
        raise NotImplementedError(
            "%s must implement parse()" % self.__class__.__name__)

    def getMePizza(self):
        """Fetch the menu page and return whatever ``parse`` produces."""
        self.fetch()
        return self.parse()
class CheeseBoard(Pizza):
    """Scraper for the Cheese Board Collective pizza schedule."""

    def __init__(self):
        self.url = "http://cheeseboardcollective.coop/pizza/"

    def parse(self):
        """Extract the pizza description for every dated entry on the page.

        Returns:
            dict mapping ``datetime.date`` to the pizza description text.

        Raises:
            ValueError: if the page has no ``pizza-list`` container (the
                date parser may also raise it for unparseable date text).
        """
        prefix = "new pizza"
        soup = bs(self.page)
        listing = soup.find("div", {"class": "pizza-list"})
        if listing is None:
            # Iterating None would fail with an opaque TypeError instead.
            raise ValueError(
                "no pizza-list container found at %s" % self.url)

        ret = {}
        for entry in listing:
            # Children of the container include bare text nodes.
            if isinstance(entry, BeautifulSoup.NavigableString):
                continue
            date_div = entry.find("div", {"class": "date"})
            menu_div = entry.find("div", {"class": "menu"})
            if date_div is None or menu_div is None:
                # Not a dated menu entry; nothing to extract.
                continue
            d = dateutil.parser.parse(date_div.text).date()
            if not menu_div.text:
                continue
            # The description is the text between the first and second colon.
            pieces = menu_div.text.split(":")
            if len(pieces) < 2:
                continue
            p = pieces[1].strip()
            # A trailing "Salad" is presumably the heading of the section
            # that follows the pizza text -- TODO confirm against the page.
            if p.endswith("Salad"):
                p = p[:-len("Salad")]
            # Re-insert the space when "new pizza" is fused to the next word.
            # The length check avoids an IndexError when the text is exactly
            # "new pizza".
            if (p.lower().startswith(prefix)
                    and len(p) > len(prefix)
                    and p[len(prefix)] != " "):
                p = p[:len(prefix)] + " " + p[len(prefix):]
            ret[d] = p
        return ret
class Sliver(Pizza):
    """Scraper for one location of the Sliver Pizzeria weekly menu."""

    def __init__(self, location):
        """Remember which location's menu to extract.

        Args:
            location: text matched, case-insensitively, as a substring of
                the location titles found on the page.
        """
        self.url = "https://www.sliverpizzeria.com/menu-weekly"
        self.location = location.lower()

    def parse(self):
        """Extract the dated pizzas listed for ``self.location``.

        Returns:
            dict mapping ``datetime.date`` to the pizza description text.

        Raises:
            ValueError: if no container on the page matches the location.
        """
        soup = bs(self.page)
        containers = soup.findAll(
            "div",
            {"class": "summary-item-list-container sqs-gallery-container"})

        location = None
        for container in containers:
            titles = container.findAll(
                "span", {"class": "summary-collection-title"})
            if not titles:
                # An untitled container cannot be the one we want; skipping
                # it avoids an IndexError that would abort the whole search.
                continue
            if self.location in titles[0].text.lower():
                # No break: when several containers match, the last one wins.
                location = container
        if location is None:
            raise ValueError(
                "no menu found for location %r at %s"
                % (self.location, self.url))

        months = [x.text for x in location.findAll(
            "span", {"class": "summary-thumbnail-event-date-month"})]
        days = [x.text for x in location.findAll(
            "span", {"class": "summary-thumbnail-event-date-day"})]
        # The page supplies month and day only; the date parser fills in
        # the remaining fields itself.
        dates = [dateutil.parser.parse("%s %s" % pair).date()
                 for pair in zip(months, days)]
        pizzas = [x.text for x in location.findAll(
            "div", {"class": "summary-excerpt"})]
        # Dates and excerpts are paired by position.
        return dict(zip(dates, pizzas))
def mergePizza(l):
    """Combine per-source menus into a single mapping keyed by date.

    Args:
        l: sequence of ``[label, menu]`` pairs, where ``menu`` is a dict.

    Returns:
        dict mapping every key found in any menu to a ``{label: value}``
        dict; a source that lacks the key contributes ``None``.
    """
    sources = [(entry[0], entry[1]) for entry in l]

    all_keys = set()
    for _, menu in sources:
        all_keys.update(menu.keys())

    ret = {}
    for key in all_keys:
        per_source = {}
        for label, menu in sources:
            per_source[label] = menu[key] if key in menu else None
        ret[key] = per_source
    return ret
def tagPizza(pizza):
    """Wrap menu data in an envelope that carries provenance metadata.

    Args:
        pizza: the menu data; stored unchanged under ``"data"``.

    Returns:
        dict with ``"data"`` and ``"meta"``; ``"meta"`` holds the source
        URL, the author and the current UTC time as an ISO-format string.
    """
    meta = {}
    meta["source"] = "https://github.com/Paul-pearce/berkeley_pizza_parser"
    meta["author"] = "Paul Pearce <pearce@cs.berkeley.edu>"
    meta["timestamp"] = datetime.utcnow().isoformat()
    return {"data": pizza, "meta": meta}
def jsonPizza(pizza):
    """Serialize a tagged menu to pretty-printed JSON text.

    Args:
        pizza: dict whose ``"data"`` entry is a dict keyed by objects with
            an ``isoformat()`` method; everything else must already be
            JSON-serializable.

    Returns:
        JSON string with sorted keys and 4-space indentation, in which the
        ``"data"`` keys are replaced by their ``isoformat()`` text.
    """
    # Convert into a copy: rewriting the keys of the caller's dict while
    # looping over them is only safe when keys() returns a list, and it
    # also alters the caller's data as a side effect.
    serializable = dict(pizza)
    serializable["data"] = dict(
        (key.isoformat(), value) for key, value in pizza["data"].items())
    return json.dumps(
        serializable, sort_keys=True, indent=4, separators=(',', ': '))
def writePizza(pizza, filename):
    """Write the serialized menu to ``filename`` followed by a newline.

    Args:
        pizza: text to write (the JSON produced for the menu).
        filename: path of the file to create or overwrite.
    """
    # ``with`` closes the file even when a write raises.
    with open(filename, "w") as f:
        f.write(pizza)
        # The JSON serializer does not emit a trailing newline; add one.
        f.write("\n")
if __name__ == "__main__":
    # Exactly one argument is expected: the path of the JSON file to write.
    if len(sys.argv) != 2:
        # Usage errors go to stderr; sys.exit is used because the bare
        # ``exit`` helper is only installed by the site module.
        sys.stderr.write(
            "Usage: python berkeley_pizza_parser.py output.json\n")
        sys.exit(-1)

    # Scrape each source in turn; the labels become keys in the output.
    sources = [
        ["Cheeseboard", CheeseBoard().getMePizza()],
        ["Sliver Shattuck", Sliver("shattuck").getMePizza()],
        ["Sliver Telegraph", Sliver("telegraph").getMePizza()],
        ["Sliver Broadway", Sliver("broadway").getMePizza()],
    ]
    merged = mergePizza(sources)
    tagged = tagPizza(merged)
    # A distinct name keeps the jsonPizza function from being rebound.
    serialized = jsonPizza(tagged)
    writePizza(serialized, sys.argv[1])