Skip to content

Commit

Permalink
cleaning times for variations data
Browse files Browse the repository at this point in the history
Cleaning times for variations data, though for some reason some
variations still slip through. Added the different variations and a blacklist;
changed the weekday index according to feedback.
  • Loading branch information
Skippern committed Apr 2, 2017
1 parent b35578d commit 0af7afc
Show file tree
Hide file tree
Showing 2 changed files with 2,374 additions and 1,843 deletions.
215 changes: 211 additions & 4 deletions get_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,23 @@
# Base URL of the timetable PDFs; download_pdf() appends "<route>.pdf" to it.
baseurl = "http://www.expressolorenzutti.com.br/horarios/"

# When True, a departure that carries a variation marker is kept in the base
# route's list with the marker stripped (only the leading "HH:MM" remains).
ignoreVariants = True
# When True, every "<ref> <variation>" key is also recorded in the blacklist.
blacklistVariants = True

# Route numbers to fetch: zero-padded, three digits, "001" through "058".
routes = list("%03d" % number for number in range(1, 59))
# Presumably a shortcut for quick test runs against a couple of routes:
#routes = [ "001", "009" ]

# Accumulates everything that ends up in the generated timetable.
myRoutes = dict()

def uniq(values):
    """Return the items of *values* as a list with later duplicates dropped.

    The first occurrence of every item is kept and the original order is
    preserved.  Items must be hashable because a set tracks what was seen.
    """
    already_seen = set()
    unique_items = []
    for item in values:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

def download_pdf(i):
downloadURL = baseurl + i + ".pdf"
r = False
Expand All @@ -50,11 +62,39 @@ def download_pdf(i):
else:
return None

def lower_capitalized(input):
    """Turn a raw timetable label into a tidily capitalised name.

    The text is lower-cased, every word is capitalised, a fixed list of
    connecting words and acronyms is put back into its conventional casing,
    and finally a list of known place names is corrected.

    Unlike the previous version, the words are joined by exactly one space
    and the result carries no leading or trailing whitespace.  Before, the
    space inserted after every period produced doubled spaces that were
    never collapsed, so corrections such as "J. Boa Vista" could not match.

    :param input: the label as unicode text (the name shadows the builtin
        ``input``; it is kept so existing keyword callers keep working)
    :returns: the normalised unicode label
    """
    # NOTE(review): the abbreviation pattern below is kept exactly as found;
    # confirm that it matches the spelling used in the source documents.
    text = input.lower().replace(u"n. s .", u"nossa senhora da ").replace(u".", u". ")

    # split() without an argument discards empty tokens, so the extra space
    # introduced after each period cannot leak into the result.
    output = u" ".join(word.capitalize() for word in text.split())

    # Casing fixes, applied strictly in this order (later entries rely on the
    # output of earlier ones, e.g. "Br " -> "BR-" before "Trevo BR-101").
    casing_fixes = (
        (u" Da ", u" da "),
        (u" Das ", u" das "),
        (u" De ", u" de "),
        (u" Do ", u" do "),
        (u" Dos ", u" dos "),
        (u" E ", u" e "),
        (u"Sesc", u"SESC"),
        (u"sesc", u"SESC"),
        (u" X ", u" x "),
        (u"Ciac", u"CIAC"),
        (u"Via", u"via"),
        (u"Br ", u"BR-"),
        (u"Br-", u"BR-"),
        (u"Br1", u"BR-1"),
        (u"BR 101", u"BR-101"),
        (u"BR101", u"BR-101"),
        (u"Caic", u"CAIC"),
    )
    # Specific place names; order matters here as well ("Santa Monica" must
    # gain its accent before "Pontal Santa Mônica" can match).
    place_name_fixes = (
        (u"Trevo Setiba", u"Trevo de Setiba"),
        (u"Trevo BR-101", u"Trevo da BR-101"),
        (u"Santa Monica", u"Santa Mônica"),
        (u"Pontal Santa Mônica", u"Pontal de Santa Mônica"),
        (u"Vitoria", u"Vitória"),
        (u"Praça Vitória", u"Praça da Vitória"),
        (u"Ewerson de A. Sodré", u"Ewerson de Abreu Sodré"),
        (u"Meaipe", u"Meaípe"),
        (u"J. Boa Vista", u"Jardim Boa Vista"),
        (u"Jabarai", u"Jabaraí"),
        (u"olaria", u"Olaria"),
        (u"muquiçaba", u"Muquiçaba"),
        (u"Independencia", u"Independência"),
        (u"Patura", u"Paturá"),
    )
    for wrong, right in casing_fixes + place_name_fixes:
        output = output.replace(wrong, right)
    return output

def create_json(fromV, toV, weekdays, saturdays, sundays):
    """Build the timetable record for one direction of a route.

    The departure lists are sorted into new lists, so the caller's lists are
    left untouched (the previous version sorted them in place).

    :param fromV: name of the stop the trips start from
    :param toV: name of the stop the trips end at
    :param weekdays: departure times stored under the weekday keys
    :param saturdays: departure times stored under ``"Sa"``
    :param sundays: departure times stored under ``"Su"``
    :returns: dict with the keys ``from``, ``to``, ``WD``, ``Mo-Fr``, ``Sa``
        and ``Su``
    """
    weekday_times = sorted(weekdays)
    retValue = {}
    retValue[u"from"] = fromV
    retValue[u"to"] = toV
    # NOTE(review): "WD" and "Mo-Fr" hold the same weekday list; "WD" looks
    # like the older key name - confirm whether any consumer still reads it.
    retValue[u"WD"] = weekday_times
    retValue[u"Mo-Fr"] = weekday_times
    retValue[u"Sa"] = sorted(saturdays)
    retValue[u"Su"] = sorted(sundays)
    return retValue
Expand All @@ -63,6 +103,7 @@ def create_json(fromV, toV, weekdays, saturdays, sundays):
# Fixed metadata describing where the timetable data comes from.
myRoutes[u"operator"] = u"Expresso Lorenzutti"
myRoutes[u"network"] = u"PMG"
myRoutes[u"source"] = baseurl
# Starts empty; "<ref> <variation>" keys are appended while the routes are
# processed, but only when blacklistVariants is set.
myRoutes[u"blacklist"] = []
# Filled per route: each key maps to a two-item list of create_json() records.
myRoutes[u"routes"] = {}

for i in routes:
Expand Down Expand Up @@ -110,13 +151,13 @@ def create_json(fromV, toV, weekdays, saturdays, sundays):
ref = tmpList[0]
tmpList.pop(0)
tmpList.pop(0)
name = u" ".join(tmpList).strip()
name = lower_capitalized(u" ".join(tmpList).strip())
fieldNr += 1
elif fieldNr == 2:
origin = object.get_text().strip()
origin = lower_capitalized(object.get_text().strip())
fieldNr += 1
elif fieldNr == 4:
destination = object.get_text().strip()
destination = lower_capitalized(object.get_text().strip())
fieldNr += 1
else:
tmp = object.get_text()
Expand Down Expand Up @@ -155,6 +196,172 @@ def create_json(fromV, toV, weekdays, saturdays, sundays):
print ref, name
print " From", origin
print " To", destination
# Here we need some code to handle variations, for now we'll just strip the information after the time stamp
wd_ida = uniq(wd_ida)
wd_volta = uniq(wd_volta)
sa_ida = uniq(sa_ida)
sa_volta = uniq(sa_volta)
su_ida = uniq(su_ida)
su_volta = uniq(su_volta)
myVariations = []
myVariationList = {}
variationSet = set()
for t in wd_ida:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(wi) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
wd_ida.remove(t)
if ignoreVariants:
wd_ida.append(newT)
myVariations.append(variation)
tmp = u"{0} {1}".format(ref, variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["ida"]["Mo-Fr"].append(newT)
#else:
#print len(t), t
for t in wd_volta:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(wv) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
wd_volta.remove(t)
if ignoreVariants:
wd_volta.append(newT)
myVariations.append(variation)
tmp = u"{0} {1}".format(ref, variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["volta"]["Mo-Fr"].append(newT)
#else:
#print len(t), t
for t in sa_ida:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(si) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
sa_ida.remove(t)
if ignoreVariants:
sa_ida.append(newT)
tmp = u"{0} {1}".format(ref, variation)
myVariations.append(variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["ida"]["Sa"].append(newT)
#else:
#print len(t), t
for t in sa_volta:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(sv) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
sa_volta.remove(t)
if ignoreVariants:
sa_volta.append(newT)
tmp = u"{0} {1}".format(ref, variation)
myVariations.append(variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["volta"]["Sa"].append(newT)
#else:
#print len(t), t
for t in su_ida:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(di) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
su_ida.remove(t)
if ignoreVariants:
su_ida.append(newT)
myVariations.append(variation)
tmp = u"{0} {1}".format(ref, variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["ida"]["Su"].append(newT)
#else:
#print len(t), t
for t in su_volta:
if len(t) > 5:
newT = t[:5]
variation = t[5:].strip()
#print "(dv) Variation in \"{0}\"/\"{1}\"/\"{2}\"".format(t,newT,variation)
su_volta.remove(t)
if ignoreVariants:
su_volta.append(newT)
tmp = u"{0} {1}".format(ref, variation)
myVariations.append(variation)
if variation not in variationSet:
variationSet.add(variation)
myVariationList[tmp] = {}
myVariationList[tmp]["ida"] = {}
myVariationList[tmp]["volta"] = {}
myVariationList[tmp]["ida"]["Mo-Fr"] = []
myVariationList[tmp]["volta"]["Mo-Fr"] = []
myVariationList[tmp]["ida"]["Sa"] = []
myVariationList[tmp]["volta"]["Sa"] = []
myVariationList[tmp]["ida"]["Su"] = []
myVariationList[tmp]["volta"]["Su"] = []
myVariationList[tmp]["volta"]["Su"].append(newT)
#else:
#print len(t), t
if len(myVariations) > 0:
myVariations = uniq(myVariations)
print "Known variations: ",
for i in myVariations:
print "{0}, ".format(i),
tmp = u"{0} {1}".format(ref, i)
if blacklistVariants:
myRoutes["blacklist"].append(tmp)
myRoutes["routes"][tmp] = [ create_json(origin, destination, myVariationList[tmp]["ida"]["Mo-Fr"], myVariationList[tmp]["ida"]["Sa"], myVariationList[tmp]["ida"]["Su"]), create_json(destination, origin, myVariationList[tmp]["volta"]["Mo-Fr"], myVariationList[tmp]["volta"]["Sa"], myVariationList[tmp]["volta"]["Su"]) ]
print ""
# print myVariationList

myRoutes["routes"][ref] = [ create_json(origin, destination, wd_ida, sa_ida, su_ida),
create_json(destination, origin, wd_volta, sa_volta, su_volta) ]
with open('timetable.json', 'w') as outfile:
Expand Down
Loading

0 comments on commit 0af7afc

Please sign in to comment.