-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBluestar_extract.py
183 lines (125 loc) · 6.34 KB
/
Bluestar_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
Created on Mon Aug 19 08:42:15 2019
Nov 2021: Updated to python3 and new bus stop format
@author: jvteleco
"""
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen
import re
# Upper bound on how many upcoming departures are reported per query.
MAX_NUMBER_TO_SHOW = 10

# The stop page URL is the fixed site prefix followed by the stop code.
url_head = "https://www.bluestarbus.co.uk/stops/"
url_stop_name = "1980SN121003"  # Bargate
url_joined = url_head + url_stop_name

# Module-level placeholders for the result of obtainData(); they are
# overwritten when the file is executed as a script.
status = False
bus_stop_bundle_main = ("", "")
bus_info_stop_list_main = []
def _fetch_parsed_html(url, timeout):
    """Download *url* and return the page parsed by BeautifulSoup."""
    # The context manager closes the connection even when read() raises,
    # which the previous manual close() did not guarantee.
    with urllib.request.urlopen(url, timeout=timeout) as connection:
        html = connection.read()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed (and warns), so the tree could differ between machines.
    return BeautifulSoup(html, "html.parser")


def _parse_stop_bundle(parsed_html):
    """Return ``(bus_stop_name, bus_stop_name_code)`` from the page banner."""
    # The stop name lives in <h1 class="place-info-banner__name">: its first
    # child is a text node, its second child is an element whose text is
    # appended after a comma.
    name_header = parsed_html.body.find(
        'h1', attrs={'class': 'place-info-banner__name'})
    name_first = name_header.contents[0].strip()
    name_second = name_header.contents[1].text.strip()
    bus_stop_name = name_first + "," + name_second

    # Since the 2021 site redesign the bus numbers serving the stop are the
    # <li> items of <ul class="place-info-banner__block-list">.
    numbers_list = parsed_html.body.find(
        'ul', attrs={'class': 'place-info-banner__block-list'})
    # Every number is followed by one space (including the last one), which
    # is the format callers have always received.
    bus_stop_name_code = "".join(
        entry.text.strip() + " " for entry in numbers_list.find_all('li'))
    return bus_stop_name, bus_stop_name_code


def _parse_departures(parsed_html):
    """Return up to MAX_NUMBER_TO_SHOW ``[number, description, time]`` lists."""
    single_stop_body = parsed_html.body.find(
        'div', attrs={'class': 'single-stop__body'})
    # Visits are sometimes <div class="single-visit"> and sometimes
    # <a class="single-visit">, so walk the container's children instead of
    # searching for one tag name. Text nodes are str subclasses in bs4 and
    # must be skipped explicitly: a whitespace node longer than one character
    # used to pass the len() check and then crashed on .find(..., attrs=...).
    visits = [
        item for item in single_stop_body.div.contents
        if not isinstance(item, str) and len(item) > 1
    ]
    if not visits:
        print("ERROR, no info.")
        print("try again in 1 minute or recheck bus stop name")
        return []

    departures = []
    # Only the first MAX_NUMBER_TO_SHOW visits are reported.
    for visit in visits[:MAX_NUMBER_TO_SHOW]:
        bus_number = visit.find(
            'p', attrs={'class': 'single-visit__name'}).text
        bus_description = visit.find(
            'p', attrs={'class': 'single-visit__description'}).text
        # Real-time entries look like
        #   <div class="single-visit__time single-visit__time--expected"> 59 mins</div>
        # and timetable entries like
        #   <div class="single-visit__time single-visit__time--aimed"> 10:06</div>
        # Both carry the "single-visit__time" class, so one lookup covers both.
        time_text = visit.find(
            'div', attrs={'class': 'single-visit__time'}).text.strip()
        departures.append([bus_number, bus_description, time_text])
    return departures


def obtainData(url, timeout=30):
    """Fetch a Bluestar bus stop page and extract the upcoming departures.

    Parameters:
        url: Full URL of the bus stop page.
        timeout: Seconds to wait on the network before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        A 3-tuple ``(status, bus_stop_bundle, bus_info_stop_list)``:

        * ``status`` is True when the page was downloaded and parsed, and
          False otherwise.
        * ``bus_stop_bundle`` is ``(stop name, bus numbers)``; the bus
          numbers are a single string with a space after each number.
        * ``bus_info_stop_list`` holds at most MAX_NUMBER_TO_SHOW entries of
          the form ``[bus number, description, time text]``. It is empty,
          with ``status`` still True, when the page lists no departures.

        On any failure the result is ``(False, ("", ""), [])``.

    This function does not raise: errors are printed and reported through
    ``status`` so that a polling caller can simply try again later.
    """
    print("Obtaining data from url", url, "...\r\n")
    try:
        parsed_html = _fetch_parsed_html(url, timeout)
    except Exception as e:
        # Deliberately broad: any network, URL or parser failure is reported
        # through the status flag rather than propagated.
        print("There was an error during the connection" )
        print(e)
        return False, ("",""), []

    try:
        print("Converting data...")
        bus_stop_bundle = _parse_stop_bundle(parsed_html)
        bus_info_stop_list = _parse_departures(parsed_html)
        return True, bus_stop_bundle, bus_info_stop_list
    except Exception as e:
        # A missing element surfaces here as AttributeError/IndexError/
        # TypeError, typically because the stop code in the URL is wrong and
        # the page layout is therefore not the expected one.
        print("There was an error finding the bus info, probably wrong bus url code")
        print(e)
        return False, ("",""), []
if __name__ == '__main__':
    # Script entry point: query the configured stop once and print a report.
    status, bus_stop_bundle_main, bus_info_stop_list_main = obtainData(url_joined)

    if not status:
        print("Status was False")
    else:
        print("----" )
        # The bundle is (stop name, bus numbers serving the stop).
        stop_name, stop_bus_numbers = bus_stop_bundle_main
        print("BUS STOP NAME:", stop_name, "\tBUS NUMBERS:", stop_bus_numbers)
        # Each departure is [bus number, description, time text].
        for bus_number, bus_direction, bus_time in bus_info_stop_list_main:
            print("BUS NUMBER:", bus_number, "\tBUS DIRECTION:", bus_direction, "\t\tTIME:", bus_time)