Modified due to updates in the web pages of the data source
mikeqfu committed Dec 4, 2019
1 parent aca6383 commit 0a31277
Showing 6 changed files with 35 additions and 27 deletions.
8 changes: 4 additions & 4 deletions pyrcs/line_data_cls/crs_nlc_tiploc_stanox.py
@@ -24,7 +24,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_json, load_pickle

from pyrcs.utils import cd_dat, save_json, save_pickle
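
Note: the only change in this hunk is the home module of confirmed, which apparently moved from pyhelpers.misc to pyhelpers.ops in a later pyhelpers release; the same one-line change recurs in electrification.py, lor_codes.py and rc_psql.py below. A minimal compatibility shim, assuming the function itself is unchanged:

    try:
        from pyhelpers.ops import confirmed   # newer pyhelpers
    except ImportError:
        from pyhelpers.misc import confirmed  # older pyhelpers
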
@@ -153,7 +153,7 @@ def collect_other_systems_codes(self, confirmation_required=True, verbose=False)
# Get column names for the other systems table
headers = list(more_itertools.unique_everseen([h.text for h in web_page_text.find_all('th')]))
# Parse table data for each system
-tbl_data = web_page_text.find_all('table', {'width': '1100px'})
+tbl_data = web_page_text.find_all('table')
tables = [pd.DataFrame(parse_tr(headers, table.find_all('tr')), columns=headers) for table in tbl_data]
codes = [tables[i] for i in range(len(tables)) if i % 2 != 0]
# Make a dict
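
Note: the stricter lookup presumably broke because the source pages no longer fix table widths at 1100px. A sketch of the relaxed pattern, keeping the odd-index selection from the code above (the URL is illustrative, not taken from this commit):

    import bs4
    import requests

    source = requests.get('http://www.railwaycodes.org.uk/crs/CRS0.shtm')
    soup = bs4.BeautifulSoup(source.text, 'lxml')
    tables = soup.find_all('table')  # any <table>, no width filter
    data_tables = tables[1::2]       # odd indices hold the data tables
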
@@ -275,8 +275,8 @@ def parse_stanox_note(x):
if any('see note' in crs_note for crs_note in location_codes.CRS_Note):
loc_idx = [i for i, crs_note in enumerate(location_codes.CRS_Note) if 'see note' in crs_note]
web_page_text = bs4.BeautifulSoup(source.text, 'lxml')
-note_urls = [urllib.parse.urljoin(self.Catalogue[initial.upper()], l['href'])
-             for l in web_page_text.find_all('a', href=True, text='note')]
+note_urls = [urllib.parse.urljoin(self.Catalogue[initial.upper()], x['href'])
+             for x in web_page_text.find_all('a', href=True, text='note')]
additional_notes = [self.parse_additional_note_page(note_url) for note_url in note_urls]
additional_note = dict(zip(location_codes.CRS.iloc[loc_idx], additional_notes))
else:
6 changes: 3 additions & 3 deletions pyrcs/line_data_cls/electrification.py
@@ -15,7 +15,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, parse_tr, save_pickle
@@ -78,7 +78,7 @@ def collect_codes_for_national_network(self, confirmation_required=True, verbose

note_tag = h3.find_next('h4')
if note_tag and note_tag.text == 'Notes':
-notes_ = dict((x.find('a').get('name').title(), x.text.replace('\xa0', ''))
+notes_ = dict((x.find('a').get('id').title(), x.text.replace('\xa0', ''))
for x in soup.find('ol') if x != '\n')
if notes['Notes'] is None:
notes['Notes'] = notes_
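
Note: the 'name' to 'id' switch suggests the source pages now use <a id="..."> rather than <a name="..."> anchors; the same substitution appears again below and in lor_codes.py. A defensive variant that tolerates either attribute (an assumption, not what this commit does):

    def anchor_key(li):
        # li: a bs4 <li> tag; read its note key from either anchor attribute.
        a = li.find('a')
        return (a.get('id') or a.get('name') or '').title() if a else ''
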
@@ -164,7 +164,7 @@ def collect_codes_for_independent_lines(self, confirmation_required=True, verbos
if h4:
previous_h3 = h4.find_previous('h3')
if previous_h3 == h3 and h4.text == 'Notes':
-notes_ = dict((x.find('a').get('name').title(), x.text.replace('\xa0', ''))
+notes_ = dict((x.find('a').get('id').title(), x.text.replace('\xa0', ''))
for x in h4.find_next('ol') if x != '\n')
if notes['Notes'] is None:
notes['Notes'] = notes_
8 changes: 4 additions & 4 deletions pyrcs/line_data_cls/lor_codes.py
@@ -19,7 +19,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, parse_tr
@@ -80,7 +80,7 @@ def get_lor_page_urls(self, update=False):
soup = bs4.BeautifulSoup(source.text, 'lxml')
links = soup.find_all('a', href=re.compile('^pride|elrmapping'),
text=re.compile('.*(codes|converter|Historical)'))
-urls = list(dict.fromkeys([self.URL.replace(os.path.basename(self.URL), l['href']) for l in links]))
+urls = list(dict.fromkeys([self.URL.replace(os.path.basename(self.URL), x['href']) for x in links]))
save_pickle(urls, path_to_pickle)
except Exception as e:
print("Failed to get the \"urls\" to LOR codes web pages. {}.".format(e))
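
Note: list(dict.fromkeys(...)) in the line above deduplicates while keeping first-seen order, since dicts preserve insertion order in Python 3.7+ (the file names here are illustrative):

    hrefs = ['pride01.shtm', 'elrmapping.shtm', 'pride01.shtm']
    list(dict.fromkeys(hrefs))  # -> ['pride01.shtm', 'elrmapping.shtm']
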
@@ -134,12 +134,12 @@ def parse_h3_table(tbl_soup):
line_name_info.columns = ['Line Name', 'Line Name Note']
code_dat = pd.concat([code_dat, line_name_info], axis=1, sort=False)
try:
-note_dat = dict([(x['name'].title(), x.text) for x in soup.find('ol').findChildren('a')])
+note_dat = dict([(x['id'].title(), x.text) for x in soup.find('ol').findChildren('a')])
except AttributeError:
note_dat = dict([('Note', None)])
return code_dat, note_dat

-h3, table_soup = soup.find_all('h3'), soup.find_all('table', {'width': '1100px'})
+h3, table_soup = soup.find_all('h3'), soup.find_all('table')
if len(h3) == 0:
code_data, code_data_notes = parse_h3_table(table_soup)
lor_codes_by_initials = {'Code': code_data, 'Note': code_data_notes}
14 changes: 11 additions & 3 deletions pyrcs/other_assets_cls/stations.py
@@ -106,12 +106,20 @@ def collect_station_locations(self, initial, update=False, verbose=False):
records, header = parse_table(source, parser='lxml')
# Create a DataFrame of the requested table
dat = [[x.replace('=', 'See').strip('\xa0') for x in i] for i in records]
-col = [h.replace('\r\n', ' ').replace('\r', ' ') for h in header]
+col = [re.sub(r'\n?\r+\n?', ' ', h) for h in header]
station_locations_table = pd.DataFrame(dat, columns=col)

+def parse_degrees(x):
+    if x == '':
+        y = pd.np.nan
+    else:
+        y = float(x.replace('c.', '') if x.startswith('c.') else x)
+    return y

station_locations_table[['Degrees Longitude', 'Degrees Latitude']] = \
-    station_locations_table[['Degrees Longitude', 'Degrees Latitude']].applymap(
-        lambda x: pd.np.nan if x == '' else float(x))
+    station_locations_table[['Degrees Longitude', 'Degrees Latitude']].applymap(parse_degrees)
+station_locations_table['Grid Reference'] = station_locations_table['Grid Reference'].map(
+    lambda x: x.replace('c.', '') if x.startswith('c.') else x)

station_locations_table[['Station', 'Station_Note']] = \
station_locations_table.Station.map(parse_location_note).apply(pd.Series)
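
Note: the new header clean-up regex r'\n?\r+\n?' collapses a run of '\r' together with any adjacent '\n' into a single space, so it covers '\r', '\r\n' and '\r\r\n' in one pass. Expected behaviour of the new parse_degrees helper, with illustrative values ('c.' marking an approximate reading is an inference from the code):

    parse_degrees('')          # -> nan      (missing coordinate)
    parse_degrees('c.-1.548')  # -> -1.548   ('c.' prefix stripped)
    parse_degrees('51.5074')   # -> 51.5074
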
4 changes: 2 additions & 2 deletions pyrcs/other_assets_cls/tunnels.py
@@ -141,7 +141,7 @@ def collect_railway_tunnel_lengths(self, page_no, update=False, verbose=False):
header[idx[0]:idx[0]] = ['Station_O', 'Station_D']

# Table data
-temp_tables = parsed_text.find_all('table', attrs={'width': '1100px'})
+temp_tables = parsed_text.find_all('table')
tbl_lst = parse_tr(header, trs=temp_tables[1].find_all('tr'))
tbl_lst = [[item.replace('\r', ' ').replace('\xa0', '') for item in record] for record in tbl_lst]

@@ -200,7 +200,7 @@ def collect_page4_others(self, update=False, verbose=False):
headers.append(header)
temp_header = temp_header.find_next('table')

-tbl_lst = parsed_text.find_all('table', attrs={'width': '1100px'})
+tbl_lst = parsed_text.find_all('table')
tbl_lst = operator.itemgetter(1, 3)(tbl_lst)
tbl_lst = [parse_tr(header, x.find_all('tr')) for header, x in zip(headers, tbl_lst)]
tbl_lst = [[[item.replace('\xa0', '') for item in record] for record in tbl] for tbl in tbl_lst]
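
Note: operator.itemgetter(1, 3) keeps the second and fourth tables, consistent with a page layout that alternates layout/header tables and data tables (an inference from the surrounding code):

    import operator
    tables = ['layout0', 'data1', 'layout2', 'data3']  # stand-ins for bs4 tables
    operator.itemgetter(1, 3)(tables)  # -> ('data1', 'data3')
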
22 changes: 11 additions & 11 deletions pyrcs/rc_psql.py
@@ -6,22 +6,22 @@
import sqlalchemy.engine.reflection
import sqlalchemy.engine.url
import sqlalchemy_utils
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed


class RailwayCodesPSQL:
-def __init__(self):
+def __init__(self, username='postgres', password=None, host='localhost', port=5432, database_name='postgres'):
"""
We need to be connected to the database server in order to execute the "CREATE DATABASE" command. There is a
database called "postgres" created by the "initdb" command when the data storage area is initialised. If we
need to create the first of our own databases, we can set up a connection to "postgres" in the first instance.
"""
self.database_info = {'drivername': 'postgresql+psycopg2',
-                      'username': input('PostgreSQL username: '),
-                      'password': getpass.getpass('PostgreSQL password: '),
-                      'host': input('Host name: '),
-                      'port': 5432,  # default by installation
-                      'database': input('Database name: ')}
+                      'username': username,
+                      'password': password if password else getpass.getpass('PostgreSQL password: '),
+                      'host': host,  # default: localhost
+                      'port': port,  # 5432 (default by installation)
+                      'database': database_name}

# The typical form of a database URL is: url = backend+driver://username:password@host:port/database_name
self.url = sqlalchemy.engine.url.URL(**self.database_info)
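
Note: per the docstring, the constructor first connects to the stock "postgres" database so that CREATE DATABASE can be issued later. For illustration, the URL it builds looks like this (placeholder values; SQLAlchemy 1.3-era API, where URL is constructed directly):

    import sqlalchemy.engine.url

    url = sqlalchemy.engine.url.URL(
        drivername='postgresql+psycopg2', username='postgres', password='secret',
        host='localhost', port=5432, database='postgres')
    str(url)  # -> 'postgresql+psycopg2://postgres:secret@localhost:5432/postgres'
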
@@ -37,9 +37,9 @@ def __init__(self):
self.connection = self.engine.connect()

# Establish a connection to the specified database (named e.g. 'osm_extracts')
-def connect_db(self, database_name='Railway Codes and other data'):
+def connect_db(self, database_name='Railway_Codes'):
"""
-:param database_name: [str, 'Railway Codes and other data' (default)] name of a database
+:param database_name: [str] (default: 'Railway_Codes') name of a database
"""
self.database_name = database_name
self.database_info['database'] = self.database_name
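
A hypothetical usage under the new defaults (the password is prompted for interactively when not supplied):

    db = RailwayCodesPSQL(username='postgres', host='localhost')
    db.connect_db()  # now defaults to the 'Railway_Codes' database
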
@@ -79,7 +79,7 @@ def create_db(self, database_name='Railway Codes and other data'):
# Get size of a database
def get_db_size(self, database_name=None):
"""
-:param database_name: [str; None(default)] name of database
+:param database_name: [str; None (default)] name of database
:return:
"""
db_name = '\'{}\''.format(database_name) if database_name else 'current_database()'
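
Note: the quoting on the line above presumably feeds a size query along these lines (an assumption; only the name-quoting logic appears in the diff):

    db_name = "current_database()"  # or "'Railway_Codes'" for a named database
    sql = 'SELECT pg_size_pretty(pg_database_size({}));'.format(db_name)
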
@@ -115,7 +115,7 @@ def disconnect_all_others(self):
# Drop the specified database
def drop(self, database_name=None):
"""
-:param database_name: [str] name of database to disconnect from, or None (default) to disconnect the current one
+:param database_name: [str; None (default)] database to be disconnected; None: to disconnect the current one
"""
db_name = self.database_name if database_name is None else database_name
if confirmed("Confirmed to drop the database \"{}\"?".format(db_name)):
