Modified due to updates in the web pages of the data source
mikeqfu committed Dec 4, 2019
1 parent aca6383 commit 0a31277
Showing 6 changed files with 35 additions and 27 deletions.
8 changes: 4 additions & 4 deletions pyrcs/line_data_cls/crs_nlc_tiploc_stanox.py
@@ -24,7 +24,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_json, load_pickle

from pyrcs.utils import cd_dat, save_json, save_pickle
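
Note: the only change in this hunk is the home module of confirmed, which apparently moved from pyhelpers.misc to pyhelpers.ops in a later pyhelpers release; the same one-line change recurs in electrification.py, lor_codes.py and rc_psql.py below. A minimal compatibility shim, assuming the function itself is unchanged:

    try:
        from pyhelpers.ops import confirmed   # newer pyhelpers
    except ImportError:
        from pyhelpers.misc import confirmed  # older pyhelpers
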
@@ -153,7 +153,7 @@ def collect_other_systems_codes(self, confirmation_required=True, verbose=False)
# Get column names for the other systems table
headers = list(more_itertools.unique_everseen([h.text for h in web_page_text.find_all('th')]))
# Parse table data for each system
-tbl_data = web_page_text.find_all('table', {'width': '1100px'})
+tbl_data = web_page_text.find_all('table')
tables = [pd.DataFrame(parse_tr(headers, table.find_all('tr')), columns=headers) for table in tbl_data]
codes = [tables[i] for i in range(len(tables)) if i % 2 != 0]
# Make a dict
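
Note: the stricter lookup presumably broke because the source pages no longer fix table widths at 1100px. A sketch of the relaxed pattern, keeping the odd-index selection from the code above (the URL is illustrative, not taken from this commit):

    import bs4
    import requests

    source = requests.get('http://www.railwaycodes.org.uk/crs/CRS0.shtm')
    soup = bs4.BeautifulSoup(source.text, 'lxml')
    tables = soup.find_all('table')  # any <table>, no width filter
    data_tables = tables[1::2]       # odd indices hold the data tables
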
@@ -275,8 +275,8 @@ def parse_stanox_note(x):
if any('see note' in crs_note for crs_note in location_codes.CRS_Note):
loc_idx = [i for i, crs_note in enumerate(location_codes.CRS_Note) if 'see note' in crs_note]
web_page_text = bs4.BeautifulSoup(source.text, 'lxml')
-note_urls = [urllib.parse.urljoin(self.Catalogue[initial.upper()], l['href'])
-             for l in web_page_text.find_all('a', href=True, text='note')]
+note_urls = [urllib.parse.urljoin(self.Catalogue[initial.upper()], x['href'])
+             for x in web_page_text.find_all('a', href=True, text='note')]
additional_notes = [self.parse_additional_note_page(note_url) for note_url in note_urls]
additional_note = dict(zip(location_codes.CRS.iloc[loc_idx], additional_notes))
else:
6 changes: 3 additions & 3 deletions pyrcs/line_data_cls/electrification.py
@@ -15,7 +15,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, parse_tr, save_pickle
@@ -78,7 +78,7 @@ def collect_codes_for_national_network(self, confirmation_required=True, verbose

note_tag = h3.find_next('h4')
if note_tag and note_tag.text == 'Notes':
-notes_ = dict((x.find('a').get('name').title(), x.text.replace('\xa0', ''))
+notes_ = dict((x.find('a').get('id').title(), x.text.replace('\xa0', ''))
for x in soup.find('ol') if x != '\n')
if notes['Notes'] is None:
notes['Notes'] = notes_
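
Note: the 'name' to 'id' switch suggests the source pages now use <a id="..."> rather than <a name="..."> anchors; the same substitution appears again below and in lor_codes.py. A defensive variant that tolerates either attribute (an assumption, not what this commit does):

    def anchor_key(li):
        # li: a bs4 <li> tag; read its note key from either anchor attribute.
        a = li.find('a')
        return (a.get('id') or a.get('name') or '').title() if a else ''
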
@@ -164,7 +164,7 @@ def collect_codes_for_independent_lines(self, confirmation_required=True, verbos
if h4:
previous_h3 = h4.find_previous('h3')
if previous_h3 == h3 and h4.text == 'Notes':
-notes_ = dict((x.find('a').get('name').title(), x.text.replace('\xa0', ''))
+notes_ = dict((x.find('a').get('id').title(), x.text.replace('\xa0', ''))
for x in h4.find_next('ol') if x != '\n')
if notes['Notes'] is None:
notes['Notes'] = notes_
8 changes: 4 additions & 4 deletions pyrcs/line_data_cls/lor_codes.py
@@ -19,7 +19,7 @@
import pandas as pd
import requests
from pyhelpers.dir import regulate_input_data_dir
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed
from pyhelpers.store import load_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, parse_tr
@@ -80,7 +80,7 @@ def get_lor_page_urls(self, update=False):
soup = bs4.BeautifulSoup(source.text, 'lxml')
links = soup.find_all('a', href=re.compile('^pride|elrmapping'),
text=re.compile('.*(codes|converter|Historical)'))
-urls = list(dict.fromkeys([self.URL.replace(os.path.basename(self.URL), l['href']) for l in links]))
+urls = list(dict.fromkeys([self.URL.replace(os.path.basename(self.URL), x['href']) for x in links]))
save_pickle(urls, path_to_pickle)
except Exception as e:
print("Failed to get the \"urls\" to LOR codes web pages. {}.".format(e))
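
Note: list(dict.fromkeys(...)) in the line above deduplicates while keeping first-seen order, since dicts preserve insertion order in Python 3.7+ (the file names here are illustrative):

    hrefs = ['pride01.shtm', 'elrmapping.shtm', 'pride01.shtm']
    list(dict.fromkeys(hrefs))  # -> ['pride01.shtm', 'elrmapping.shtm']
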
@@ -134,12 +134,12 @@ def parse_h3_table(tbl_soup):
line_name_info.columns = ['Line Name', 'Line Name Note']
code_dat = pd.concat([code_dat, line_name_info], axis=1, sort=False)
try:
-note_dat = dict([(x['name'].title(), x.text) for x in soup.find('ol').findChildren('a')])
+note_dat = dict([(x['id'].title(), x.text) for x in soup.find('ol').findChildren('a')])
except AttributeError:
note_dat = dict([('Note', None)])
return code_dat, note_dat

-h3, table_soup = soup.find_all('h3'), soup.find_all('table', {'width': '1100px'})
+h3, table_soup = soup.find_all('h3'), soup.find_all('table')
if len(h3) == 0:
code_data, code_data_notes = parse_h3_table(table_soup)
lor_codes_by_initials = {'Code': code_data, 'Note': code_data_notes}
14 changes: 11 additions & 3 deletions pyrcs/other_assets_cls/stations.py
@@ -106,12 +106,20 @@ def collect_station_locations(self, initial, update=False, verbose=False):
records, header = parse_table(source, parser='lxml')
# Create a DataFrame of the requested table
dat = [[x.replace('=', 'See').strip('\xa0') for x in i] for i in records]
-col = [h.replace('\r\n', ' ').replace('\r', ' ') for h in header]
+col = [re.sub(r'\n?\r+\n?', ' ', h) for h in header]
station_locations_table = pd.DataFrame(dat, columns=col)

+def parse_degrees(x):
+    if x == '':
+        y = pd.np.nan
+    else:
+        y = float(x.replace('c.', '') if x.startswith('c.') else x)
+    return y

station_locations_table[['Degrees Longitude', 'Degrees Latitude']] = \
-    station_locations_table[['Degrees Longitude', 'Degrees Latitude']].applymap(
-        lambda x: pd.np.nan if x == '' else float(x))
+    station_locations_table[['Degrees Longitude', 'Degrees Latitude']].applymap(parse_degrees)
+station_locations_table['Grid Reference'] = station_locations_table['Grid Reference'].map(
+    lambda x: x.replace('c.', '') if x.startswith('c.') else x)

station_locations_table[['Station', 'Station_Note']] = \
station_locations_table.Station.map(parse_location_note).apply(pd.Series)
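
Note: the new header clean-up regex r'\n?\r+\n?' collapses a run of '\r' together with any adjacent '\n' into a single space, so it covers '\r', '\r\n' and '\r\r\n' in one pass. Expected behaviour of the new parse_degrees helper, with illustrative values ('c.' marking an approximate reading is an inference from the code):

    parse_degrees('')          # -> nan      (missing coordinate)
    parse_degrees('c.-1.548')  # -> -1.548   ('c.' prefix stripped)
    parse_degrees('51.5074')   # -> 51.5074
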
4 changes: 2 additions & 2 deletions pyrcs/other_assets_cls/tunnels.py
@@ -141,7 +141,7 @@ def collect_railway_tunnel_lengths(self, page_no, update=False, verbose=False):
header[idx[0]:idx[0]] = ['Station_O', 'Station_D']

# Table data
-temp_tables = parsed_text.find_all('table', attrs={'width': '1100px'})
+temp_tables = parsed_text.find_all('table')
tbl_lst = parse_tr(header, trs=temp_tables[1].find_all('tr'))
tbl_lst = [[item.replace('\r', ' ').replace('\xa0', '') for item in record] for record in tbl_lst]

@@ -200,7 +200,7 @@ def collect_page4_others(self, update=False, verbose=False):
headers.append(header)
temp_header = temp_header.find_next('table')

-tbl_lst = parsed_text.find_all('table', attrs={'width': '1100px'})
+tbl_lst = parsed_text.find_all('table')
tbl_lst = operator.itemgetter(1, 3)(tbl_lst)
tbl_lst = [parse_tr(header, x.find_all('tr')) for header, x in zip(headers, tbl_lst)]
tbl_lst = [[[item.replace('\xa0', '') for item in record] for record in tbl] for tbl in tbl_lst]
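
Note: operator.itemgetter(1, 3) keeps the second and fourth tables, consistent with a page layout that alternates layout/header tables and data tables (an inference from the surrounding code):

    import operator
    tables = ['layout0', 'data1', 'layout2', 'data3']  # stand-ins for bs4 tables
    operator.itemgetter(1, 3)(tables)  # -> ('data1', 'data3')
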
22 changes: 11 additions & 11 deletions pyrcs/rc_psql.py
@@ -6,22 +6,22 @@
import sqlalchemy.engine.reflection
import sqlalchemy.engine.url
import sqlalchemy_utils
-from pyhelpers.misc import confirmed
+from pyhelpers.ops import confirmed


class RailwayCodesPSQL:
-def __init__(self):
+def __init__(self, username='postgres', password=None, host='localhost', port=5432, database_name='postgres'):
"""
We need to be connected to the database server in order to execute the "CREATE DATABASE" command. There is a
database called "postgres" created by the "initdb" command when the data storage area is initialised. If we
need to create the first of our own databases, we can set up a connection to "postgres" in the first instance.
"""
self.database_info = {'drivername': 'postgresql+psycopg2',
-                      'username': input('PostgreSQL username: '),
-                      'password': getpass.getpass('PostgreSQL password: '),
-                      'host': input('Host name: '),
-                      'port': 5432,  # default by installation
-                      'database': input('Database name: ')}
+                      'username': username,
+                      'password': password if password else getpass.getpass('PostgreSQL password: '),
+                      'host': host,  # default: localhost
+                      'port': port,  # 5432 (default by installation)
+                      'database': database_name}

# The typical form of a database URL is: url = backend+driver://username:password@host:port/database_name
self.url = sqlalchemy.engine.url.URL(**self.database_info)
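
Note: per the docstring, the constructor first connects to the stock "postgres" database so that CREATE DATABASE can be issued later. For illustration, the URL it builds looks like this (placeholder values; SQLAlchemy 1.3-era API, where URL is constructed directly):

    import sqlalchemy.engine.url

    url = sqlalchemy.engine.url.URL(
        drivername='postgresql+psycopg2', username='postgres', password='secret',
        host='localhost', port=5432, database='postgres')
    str(url)  # -> 'postgresql+psycopg2://postgres:secret@localhost:5432/postgres'
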
@@ -37,9 +37,9 @@ def __init__(self):
self.connection = self.engine.connect()

# Establish a connection to the specified database (named e.g. 'osm_extracts')
-def connect_db(self, database_name='Railway Codes and other data'):
+def connect_db(self, database_name='Railway_Codes'):
"""
-:param database_name: [str, 'Railway Codes and other data' (default)] name of a database
+:param database_name: [str] (default: 'Railway_Codes') name of a database
"""
self.database_name = database_name
self.database_info['database'] = self.database_name
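
A hypothetical usage under the new defaults (the password is prompted for interactively when not supplied):

    db = RailwayCodesPSQL(username='postgres', host='localhost')
    db.connect_db()  # now defaults to the 'Railway_Codes' database
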
@@ -79,7 +79,7 @@ def create_db(self, database_name='Railway Codes and other data'):
# Get size of a database
def get_db_size(self, database_name=None):
"""
-:param database_name: [str; None(default)] name of database
+:param database_name: [str; None (default)] name of database
:return:
"""
db_name = '\'{}\''.format(database_name) if database_name else 'current_database()'
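
Note: the quoting on the line above presumably feeds a size query along these lines (an assumption; only the name-quoting logic appears in the diff):

    db_name = "current_database()"  # or "'Railway_Codes'" for a named database
    sql = 'SELECT pg_size_pretty(pg_database_size({}));'.format(db_name)
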
@@ -115,7 +115,7 @@ def disconnect_all_others(self):
# Drop the specified database
def drop(self, database_name=None):
"""
-:param database_name: [str] name of database to disconnect from, or None (default) to disconnect the current one
+:param database_name: [str; None (default)] database to be disconnected; None: to disconnect the current one
"""
db_name = self.database_name if database_name is None else database_name
if confirmed("Confirmed to drop the database \"{}\"?".format(db_name)):
