Validation zhuzh-up: headway value checks (#146)

* refactor link length over 1km * tidy up graph connectivity validation * refactor length and zero value checks * add checks for negative, infinite and fractional link attribute values * fix to work with dictionary / nested attributes * module rename, tidy up * update logging to match the rest * fix imports in tests * leave first trip dep diffs as nans, let them be ignored in stats * address PR comments: Part 1: readability and conditions toolbox dataclass * address PR comments: Part 2: chop up existing tests for validation report * add condition for none values * expose trips to dataframe and headway stats methods to Route and Service objects * make vehicle definitions a stronger fail/pass condition, report as invalid stage on schedule level * shuffle vehicle checks * add headway stats to route level validation and check for zero values * add tests for headway in validation report * rename test file
arup-group · Oct 7, 2022 · c611e80 · c611e80
1 parent 6ecec80
commit c611e80
Show file tree

Hide file tree

Showing 5 changed files with 364 additions and 177 deletions.
diff --git a/genet/schedule_elements.py b/genet/schedule_elements.py
@@ -259,6 +259,70 @@ def find_epsg(self):
                     return epsg
         return None
 
+    @abstractmethod
+    def trips_to_dataframe(self, gtfs_day='19700101'):
+        pass  # abstract: concrete subclasses must supply the trips DataFrame used by trips_headways
+
+    def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
+        """
+        Generates a DataFrame holding all the trip IDs, their departure times (in datetime with given GTFS day,
+        if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
+        Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
+        each unique route.
+        This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
+        :param from_time: "HH:MM:SS" format, used as inclusive lower time bound for subsetting
+        :param to_time: "HH:MM:SS" format, used as inclusive upper time bound for subsetting
+        :param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
+        :return: trips DataFrame with `headway` (time difference) and `headway_mins` (minutes) columns added
+        """
+        df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
+            ['route_id', 'trip_departure_time']).reset_index(drop=True)
+
+        year = int(gtfs_day[:4])  # gtfs_day is sliced as YYYYMMDD
+        month = int(gtfs_day[4:6])
+        day = int(gtfs_day[6:8])
+
+        df = df.groupby('route_id').apply(get_headway)  # headway = gap to the previous departure on the same route
+        df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60)  # timedelta -> minutes
+
+        if from_time is not None:  # filtering happens after headways are computed, so values are not recalculated
+            hour, minute, second = list(map(int, from_time.split(':')))
+            df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
+        if to_time is not None:
+            hour, minute, second = list(map(int, to_time.split(':')))
+            df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]
+
+        return df
+
+    def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
+        """
+        Generates a DataFrame calculating mean, std, max and min headway in minutes for all routes, with trip counts.
+        This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
+        :param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
+        :param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
+        :param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
+        :return: DataFrame with one row per group; if no trips are found, the empty trips DataFrame is returned as is
+        """
+        df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)
+
+        groupby_cols = []
+        if 'service_id' in df.columns:  # group by service as well when the trips DataFrame carries that column
+            groupby_cols.append('service_id')
+        groupby_cols += ['route_id', 'mode']
+
+        # first trips don't have a headway, they are kept as NaT and NaN
+        if not df.empty:
+            route_groups = df.groupby(groupby_cols)
+            df = route_groups.describe()
+            df = df['headway_mins'][['mean', 'std', 'max', 'min']]  # keep only the headway_mins summary columns
+            df['trip_count'] = route_groups.apply(len)  # rows per group, so trips without a headway are counted too
+            df.reset_index(inplace=True)
+            df = df.rename(
+                columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
+                         'min': 'min_headway_mins'}
+            )
+        return df
+
     def to_geodataframe(self):
         """
         Generates GeoDataFrames of the Schedule graph in Schedule's crs
@@ -684,6 +748,7 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
         :return:
         """
         df = pd.DataFrame(self.trips)
+
         df['route_id'] = self.id
         df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
         df['mode'] = self.mode
@@ -1449,35 +1514,6 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
         df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
         return df
 
-    def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
-        """
-        Generates a DataFrame holding all the trips IDs, their departure times (in datetime with given GTFS day,
-        if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
-        Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
-        each unique route.
-        This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
-        :param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
-        :param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
-        :param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
-        :return:
-        """
-        df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
-            ['route_id', 'trip_departure_time']).reset_index(drop=True)
-
-        year = int(gtfs_day[:4])
-        month = int(gtfs_day[4:6])
-        day = int(gtfs_day[6:8])
-        if from_time is not None:
-            hour, minute, second = list(map(int, from_time.split(':')))
-            df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
-        if to_time is not None:
-            hour, minute, second = list(map(int, to_time.split(':')))
-            df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]
-
-        df = df.groupby('route_id').apply(get_headway)
-        df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60).fillna(0)
-        return df
-
     def generate_trips_dataframe_from_headway(self, route_id, headway_spec: dict):
         """
         Generates new trips and vehicles for the specified route.
@@ -1523,25 +1559,6 @@ def generate_trips_from_headway(self, route_id, headway_spec: dict):
         self.vehicles = {**{veh_id: veh_type for veh_id in new_trips['vehicle_id']}, **self.vehicles}
         list(map(self.vehicles.pop, old_vehicles - set(new_trips['vehicle_id'])))
 
-    def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
-        """
-        Generates a DataFrame calculating mean headway in minutes for all routes, with their service ID.
-        This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
-        :param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
-        :param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
-        :param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
-        :return:
-        """
-        df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)
-
-        df = df.groupby(['service_id', 'route_id', 'mode']).describe()
-        df = df['headway_mins'][['mean', 'std', 'max', 'min', 'count']].reset_index()
-        df = df.rename(
-            columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
-                     'min': 'min_headway_mins', 'count': 'trip_count'}
-        )
-        return df
-
     def unused_vehicles(self):
         """
         A scenario change to the network may result in changes to vehicle assignments, with some vehicles not
@@ -3073,7 +3090,7 @@ def read_vehicle_types(yml):
 
 
 def get_headway(group):
-    group['headway'] = group['trip_departure_time'].diff().fillna(pd.Timedelta(seconds=0))
+    group['headway'] = group['trip_departure_time'].diff()  # first row has no predecessor; it is no longer zero-filled
     return group
 
 

diff --git a/genet/validate/schedule.py b/genet/validate/schedule.py
@@ -9,35 +9,28 @@ def generate_validation_report(schedule):
         'route_level': {},
         'vehicle_level': {}
     }
+
+    logging.info('Computing headway stats')
+    df_headway = schedule.headway_stats().set_index('route_id')
+
     route_validity = {}
     for route in schedule.routes():
         is_valid_route, invalid_stages = route.is_valid_route(return_reason=True)
         route_validity[route.id] = {
             'is_valid_route': is_valid_route,
-            'invalid_stages': invalid_stages
+            'invalid_stages': invalid_stages,
+            'headway_stats': df_headway.loc[
+                route.id, ['mean_headway_mins', 'std_headway_mins', 'max_headway_mins', 'min_headway_mins']].to_dict()
         }
 
-    is_valid_vehicle_def = schedule.validate_vehicle_definitions()
-    missing_vehicle_types = schedule.get_missing_vehicle_information()
-
-    report['vehicle_level'] = {
-        'vehicle_definitions_valid': is_valid_vehicle_def,
-        'vehicle_definitions_validity_components': {
-            'missing_vehicles': {
-                'missing_vehicles_types': missing_vehicle_types['missing_vehicle_types'],
-                'vehicles_affected': missing_vehicle_types['vehicles_affected']},
-            'unused_vehicles': schedule.unused_vehicles(),
-            'multiple_use_vehicles': schedule.check_vehicle_uniqueness()}
-    }
-
     for service_id in schedule.service_ids():
         invalid_stages = []
         invalid_routes = []
         report['route_level'][service_id] = {}
         for route_id in schedule.service_to_route_map()[service_id]:
             if not route_validity[route_id]['is_valid_route']:
                 invalid_routes.append(route_id)
-                logging.warning(f'Route id={route_id} under Service id={service_id} is not valid')
+                logging.warning(f'Route ID: {route_id} under Service ID: {service_id} is not valid')
             report['route_level'][service_id][route_id] = route_validity[route_id]
 
         if invalid_routes:
@@ -55,16 +48,33 @@ def generate_validation_report(schedule):
             'invalid_routes': invalid_routes
         }
         if not is_valid_service:
-            logging.warning('Service id={} is not valid'.format(service_id))
+            logging.warning(f'Service with ID: {service_id} is not valid')
 
     invalid_stages = []
     invalid_services = [service_id for service_id in schedule.service_ids() if
                         not report['service_level'][service_id]['is_valid_service']]
 
-    if invalid_services:
+    logging.info('Checking validity of PT vehicles')
+    has_valid_vehicle_def = schedule.validate_vehicle_definitions()
+    missing_vehicle_types = schedule.get_missing_vehicle_information()
+
+    report['vehicle_level'] = {
+        'vehicle_definitions_valid': has_valid_vehicle_def,
+        'vehicle_definitions_validity_components': {
+            'missing_vehicles': {
+                'missing_vehicles_types': missing_vehicle_types['missing_vehicle_types'],
+                'vehicles_affected': missing_vehicle_types['vehicles_affected']},
+            'unused_vehicles': schedule.unused_vehicles(),
+            'multiple_use_vehicles': schedule.check_vehicle_uniqueness()}
+    }
+
+    if invalid_services or (not has_valid_vehicle_def):
         is_valid_schedule = False
         has_valid_services = False
-        invalid_stages.append('not_has_valid_services')
+        if invalid_services:
+            invalid_stages.append('not_has_valid_services')
+        if not has_valid_vehicle_def:
+            invalid_stages.append('not_has_valid_vehicle_definitions')
     else:
         is_valid_schedule = True
         has_valid_services = True
@@ -75,7 +85,25 @@ def generate_validation_report(schedule):
         'has_valid_services': has_valid_services,
         'invalid_services': invalid_services}
 
-    if (not is_valid_schedule) or (not is_valid_vehicle_def):
+    zero_headways = df_headway[df_headway['min_headway_mins'] == 0]
+    report['schedule_level']['headways'] = {}
+    if not zero_headways.empty:
+        report['schedule_level']['headways']['has_zero_min_headways'] = True
+        report['schedule_level']['headways']['routes'] = {
+            'number_of_affected': len(zero_headways),
+            'ids': list(zero_headways.index)
+        }
+        report['schedule_level']['headways']['services'] = {
+            'number_of_affected': len(zero_headways['service_id'].unique()),
+            'ids': list(zero_headways['service_id'].unique())
+        }
+
+        logging.warning(f"Found {len(zero_headways)} PT Routes 0 minimum headway between trips. "
+                        f"The following Services are affected: {report['schedule_level']['headways']['services']}")
+    else:
+        report['schedule_level']['headways']['has_zero_min_headways'] = False
+
+    if not is_valid_schedule:
         logging.warning('This schedule is not valid')
 
     return report
diff --git a/tests/test_core_network.py b/tests/test_core_network.py
@@ -2744,7 +2744,6 @@ def invalid_pt2matsim_network_for_validation(network_object_from_test_data):
         'pt_routes_with_invalid_network_route': ['VJbd8660f05fe6f744e58a66ae12bd66acbca88b98'],
     }
 
-
 def test_connectivity_in_report_with_invalid_network(invalid_pt2matsim_network_for_validation):
     report = invalid_pt2matsim_network_for_validation['network'].generate_validation_report()
     for mode, expected_connected_subgraphs in invalid_pt2matsim_network_for_validation['subgraph_no_per_mode'].items():