Validation zhuzh-up: headway value checks #146

Merged
merged 24 commits on Oct 7, 2022
Changes from 22 commits
Commits (24)
033d63b
refactor link length over 1km
KasiaKoz Aug 17, 2022
b32c1da
tidy up graph connectivity validation
KasiaKoz Aug 17, 2022
c08b259
refactor length and zero value checks
KasiaKoz Aug 17, 2022
9a81c78
add checks for negative, infinite and fractional link attribute values
KasiaKoz Aug 17, 2022
c8f87d7
fix to work with dictionary / nested attributes
KasiaKoz Aug 18, 2022
3bc0ae1
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
8e0978a
module rename, tidy up
KasiaKoz Aug 25, 2022
2b6f801
update logging to match the rest
KasiaKoz Aug 25, 2022
71846cf
fix imports in tests
KasiaKoz Aug 25, 2022
eacb3b8
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
0a6c4d8
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
7aa5e69
leave first trip dep diffs as nans, let them be ignored in stats
KasiaKoz Sep 8, 2022
78f3047
address PR comments: Part 1: readability and conditions toolbox datac…
KasiaKoz Sep 8, 2022
831a0f3
address PR comments: Part 2: chop up existing tests for validation re…
KasiaKoz Sep 8, 2022
4ec968e
add condition for none values
KasiaKoz Sep 8, 2022
6bf198c
expose trips to dataframe and headway stats methods to Route and Serv…
KasiaKoz Sep 8, 2022
a8c1f2d
make vehicle definitions a stronger fail/pass condition, report as in…
KasiaKoz Sep 9, 2022
9f733c9
shuffle vehicle checks
KasiaKoz Sep 9, 2022
642e487
add headway stats to route level validation and check for zero values
KasiaKoz Sep 9, 2022
7b23a55
Merge branch 'lab-1249-validation-zhuzh-up' into lab-1250-check-headw…
KasiaKoz Sep 9, 2022
dd059a4
add tests for headway in validation report
KasiaKoz Sep 9, 2022
64d2a0d
rename test file
KasiaKoz Sep 9, 2022
0388311
Merge branch 'main' into lab-1250-check-headway-values
KasiaKoz Sep 23, 2022
34e731e
Merge branch 'main' into lab-1250-check-headway-values
KasiaKoz Oct 7, 2022
2 changes: 1 addition & 1 deletion example_data/api_requests_send.json

Large diffs are not rendered by default.

116 changes: 77 additions & 39 deletions genet/core.py
@@ -25,7 +25,7 @@
import genet.utils.plot as plot
import genet.utils.simplification as simplification
import genet.utils.spatial as spatial
import genet.validate.network_validation as network_validation
import genet.validate.network as network_validation
import geopandas as gpd
import networkx as nx
import numpy as np
@@ -942,7 +942,7 @@ def subgraph_on_link_conditions(self, conditions, how=any, mixed_dtypes=True):

def modes(self):
"""
Scans network for 'modes' attribute and returns list of all modes present int he network
Scans network for 'modes' attribute and returns list of all modes present in the network
:return:
"""
modes = set()
@@ -1999,55 +1999,60 @@ def invalid_network_routes(self):
return [route.id for route in self.schedule.routes() if
not route.has_network_route() or not self.is_valid_network_route(route)]

def generate_validation_report(self, link_length_threshold=1000):
def generate_validation_report(self, modes_for_strong_connectivity=None, link_metre_length_threshold=1000):
"""
Generates a dictionary with keys: 'graph', 'schedule' and 'routing' describing validity of the Network's
underlying graph, the schedule services and then the intersection of the two which is the routing of schedule
services onto the graph.
:param link_length_threshold: in meters defaults to 1000, i.e. 1km
:param modes_for_strong_connectivity: list of modes in the network that need to be checked for strong
connectivity. Defaults to 'car', 'walk' and 'bike'
:param link_metre_length_threshold: in meters defaults to 1000, i.e. 1km
:return:
"""
logging.info('Checking validity of the Network')
logging.info('Checking validity of the Network graph')
report = {}
# describe network connectivity
modes = ['car', 'walk', 'bike']
report['graph'] = {'graph_connectivity': {}}
for mode in modes:
logging.info(f'Checking network connectivity for mode: {mode}')
# subgraph for the mode to be tested
G_mode = self.modal_subgraph(mode)
# calculate how many connected subgraphs there are
report['graph']['graph_connectivity'][mode] = network_validation.describe_graph_connectivity(G_mode)

def links_over_threshold_length(value):
return value >= link_length_threshold

links_over_1km_length = self.extract_links_on_edge_attributes(
conditions={'length': links_over_threshold_length})

# describe network connectivity
if modes_for_strong_connectivity is None:
modes_for_strong_connectivity = ['car', 'walk', 'bike']
logging.info(f'Defaulting to checking graph connectivity for modes: {modes_for_strong_connectivity}. '
'You can change this by passing a `modes_for_strong_connectivity` param')
graph_connectivity = {}
for mode in modes_for_strong_connectivity:
graph_connectivity[mode] = self.check_connectivity_for_mode(mode)
report['graph'] = {'graph_connectivity': graph_connectivity}

# attribute checks
conditions_toolbox = network_validation.ConditionsToolbox()
report['graph']['link_attributes'] = {
'links_over_1km_length': {
'number_of': len(links_over_1km_length),
'percentage': len(links_over_1km_length) / self.graph.number_of_edges(),
'link_ids': links_over_1km_length
}
}
f'{k}_attributes': {} for k in conditions_toolbox.condition_names()}

def zero_value(value):
return (value == 0) or (value == '0') or (value == '0.0')

report['graph']['link_attributes']['zero_attributes'] = {}
for attrib in [d.name for d in graph_operations.get_attribute_schema(self.links()).descendants]:
links_with_zero_attrib = self.extract_links_on_edge_attributes(
conditions={attrib: zero_value}, mixed_dtypes=False)
if links_with_zero_attrib:
logging.warning(f'{len(links_with_zero_attrib)} of links have values of 0 for `{attrib}`')
report['graph']['link_attributes']['zero_attributes'][attrib] = {
'number_of': len(links_with_zero_attrib),
'percentage': len(links_with_zero_attrib) / self.graph.number_of_edges(),
'link_ids': links_with_zero_attrib
}
# checks on length attribute specifically
def links_over_threshold_length(value):
return value >= link_metre_length_threshold

report['graph']['link_attributes']['links_over_1000_length'] = self.report_on_link_attribute_condition(
'length', links_over_threshold_length)

# more general attribute value checks
non_testable = ['id', 'from', 'to', 's2_to', 's2_from', 'geometry']
link_attributes = [graph_operations.parse_leaf(leaf) for leaf in
graph_operations.get_attribute_schema(self.links()).leaves]
link_attributes = [attrib for attrib in link_attributes if attrib not in non_testable]
for attrib in link_attributes:
logging.info(f'Checking link values for `{attrib}`')
for condition_name in conditions_toolbox.condition_names():
links_satifying_condition = self.report_on_link_attribute_condition(
attrib, conditions_toolbox.get_condition_evaluator(condition_name))
if links_satifying_condition['number_of']:
logging.warning(
f'{links_satifying_condition["number_of"]} of links have '
f'{condition_name} values for `{attrib}`')
if isinstance(attrib, dict):
attrib = dict_support.dict_to_string(attrib)
report['graph']['link_attributes'][f'{condition_name}_attributes'][
attrib] = links_satifying_condition

if self.schedule:
report['schedule'] = self.schedule.generate_validation_report()
@@ -2066,6 +2071,39 @@ def zero_value(value):
}
return report

def report_on_link_attribute_condition(self, attribute, condition):
"""
:param attribute: one of the link attributes, e.g. 'length'
:param condition: callable, condition for link[attribute] to satisfy
:return:
"""
if isinstance(attribute, dict):
conditions = dict_support.nest_at_leaf(deepcopy(attribute), condition)
else:
conditions = {attribute: condition}

links_satifying_condition = self.extract_links_on_edge_attributes(conditions=conditions)
return {
'number_of': len(links_satifying_condition),
'percentage': len(links_satifying_condition) / self.graph.number_of_edges(),
'link_ids': links_satifying_condition
}

def check_connectivity_for_mode(self, mode):
logging.info(f'Checking network connectivity for mode: {mode}')
G_mode = self.modal_subgraph(mode)
con_desc = network_validation.describe_graph_connectivity(G_mode)
no_of_components = con_desc["number_of_connected_subgraphs"]
logging.info(f'The graph for mode: {mode} has: '
f'{no_of_components} connected components, '
f'{len(con_desc["problem_nodes"]["dead_ends"])} sinks/dead_ends and '
f'{len(con_desc["problem_nodes"]["unreachable_node"])} sources/unreachable nodes.')
if no_of_components > 1:
logging.warning(f'The graph has more than one connected component for mode {mode}! '
'If this is not expected, consider using the `connect_components` method to connect the '
'components, or `retain_n_connected_subgraphs` with `n=1` to extract the largest component')
return con_desc

def generate_standard_outputs(self, output_dir, gtfs_day='19700101', include_shp_files=False):
"""
Generates geojsons that can be used for generating standard kepler visualisations.
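A minimal usage sketch of the reworked validation entry point in genet/core.py, based only on the signatures visible in this diff. It assumes an existing genet Network instance named `network`; how it is built is outside the scope of this PR, and the exact set of condition names comes from ConditionsToolbox in genet.validate.network, which is not shown here.

# Strong connectivity is now only checked for the modes passed in (defaulting
# to car, walk and bike), and the renamed threshold makes its unit explicit.
report = network.generate_validation_report(
    modes_for_strong_connectivity=['car', 'bus'],
    link_metre_length_threshold=500,
)

# Per-condition link attribute summaries are keyed by condition name, e.g.
# report['graph']['link_attributes']['zero_attributes'], alongside the
# per-mode connectivity descriptions.
print(report['graph']['graph_connectivity'].keys())

# The helper backing those summaries can also be called directly with any link
# attribute and a callable condition; it returns a count, a percentage and the
# matching link ids.
long_links = network.report_on_link_attribute_condition('length', lambda x: x >= 500)
print(long_links['number_of'], long_links['percentage'])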
2 changes: 1 addition & 1 deletion genet/output/matsim_xml_writer.py
@@ -4,7 +4,7 @@
from copy import deepcopy
from pandas import DataFrame
from genet.output import sanitiser
from genet.validate.network_validation import validate_attribute_data
from genet.validate.network import validate_attribute_data
from genet.utils.spatial import encode_shapely_linestring_to_polyline
from genet.exceptions import MalformedAdditionalAttributeError
import genet.variables as variables
117 changes: 67 additions & 50 deletions genet/schedule_elements.py
@@ -23,7 +23,7 @@
import genet.utils.persistence as persistence
import genet.utils.plot as plot
import genet.utils.spatial as spatial
import genet.validate.schedule_validation as schedule_validation
import genet.validate.schedule as schedule_validation
import networkx as nx
import numpy as np
import pandas as pd
@@ -259,6 +259,70 @@ def find_epsg(self):
return epsg
return None

@abstractmethod
def trips_to_dataframe(self, gtfs_day='19700101'):
pass

def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame holding all the trips IDs, their departure times (in datetime with given GTFS day,
if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
each unique route.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
['route_id', 'trip_departure_time']).reset_index(drop=True)

year = int(gtfs_day[:4])
month = int(gtfs_day[4:6])
day = int(gtfs_day[6:8])

df = df.groupby('route_id').apply(get_headway)
df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60)

if from_time is not None:
hour, minute, second = list(map(int, from_time.split(':')))
df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
if to_time is not None:
hour, minute, second = list(map(int, to_time.split(':')))
df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]

return df

def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame calculating mean headway in minutes for all routes, with their service ID.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)

groupby_cols = []
if 'service_id' in df.columns:
groupby_cols.append('service_id')
groupby_cols += ['route_id', 'mode']

# first trips don't have a headway, they are kept as NaT and NaN
if not df.empty:
route_groups = df.groupby(groupby_cols)
df = route_groups.describe()
df = df['headway_mins'][['mean', 'std', 'max', 'min']]
df['trip_count'] = route_groups.apply(len)
df.reset_index(inplace=True)
df = df.rename(
columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
'min': 'min_headway_mins'}
)
return df

def to_geodataframe(self):
"""
Generates GeoDataFrames of the Schedule graph in Schedule's crs
@@ -684,6 +748,7 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
:return:
"""
df = pd.DataFrame(self.trips)

df['route_id'] = self.id
df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
df['mode'] = self.mode
@@ -1449,35 +1514,6 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
return df

def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame holding all the trips IDs, their departure times (in datetime with given GTFS day,
if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
each unique route.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
['route_id', 'trip_departure_time']).reset_index(drop=True)

year = int(gtfs_day[:4])
month = int(gtfs_day[4:6])
day = int(gtfs_day[6:8])
if from_time is not None:
hour, minute, second = list(map(int, from_time.split(':')))
df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
if to_time is not None:
hour, minute, second = list(map(int, to_time.split(':')))
df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]

df = df.groupby('route_id').apply(get_headway)
df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60).fillna(0)
return df

def generate_trips_dataframe_from_headway(self, route_id, headway_spec: dict):
"""
Generates new trips and vehicles for the specified route.
@@ -1523,25 +1559,6 @@ def generate_trips_from_headway(self, route_id, headway_spec: dict):
self.vehicles = {**{veh_id: veh_type for veh_id in new_trips['vehicle_id']}, **self.vehicles}
list(map(self.vehicles.pop, old_vehicles - set(new_trips['vehicle_id'])))

def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame calculating mean headway in minutes for all routes, with their service ID.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)

df = df.groupby(['service_id', 'route_id', 'mode']).describe()
df = df['headway_mins'][['mean', 'std', 'max', 'min', 'count']].reset_index()
df = df.rename(
columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
'min': 'min_headway_mins', 'count': 'trip_count'}
)
return df

def unused_vehicles(self):
"""
A scenario change to the network may result in changes to vehicle assignments, with some vehicles not
@@ -3068,7 +3085,7 @@ def read_vehicle_types(yml):


def get_headway(group):
group['headway'] = group['trip_departure_time'].diff().fillna(pd.Timedelta(seconds=0))
group['headway'] = group['trip_departure_time'].diff()
return group


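A short usage sketch of the headway methods consolidated in genet/schedule_elements.py by this diff, assuming an existing genet Schedule object named `schedule`; per the commit messages the same methods are also exposed on Service and Route objects.

# Trip-level table with `headway` (Timedelta) and `headway_mins` columns; the
# first trip of each route is kept as NaT/NaN rather than a zero headway, so
# it is ignored by the summary statistics below.
headways = schedule.trips_headways(from_time='07:00:00', to_time='10:00:00')

# Per-route summary: mean/std/max/min headway in minutes plus a trip count,
# grouped by service_id (when present), route_id and mode.
stats = schedule.headway_stats(from_time='07:00:00', to_time='10:00:00')
print(stats[['route_id', 'mean_headway_mins', 'trip_count']].head())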
4 changes: 4 additions & 0 deletions genet/utils/dict_support.py
@@ -113,6 +113,10 @@ def combine_edge_data_lists(l1, l2):
return [(u, v, dat) for (u, v), dat in edges.items()]


def dict_to_string(d):
return str(d).replace('{', '').replace('}', '').replace("'", '').replace(' ', ':')


def dataframe_to_dict(df):
return {_id: {k: v for k, v in m.items() if notna(v)} for _id, m in df.to_dict().items()}

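Illustrative behaviour of the new `dict_to_string` helper, worked out from the implementation shown above; elsewhere in this PR it builds flat report keys for nested link attributes.

from genet.utils.dict_support import dict_to_string

dict_to_string({'attributes': 'osm:way:highway'})
# -> 'attributes::osm:way:highway'
dict_to_string({'attributes': {'osm:way:osmid': 'text'}})
# -> 'attributes::osm:way:osmid::text'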
19 changes: 17 additions & 2 deletions genet/utils/graph_operations.py
@@ -6,6 +6,7 @@
from anytree import Node, RenderTree

from genet.utils import pandas_helpers as pd_helpers
import genet.utils.dict_support as dict_support


class Filter:
@@ -209,6 +210,21 @@ def render_tree(root, data=False):
print("%s%s" % (pre, node.name))


def parse_leaf(leaf):
"""
:param leaf: anytree.node.node.Node
:return: str or dictionary with string key value pairs, for use as keys to extraction methods
"""
if leaf.depth > 1:
dict_path = {leaf.path[1].name: leaf.path[2].name}
if leaf.depth > 2:
for node in leaf.path[3:]:
dict_path = dict_support.nest_at_leaf(dict_path, node.name)
return dict_path
else:
return leaf.name


def get_attribute_data_under_key(iterator: Iterable, key: Union[str, dict]):
"""
Returns all data stored under key in attribute dictionaries for iterators yielding (index, attribute_dictionary),
@@ -256,8 +272,7 @@ def build_attribute_dataframe(iterator, keys: Union[list, str], index_name: str
for key in keys:
if isinstance(key, dict):
# consolidate nestedness to get a name for the column
name = str(key)
name = name.replace('{', '').replace('}', '').replace("'", '').replace(' ', ':')
name = dict_support.dict_to_string(key)
else:
name = key

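A toy illustration of the new `parse_leaf` helper using a hand-built anytree schema; in practice the tree comes from graph_operations.get_attribute_schema, and leaves nested deeper than two levels are handled via dict_support.nest_at_leaf (not shown in this diff).

from anytree import Node
from genet.utils.graph_operations import parse_leaf

root = Node('attribute')
length = Node('length', parent=root)
attributes = Node('attributes', parent=root)
highway = Node('osm:way:highway', parent=attributes)

parse_leaf(length)   # depth 1 -> returned as the plain string 'length'
parse_leaf(highway)  # depth 2 -> {'attributes': 'osm:way:highway'}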