Validation zhuzh-up: headway value checks #146

Merged
merged 24 commits on Oct 7, 2022
Changes from 22 commits
Commits (24)
033d63b
refactor link length over 1km
KasiaKoz Aug 17, 2022
b32c1da
tidy up graph connectivity validation
KasiaKoz Aug 17, 2022
c08b259
refactor length and zero value checks
KasiaKoz Aug 17, 2022
9a81c78
add checks for negative, infinite and fractional link attribute values
KasiaKoz Aug 17, 2022
c8f87d7
fix to work with dictionary / nested attributes
KasiaKoz Aug 18, 2022
3bc0ae1
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
8e0978a
module rename, tidy up
KasiaKoz Aug 25, 2022
2b6f801
update logging to match the rest
KasiaKoz Aug 25, 2022
71846cf
fix imports in tests
KasiaKoz Aug 25, 2022
eacb3b8
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
0a6c4d8
Merge branch 'main' into lab-1249-validation-zhuzh-up
KasiaKoz Aug 25, 2022
7aa5e69
leave first trip dep diffs as nans, let them be ignored in stats
KasiaKoz Sep 8, 2022
78f3047
address PR comments: Part 1: readability and conditions toolbox datac…
KasiaKoz Sep 8, 2022
831a0f3
address PR comments: Part 2: chop up existing tests for validation re…
KasiaKoz Sep 8, 2022
4ec968e
add condition for none values
KasiaKoz Sep 8, 2022
6bf198c
expose trips to dataframe and headway stats methods to Route and Serv…
KasiaKoz Sep 8, 2022
a8c1f2d
make vehicle definitions a stronger fail/pass condition, report as in…
KasiaKoz Sep 9, 2022
9f733c9
shuffle vehicle checks
KasiaKoz Sep 9, 2022
642e487
add headway stats to route level validation and check for zero values
KasiaKoz Sep 9, 2022
7b23a55
Merge branch 'lab-1249-validation-zhuzh-up' into lab-1250-check-headw…
KasiaKoz Sep 9, 2022
dd059a4
add tests for headway in validation report
KasiaKoz Sep 9, 2022
64d2a0d
rename test file
KasiaKoz Sep 9, 2022
0388311
Merge branch 'main' into lab-1250-check-headway-values
KasiaKoz Sep 23, 2022
34e731e
Merge branch 'main' into lab-1250-check-headway-values
KasiaKoz Oct 7, 2022
2 changes: 1 addition & 1 deletion example_data/api_requests_send.json

Large diffs are not rendered by default.

116 changes: 77 additions & 39 deletions genet/core.py
@@ -25,7 +25,7 @@
import genet.utils.plot as plot
import genet.utils.simplification as simplification
import genet.utils.spatial as spatial
import genet.validate.network_validation as network_validation
import genet.validate.network as network_validation
import geopandas as gpd
import networkx as nx
import numpy as np
@@ -942,7 +942,7 @@ def subgraph_on_link_conditions(self, conditions, how=any, mixed_dtypes=True):

def modes(self):
"""
Scans network for 'modes' attribute and returns list of all modes present int he network
Scans network for 'modes' attribute and returns list of all modes present in the network
:return:
"""
modes = set()
@@ -1999,55 +1999,60 @@ def invalid_network_routes(self):
return [route.id for route in self.schedule.routes() if
not route.has_network_route() or not self.is_valid_network_route(route)]

def generate_validation_report(self, link_length_threshold=1000):
def generate_validation_report(self, modes_for_strong_connectivity=None, link_metre_length_threshold=1000):
"""
Generates a dictionary with keys: 'graph', 'schedule' and 'routing' describing validity of the Network's
underlying graph, the schedule services and then the intersection of the two which is the routing of schedule
services onto the graph.
:param link_length_threshold: in meters defaults to 1000, i.e. 1km
:param modes_for_strong_connectivity: list of modes in the network that need to be checked for strong
connectivity. Defaults to 'car', 'walk' and 'bike'
:param link_metre_length_threshold: in meters defaults to 1000, i.e. 1km
:return:
"""
logging.info('Checking validity of the Network')
logging.info('Checking validity of the Network graph')
report = {}
# describe network connectivity
modes = ['car', 'walk', 'bike']
report['graph'] = {'graph_connectivity': {}}
for mode in modes:
logging.info(f'Checking network connectivity for mode: {mode}')
# subgraph for the mode to be tested
G_mode = self.modal_subgraph(mode)
# calculate how many connected subgraphs there are
report['graph']['graph_connectivity'][mode] = network_validation.describe_graph_connectivity(G_mode)

def links_over_threshold_length(value):
return value >= link_length_threshold

links_over_1km_length = self.extract_links_on_edge_attributes(
conditions={'length': links_over_threshold_length})

# describe network connectivity
if modes_for_strong_connectivity is None:
modes_for_strong_connectivity = ['car', 'walk', 'bike']
logging.info(f'Defaulting to checking graph connectivity for modes: {modes_for_strong_connectivity}. '
'You can change this by passing a `modes_for_strong_connectivity` param')
graph_connectivity = {}
for mode in modes_for_strong_connectivity:
graph_connectivity[mode] = self.check_connectivity_for_mode(mode)
report['graph'] = {'graph_connectivity': graph_connectivity}

# attribute checks
conditions_toolbox = network_validation.ConditionsToolbox()
report['graph']['link_attributes'] = {
'links_over_1km_length': {
'number_of': len(links_over_1km_length),
'percentage': len(links_over_1km_length) / self.graph.number_of_edges(),
'link_ids': links_over_1km_length
}
}
f'{k}_attributes': {} for k in conditions_toolbox.condition_names()}

def zero_value(value):
return (value == 0) or (value == '0') or (value == '0.0')

report['graph']['link_attributes']['zero_attributes'] = {}
for attrib in [d.name for d in graph_operations.get_attribute_schema(self.links()).descendants]:
links_with_zero_attrib = self.extract_links_on_edge_attributes(
conditions={attrib: zero_value}, mixed_dtypes=False)
if links_with_zero_attrib:
logging.warning(f'{len(links_with_zero_attrib)} of links have values of 0 for `{attrib}`')
report['graph']['link_attributes']['zero_attributes'][attrib] = {
'number_of': len(links_with_zero_attrib),
'percentage': len(links_with_zero_attrib) / self.graph.number_of_edges(),
'link_ids': links_with_zero_attrib
}
# checks on length attribute specifically
def links_over_threshold_length(value):
return value >= link_metre_length_threshold

report['graph']['link_attributes']['links_over_1000_length'] = self.report_on_link_attribute_condition(
'length', links_over_threshold_length)

# more general attribute value checks
non_testable = ['id', 'from', 'to', 's2_to', 's2_from', 'geometry']
link_attributes = [graph_operations.parse_leaf(leaf) for leaf in
graph_operations.get_attribute_schema(self.links()).leaves]
link_attributes = [attrib for attrib in link_attributes if attrib not in non_testable]
for attrib in link_attributes:
logging.info(f'Checking link values for `{attrib}`')
for condition_name in conditions_toolbox.condition_names():
links_satifying_condition = self.report_on_link_attribute_condition(
attrib, conditions_toolbox.get_condition_evaluator(condition_name))
if links_satifying_condition['number_of']:
logging.warning(
f'{links_satifying_condition["number_of"]} of links have '
f'{condition_name} values for `{attrib}`')
if isinstance(attrib, dict):
attrib = dict_support.dict_to_string(attrib)
report['graph']['link_attributes'][f'{condition_name}_attributes'][
attrib] = links_satifying_condition

if self.schedule:
report['schedule'] = self.schedule.generate_validation_report()
@@ -2066,6 +2071,39 @@ def zero_value(value):
}
return report

def report_on_link_attribute_condition(self, attribute, condition):
"""
:param attribute: one of the link attributes, e.g. 'length'
:param condition: callable, condition for link[attribute] to satisfy
:return:
"""
if isinstance(attribute, dict):
conditions = dict_support.nest_at_leaf(deepcopy(attribute), condition)
else:
conditions = {attribute: condition}

links_satifying_condition = self.extract_links_on_edge_attributes(conditions=conditions)
return {
'number_of': len(links_satifying_condition),
'percentage': len(links_satifying_condition) / self.graph.number_of_edges(),
'link_ids': links_satifying_condition
}

def check_connectivity_for_mode(self, mode):
logging.info(f'Checking network connectivity for mode: {mode}')
G_mode = self.modal_subgraph(mode)
con_desc = network_validation.describe_graph_connectivity(G_mode)
no_of_components = con_desc["number_of_connected_subgraphs"]
logging.info(f'The graph for mode: {mode} has: '
f'{no_of_components} connected components, '
f'{len(con_desc["problem_nodes"]["dead_ends"])} sinks/dead_ends and '
f'{len(con_desc["problem_nodes"]["unreachable_node"])} sources/unreachable nodes.')
if no_of_components > 1:
logging.warning(f'The graph has more than one connected component for mode {mode}! '
'If this is not expected, consider using the `connect_components` method to connect the '
'components, or `retain_n_connected_subgraphs` with `n=1` to extract the largest component')
return con_desc

def generate_standard_outputs(self, output_dir, gtfs_day='19700101', include_shp_files=False):
"""
Generates geojsons that can be used for generating standard kepler visualisations.
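A minimal usage sketch of the reworked validation entry point in genet/core.py, based only on the signatures visible in this diff. It assumes an existing genet Network instance named `network`; how it is built is outside the scope of this PR, and the exact set of condition names comes from ConditionsToolbox in genet.validate.network, which is not shown here.

# Strong connectivity is now only checked for the modes passed in (defaulting
# to car, walk and bike), and the renamed threshold makes its unit explicit.
report = network.generate_validation_report(
    modes_for_strong_connectivity=['car', 'bus'],
    link_metre_length_threshold=500,
)

# Per-condition link attribute summaries are keyed by condition name, e.g.
# report['graph']['link_attributes']['zero_attributes'], alongside the
# per-mode connectivity descriptions.
print(report['graph']['graph_connectivity'].keys())

# The helper backing those summaries can also be called directly with any link
# attribute and a callable condition; it returns a count, a percentage and the
# matching link ids.
long_links = network.report_on_link_attribute_condition('length', lambda x: x >= 500)
print(long_links['number_of'], long_links['percentage'])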
2 changes: 1 addition & 1 deletion genet/output/matsim_xml_writer.py
@@ -4,7 +4,7 @@
from copy import deepcopy
from pandas import DataFrame
from genet.output import sanitiser
from genet.validate.network_validation import validate_attribute_data
from genet.validate.network import validate_attribute_data
from genet.utils.spatial import encode_shapely_linestring_to_polyline
from genet.exceptions import MalformedAdditionalAttributeError
import genet.variables as variables
117 changes: 67 additions & 50 deletions genet/schedule_elements.py
@@ -23,7 +23,7 @@
import genet.utils.persistence as persistence
import genet.utils.plot as plot
import genet.utils.spatial as spatial
import genet.validate.schedule_validation as schedule_validation
import genet.validate.schedule as schedule_validation
import networkx as nx
import numpy as np
import pandas as pd
@@ -259,6 +259,70 @@ def find_epsg(self):
return epsg
return None

@abstractmethod
def trips_to_dataframe(self, gtfs_day='19700101'):
pass

def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame holding all the trips IDs, their departure times (in datetime with given GTFS day,
if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
each unique route.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
['route_id', 'trip_departure_time']).reset_index(drop=True)

year = int(gtfs_day[:4])
month = int(gtfs_day[4:6])
day = int(gtfs_day[6:8])

df = df.groupby('route_id').apply(get_headway)
df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60)

if from_time is not None:
hour, minute, second = list(map(int, from_time.split(':')))
df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
if to_time is not None:
hour, minute, second = list(map(int, to_time.split(':')))
df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]

return df

def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame calculating mean headway in minutes for all routes, with their service ID.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)

groupby_cols = []
if 'service_id' in df.columns:
groupby_cols.append('service_id')
groupby_cols += ['route_id', 'mode']

# first trips don't have a headway, they are kept as NaT and NaN
if not df.empty:
route_groups = df.groupby(groupby_cols)
df = route_groups.describe()
df = df['headway_mins'][['mean', 'std', 'max', 'min']]
df['trip_count'] = route_groups.apply(len)
df.reset_index(inplace=True)
df = df.rename(
columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
'min': 'min_headway_mins'}
)
return df

def to_geodataframe(self):
"""
Generates GeoDataFrames of the Schedule graph in Schedule's crs
@@ -684,6 +748,7 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
:return:
"""
df = pd.DataFrame(self.trips)

df['route_id'] = self.id
df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
df['mode'] = self.mode
@@ -1449,35 +1514,6 @@ def trips_to_dataframe(self, gtfs_day='19700101'):
df['trip_departure_time'] = df['trip_departure_time'].apply(lambda x: use_schedule.sanitise_time(x, gtfs_day))
return df

def trips_headways(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame holding all the trips IDs, their departure times (in datetime with given GTFS day,
if specified in `gtfs_day`) and vehicle IDs, next to the route ID and service ID.
Adds two columns: headway and headway_mins by calculating the time difference in ordered trip departures for
each unique route.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_to_dataframe(gtfs_day=gtfs_day).sort_values(
['route_id', 'trip_departure_time']).reset_index(drop=True)

year = int(gtfs_day[:4])
month = int(gtfs_day[4:6])
day = int(gtfs_day[6:8])
if from_time is not None:
hour, minute, second = list(map(int, from_time.split(':')))
df = df[df['trip_departure_time'] >= datetime(year, month, day, hour, minute, second)]
if to_time is not None:
hour, minute, second = list(map(int, to_time.split(':')))
df = df[df['trip_departure_time'] <= datetime(year, month, day, hour, minute, second)]

df = df.groupby('route_id').apply(get_headway)
df['headway_mins'] = (pd.to_timedelta(df['headway']).dt.total_seconds() / 60).fillna(0)
return df

def generate_trips_dataframe_from_headway(self, route_id, headway_spec: dict):
"""
Generates new trips and vehicles for the specified route.
@@ -1523,25 +1559,6 @@ def generate_trips_from_headway(self, route_id, headway_spec: dict):
self.vehicles = {**{veh_id: veh_type for veh_id in new_trips['vehicle_id']}, **self.vehicles}
list(map(self.vehicles.pop, old_vehicles - set(new_trips['vehicle_id'])))

def headway_stats(self, from_time=None, to_time=None, gtfs_day='19700101'):
"""
Generates a DataFrame calculating mean headway in minutes for all routes, with their service ID.
This can also be done for a specific time frame by specifying from_time and to_time (or just one of them).
:param from_time: "HH:MM:SS" format, used as lower time bound for subsetting
:param to_time: "HH:MM:SS" format, used as upper time bound for subsetting
:param gtfs_day: day used for GTFS when creating the network in YYYYMMDD format defaults to 19700101
:return:
"""
df = self.trips_headways(from_time=from_time, to_time=to_time, gtfs_day=gtfs_day)

df = df.groupby(['service_id', 'route_id', 'mode']).describe()
df = df['headway_mins'][['mean', 'std', 'max', 'min', 'count']].reset_index()
df = df.rename(
columns={'mean': 'mean_headway_mins', 'std': 'std_headway_mins', 'max': 'max_headway_mins',
'min': 'min_headway_mins', 'count': 'trip_count'}
)
return df

def unused_vehicles(self):
"""
A scenario change to the network may result in changes to vehicle assignments, with some vehicles not
@@ -3068,7 +3085,7 @@ def read_vehicle_types(yml):


def get_headway(group):
group['headway'] = group['trip_departure_time'].diff().fillna(pd.Timedelta(seconds=0))
group['headway'] = group['trip_departure_time'].diff()
return group


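A short usage sketch of the headway methods consolidated in genet/schedule_elements.py by this diff, assuming an existing genet Schedule object named `schedule`; per the commit messages the same methods are also exposed on Service and Route objects.

# Trip-level table with `headway` (Timedelta) and `headway_mins` columns; the
# first trip of each route is kept as NaT/NaN rather than a zero headway, so
# it is ignored by the summary statistics below.
headways = schedule.trips_headways(from_time='07:00:00', to_time='10:00:00')

# Per-route summary: mean/std/max/min headway in minutes plus a trip count,
# grouped by service_id (when present), route_id and mode.
stats = schedule.headway_stats(from_time='07:00:00', to_time='10:00:00')
print(stats[['route_id', 'mean_headway_mins', 'trip_count']].head())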
4 changes: 4 additions & 0 deletions genet/utils/dict_support.py
@@ -113,6 +113,10 @@ def combine_edge_data_lists(l1, l2):
return [(u, v, dat) for (u, v), dat in edges.items()]


def dict_to_string(d):
return str(d).replace('{', '').replace('}', '').replace("'", '').replace(' ', ':')


def dataframe_to_dict(df):
return {_id: {k: v for k, v in m.items() if notna(v)} for _id, m in df.to_dict().items()}

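Illustrative behaviour of the new `dict_to_string` helper, worked out from the implementation shown above; elsewhere in this PR it builds flat report keys for nested link attributes.

from genet.utils.dict_support import dict_to_string

dict_to_string({'attributes': 'osm:way:highway'})
# -> 'attributes::osm:way:highway'
dict_to_string({'attributes': {'osm:way:osmid': 'text'}})
# -> 'attributes::osm:way:osmid::text'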
19 changes: 17 additions & 2 deletions genet/utils/graph_operations.py
@@ -6,6 +6,7 @@
from anytree import Node, RenderTree

from genet.utils import pandas_helpers as pd_helpers
import genet.utils.dict_support as dict_support


class Filter:
@@ -209,6 +210,21 @@ def render_tree(root, data=False):
print("%s%s" % (pre, node.name))


def parse_leaf(leaf):
"""
:param leaf: anytree.node.node.Node
:return: str or dictionary with string key value pairs, for use as keys to extraction methods
"""
if leaf.depth > 1:
dict_path = {leaf.path[1].name: leaf.path[2].name}
if leaf.depth > 2:
for node in leaf.path[3:]:
dict_path = dict_support.nest_at_leaf(dict_path, node.name)
return dict_path
else:
return leaf.name


def get_attribute_data_under_key(iterator: Iterable, key: Union[str, dict]):
"""
Returns all data stored under key in attribute dictionaries for iterators yielding (index, attribute_dictionary),
@@ -256,8 +272,7 @@ def build_attribute_dataframe(iterator, keys: Union[list, str], index_name: str
for key in keys:
if isinstance(key, dict):
# consolidate nestedness to get a name for the column
name = str(key)
name = name.replace('{', '').replace('}', '').replace("'", '').replace(' ', ':')
name = dict_support.dict_to_string(key)
else:
name = key

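A toy illustration of the new `parse_leaf` helper using a hand-built anytree schema; in practice the tree comes from graph_operations.get_attribute_schema, and leaves nested deeper than two levels are handled via dict_support.nest_at_leaf (not shown in this diff).

from anytree import Node
from genet.utils.graph_operations import parse_leaf

root = Node('attribute')
length = Node('length', parent=root)
attributes = Node('attributes', parent=root)
highway = Node('osm:way:highway', parent=attributes)

parse_leaf(length)   # depth 1 -> returned as the plain string 'length'
parse_leaf(highway)  # depth 2 -> {'attributes': 'osm:way:highway'}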