Skip to content

Commit

Permalink
Validation zhuzh-up: link attribute value checks (#141)
Browse files Browse the repository at this point in the history
* refactor link length over 1km

* tidy up graph connectivity validation

* refactor length and zero value checks

* add checks for negative, infinite and fractional link attribute values

* fix to work with dictionary / nested attributes

* module rename, tidy up

* update logging to match the rest

* fix imports in tests

* address PR comments: Part 1: readability and conditions toolbox dataclass

* address PR comments: Part 2: chop up existing tests for validation report

* add condition for none values
  • Loading branch information
KasiaKoz authored Sep 23, 2022
1 parent aa316e1 commit adb9b10
Show file tree
Hide file tree
Showing 15 changed files with 484 additions and 160 deletions.
2 changes: 1 addition & 1 deletion example_data/api_requests_send.json

Large diffs are not rendered by default.

116 changes: 77 additions & 39 deletions genet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import genet.utils.plot as plot
import genet.utils.simplification as simplification
import genet.utils.spatial as spatial
import genet.validate.network_validation as network_validation
import genet.validate.network as network_validation
import geopandas as gpd
import networkx as nx
import numpy as np
Expand Down Expand Up @@ -942,7 +942,7 @@ def subgraph_on_link_conditions(self, conditions, how=any, mixed_dtypes=True):

def modes(self):
"""
Scans network for 'modes' attribute and returns list of all modes present int he network
Scans network for 'modes' attribute and returns list of all modes present in the network
:return:
"""
modes = set()
Expand Down Expand Up @@ -1999,55 +1999,60 @@ def invalid_network_routes(self):
return [route.id for route in self.schedule.routes() if
not route.has_network_route() or not self.is_valid_network_route(route)]

def generate_validation_report(self, link_length_threshold=1000):
def generate_validation_report(self, modes_for_strong_connectivity=None, link_metre_length_threshold=1000):
"""
Generates a dictionary with keys: 'graph', 'schedule' and 'routing' describing validity of the Network's
underlying graph, the schedule services and then the intersection of the two which is the routing of schedule
services onto the graph.
:param link_length_threshold: in meters defaults to 1000, i.e. 1km
:param modes_for_strong_connectivity: list of modes in the network that need to be checked for strong
connectivity. Defaults to 'car', 'walk' and 'bike'
:param link_metre_length_threshold: in meters defaults to 1000, i.e. 1km
:return:
"""
logging.info('Checking validity of the Network')
logging.info('Checking validity of the Network graph')
report = {}
# describe network connectivity
modes = ['car', 'walk', 'bike']
report['graph'] = {'graph_connectivity': {}}
for mode in modes:
logging.info(f'Checking network connectivity for mode: {mode}')
# subgraph for the mode to be tested
G_mode = self.modal_subgraph(mode)
# calculate how many connected subgraphs there are
report['graph']['graph_connectivity'][mode] = network_validation.describe_graph_connectivity(G_mode)

def links_over_threshold_length(value):
return value >= link_length_threshold

links_over_1km_length = self.extract_links_on_edge_attributes(
conditions={'length': links_over_threshold_length})

# describe network connectivity
if modes_for_strong_connectivity is None:
modes_for_strong_connectivity = ['car', 'walk', 'bike']
logging.info(f'Defaulting to checking graph connectivity for modes: {modes_for_strong_connectivity}. '
'You can change this by passing a `modes_for_strong_connectivity` param')
graph_connectivity = {}
for mode in modes_for_strong_connectivity:
graph_connectivity[mode] = self.check_connectivity_for_mode(mode)
report['graph'] = {'graph_connectivity': graph_connectivity}

# attribute checks
conditions_toolbox = network_validation.ConditionsToolbox()
report['graph']['link_attributes'] = {
'links_over_1km_length': {
'number_of': len(links_over_1km_length),
'percentage': len(links_over_1km_length) / self.graph.number_of_edges(),
'link_ids': links_over_1km_length
}
}
f'{k}_attributes': {} for k in conditions_toolbox.condition_names()}

def zero_value(value):
return (value == 0) or (value == '0') or (value == '0.0')

report['graph']['link_attributes']['zero_attributes'] = {}
for attrib in [d.name for d in graph_operations.get_attribute_schema(self.links()).descendants]:
links_with_zero_attrib = self.extract_links_on_edge_attributes(
conditions={attrib: zero_value}, mixed_dtypes=False)
if links_with_zero_attrib:
logging.warning(f'{len(links_with_zero_attrib)} of links have values of 0 for `{attrib}`')
report['graph']['link_attributes']['zero_attributes'][attrib] = {
'number_of': len(links_with_zero_attrib),
'percentage': len(links_with_zero_attrib) / self.graph.number_of_edges(),
'link_ids': links_with_zero_attrib
}
# checks on length attribute specifically
def links_over_threshold_length(value):
return value >= link_metre_length_threshold

report['graph']['link_attributes']['links_over_1000_length'] = self.report_on_link_attribute_condition(
'length', links_over_threshold_length)

# more general attribute value checks
non_testable = ['id', 'from', 'to', 's2_to', 's2_from', 'geometry']
link_attributes = [graph_operations.parse_leaf(leaf) for leaf in
graph_operations.get_attribute_schema(self.links()).leaves]
link_attributes = [attrib for attrib in link_attributes if attrib not in non_testable]
for attrib in link_attributes:
logging.info(f'Checking link values for `{attrib}`')
for condition_name in conditions_toolbox.condition_names():
links_satifying_condition = self.report_on_link_attribute_condition(
attrib, conditions_toolbox.get_condition_evaluator(condition_name))
if links_satifying_condition['number_of']:
logging.warning(
f'{links_satifying_condition["number_of"]} of links have '
f'{condition_name} values for `{attrib}`')
if isinstance(attrib, dict):
attrib = dict_support.dict_to_string(attrib)
report['graph']['link_attributes'][f'{condition_name}_attributes'][
attrib] = links_satifying_condition

if self.schedule:
report['schedule'] = self.schedule.generate_validation_report()
Expand All @@ -2066,6 +2071,39 @@ def zero_value(value):
}
return report

def report_on_link_attribute_condition(self, attribute, condition):
    """
    Reports on the links whose value stored under `attribute` satisfies `condition`
    :param attribute: one of the link attributes, e.g. 'length'. A dictionary is treated as a path to a nested
    attribute and the condition is placed at its leaf
    :param condition: callable, condition for link[attribute] to satisfy
    :return: dictionary with keys: 'number_of' (count of links satisfying the condition), 'percentage' (that count
    divided by the number of edges in the graph, 0.0 if the graph has no edges) and 'link_ids' (the links found)
    """
    if isinstance(attribute, dict):
        # work on a copy, so the dictionary passed by the caller is left as it was
        conditions = dict_support.nest_at_leaf(deepcopy(attribute), condition)
    else:
        conditions = {attribute: condition}

    links_satifying_condition = self.extract_links_on_edge_attributes(conditions=conditions)
    number_of_links_satisfying = len(links_satifying_condition)
    number_of_edges = self.graph.number_of_edges()
    # a graph without edges would otherwise raise ZeroDivisionError
    share = number_of_links_satisfying / number_of_edges if number_of_edges else 0.0
    return {
        'number_of': number_of_links_satisfying,
        'percentage': share,
        'link_ids': links_satifying_condition
    }

def check_connectivity_for_mode(self, mode):
    """
    Describes connectivity of the modal subgraph for `mode` and logs a summary of the findings
    :param mode: mode for which the modal subgraph is extracted and checked
    :return: dictionary returned by `network_validation.describe_graph_connectivity` for that subgraph
    """
    logging.info(f'Checking network connectivity for mode: {mode}')
    connectivity = network_validation.describe_graph_connectivity(self.modal_subgraph(mode))

    component_count = connectivity["number_of_connected_subgraphs"]
    problem_nodes = connectivity["problem_nodes"]
    dead_end_count = len(problem_nodes["dead_ends"])
    unreachable_count = len(problem_nodes["unreachable_node"])
    logging.info(f'The graph for mode: {mode} has: '
                 f'{component_count} connected components, '
                 f'{dead_end_count} sinks/dead_ends and '
                 f'{unreachable_count} sources/unreachable nodes.')

    # more than one component means the modal graph is split into parts
    if component_count > 1:
        logging.warning(f'The graph has more than one connected component for mode {mode}! '
                        'If this is not expected, consider using the `connect_components` method to connect the '
                        'components, or `retain_n_connected_subgraphs` with `n=1` to extract the largest component')
    return connectivity

def generate_standard_outputs(self, output_dir, gtfs_day='19700101', include_shp_files=False):
"""
Generates geojsons that can be used for generating standard kepler visualisations.
Expand Down
2 changes: 1 addition & 1 deletion genet/output/matsim_xml_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from copy import deepcopy
from pandas import DataFrame
from genet.output import sanitiser
from genet.validate.network_validation import validate_attribute_data
from genet.validate.network import validate_attribute_data
from genet.utils.spatial import encode_shapely_linestring_to_polyline
from genet.exceptions import MalformedAdditionalAttributeError
import genet.variables as variables
Expand Down
2 changes: 1 addition & 1 deletion genet/schedule_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import genet.utils.persistence as persistence
import genet.utils.plot as plot
import genet.utils.spatial as spatial
import genet.validate.schedule_validation as schedule_validation
import genet.validate.schedule as schedule_validation
import networkx as nx
import numpy as np
import pandas as pd
Expand Down
4 changes: 4 additions & 0 deletions genet/utils/dict_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def combine_edge_data_lists(l1, l2):
return [(u, v, dat) for (u, v), dat in edges.items()]


def dict_to_string(d):
    """
    Flattens the string representation of a dictionary into a single name, e.g. {'a': {'b': 'c'}} -> 'a::b::c'
    :param d: dictionary (can be nested)
    :return: str(d) with curly brackets and single quotes removed and every space replaced with ':'
    """
    # one pass: spaces become ':', the characters in the third argument are dropped
    flattening_table = str.maketrans(' ', ':', "{}'")
    return str(d).translate(flattening_table)


def dataframe_to_dict(df):
return {_id: {k: v for k, v in m.items() if notna(v)} for _id, m in df.to_dict().items()}

Expand Down
19 changes: 17 additions & 2 deletions genet/utils/graph_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from anytree import Node, RenderTree

from genet.utils import pandas_helpers as pd_helpers
import genet.utils.dict_support as dict_support


class Filter:
Expand Down Expand Up @@ -209,6 +210,21 @@ def render_tree(root, data=False):
print("%s%s" % (pre, node.name))


def parse_leaf(leaf):
    """
    Turns a leaf of an attribute schema tree into a key usable by the extraction methods
    :param leaf: anytree.node.node.Node
    :return: str (the leaf's name) for leaves at depth 1 or less, otherwise a dictionary nesting the names found
    along the leaf's path, with string key value pairs
    """
    if leaf.depth <= 1:
        return leaf.name

    # path[0] is skipped (presumably the schema root - confirm against get_attribute_schema)
    nested_key = {leaf.path[1].name: leaf.path[2].name}
    # empty slice for leaves at depth 2, so nothing more gets nested for them
    for deeper_node in leaf.path[3:]:
        nested_key = dict_support.nest_at_leaf(nested_key, deeper_node.name)
    return nested_key


def get_attribute_data_under_key(iterator: Iterable, key: Union[str, dict]):
"""
Returns all data stored under key in attribute dictionaries for iterators yielding (index, attribute_dictionary),
Expand Down Expand Up @@ -256,8 +272,7 @@ def build_attribute_dataframe(iterator, keys: Union[list, str], index_name: str
for key in keys:
if isinstance(key, dict):
# consolidate nestedness to get a name for the column
name = str(key)
name = name.replace('{', '').replace('}', '').replace("'", '').replace(' ', ':')
name = dict_support.dict_to_string(key)
else:
name = key

Expand Down
101 changes: 101 additions & 0 deletions genet/validate/network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import math
from dataclasses import dataclass, field, fields

import networkx as nx


def validate_attribute_data(attributes, necessary_attributes):
    """
    Checks that every one of `necessary_attributes` is present in `attributes`
    :param attributes: iterable of attribute names, e.g. a dictionary keyed by attribute name
    :param necessary_attributes: iterable of attribute names that have to be present
    :return: None
    :raises AttributeError: when at least one necessary attribute is missing
    """
    required, available = set(necessary_attributes), set(attributes)
    missing_attribs = required - available
    if not missing_attribs:
        return
    raise AttributeError(f'Attributes: {missing_attribs} missing from data: {attributes}')


def find_problem_nodes(G):
    """
    Finds nodes of G which cannot be left or cannot be reached
    :param G: directed graph exposing `nodes`, `in_degree(node)` and `out_degree(node)`
    :return: dictionary with keys: 'dead_ends' (nodes with out-degree of 0) and 'unreachable_node' (nodes with
    in-degree of 0)
    """
    nodes_without_exit = [node for node in G.nodes if G.out_degree(node) == 0]
    nodes_without_entry = [node for node in G.nodes if G.in_degree(node) == 0]
    return {
        'dead_ends': nodes_without_exit,
        'unreachable_node': nodes_without_entry,
    }


def find_connected_subgraphs(G):
    """
    Finds strongly connected components of G
    :param G: directed graph
    :return: list of (list of nodes in the component, number of nodes in the component) tuples, ordered from the
    largest component to the smallest
    """
    components = list(nx.strongly_connected_components(G))
    components.sort(key=len, reverse=True)
    return [(list(component), len(component)) for component in components]


def describe_graph_connectivity(G):
    """
    Computes dead ends and unreachable nodes in G. Computes strongly connected components of G
    :param G: directed graph
    :return: dictionary with keys: 'problem_nodes' (output of `find_problem_nodes`) and
    'number_of_connected_subgraphs' (number of components found by `find_connected_subgraphs`)
    """
    problem_nodes = find_problem_nodes(G)
    connected_subgraphs = find_connected_subgraphs(G)
    return {
        'problem_nodes': problem_nodes,
        'number_of_connected_subgraphs': len(connected_subgraphs),
    }


def evaluate_condition_for_floatable(value, condition):
    """
    Evaluates `condition` on `value` converted to float
    :param value: value to check, anything `float()` accepts, e.g. a number or a string such as '0.0'
    :param condition: callable taking a float
    :return: result of `condition(float(value))`, or False if a ValueError or TypeError is raised on the way
    (e.g. the value cannot be converted to float)
    """
    try:
        return condition(float(value))
    except (TypeError, ValueError):
        return False


def zero_value(value):
    """
    :param value: number to check
    :return: True if the value is equal to zero
    """
    is_zero = value == 0.0
    return is_zero


def negative_value(value):
    """
    :param value: number to check
    :return: True if the value is strictly below zero
    """
    return 0.0 > value


def infinity_value(value):
    """
    :param value: number to check
    :return: True if the value is positive or negative infinity
    """
    is_infinite = math.isinf(value)
    return is_infinite


def fractional_value(value):
    """
    :param value: number to check
    :return: True if the value lies strictly between 0 and 1
    """
    return 0.0 < value < 1.0


def none_condition(value):
    """
    :param value: value to check
    :return: True if the value is None or the string 'None'
    """
    none_like_values = (None, 'None')
    return value in none_like_values


@dataclass
class Condition:
    """
    Wraps a callable condition, which is applied to values as they are
    """
    # callable taking a single value
    condition: callable

    def evaluate(self, value):
        """
        :param value: value to check
        :return: whatever the wrapped condition returns for the value
        """
        check = self.condition
        return check(value)


@dataclass
class FloatCondition(Condition):
    """
    Condition which is applied to values after converting them to float
    """
    # callable taking a float
    condition: callable

    def evaluate(self, value):
        """
        :param value: value to check
        :return: result of `evaluate_condition_for_floatable` for the value and the wrapped condition
        """
        check = self.condition
        return evaluate_condition_for_floatable(value, check)


@dataclass()
class ConditionsToolbox:
    """
    Named collection of conditions used to check attribute values. Each field holds a condition; the field's name
    is the name under which the condition can be requested.
    """
    # Condition instances are unhashable (dataclasses with eq and without frozen), and from Python 3.11 dataclasses
    # refuse unhashable objects as plain field defaults - hence the default factories
    zero: Condition = field(default_factory=lambda: FloatCondition(zero_value))
    negative: Condition = field(default_factory=lambda: FloatCondition(negative_value))
    infinite: Condition = field(default_factory=lambda: FloatCondition(infinity_value))
    fractional: Condition = field(default_factory=lambda: FloatCondition(fractional_value))
    none: Condition = field(default_factory=lambda: Condition(none_condition))

    def condition_names(self) -> list:
        """
        :return: list of names of the conditions (dataclass fields) in the toolbox
        """
        return [toolbox_field.name for toolbox_field in fields(self)]

    def get_condition_evaluator(self, condition: str) -> callable:
        """
        :param condition: name of the condition, e.g. 'zero'
        :return: `evaluate` method of the condition stored under that name
        :raises NotImplementedError: if the toolbox does not hold a condition under that name
        """
        stored_conditions = vars(self)
        if condition not in stored_conditions:
            raise NotImplementedError(f'Condition {condition} is not defined.')
        return stored_conditions[condition].evaluate
37 changes: 0 additions & 37 deletions genet/validate/network_validation.py

This file was deleted.

File renamed without changes.
Loading

0 comments on commit adb9b10

Please sign in to comment.