From 43af03e312ef46c887fddc428b62b0063bb9bfd0 Mon Sep 17 00:00:00 2001
From: Arthur Burianne <123063239+aburianne@users.noreply.github.com>
Date: Mon, 18 Mar 2024 18:23:33 +0100
Subject: [PATCH] feat: use urban typology for activity chain matching (#209)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* alpha version

* alpha version with scripts add...

* fix: properly handling cities with districts

* config file update

* clean branch from extra config files for interlab use

* cleanup

* further cleanup

* make matching attributes configurable

* monkey patching openpyxl to read excel sheet

* make configurable

* add test data to ENTD

* add documentation

* update tests

* testing and egt

* update docs

---------

Co-authored-by: Arthur BURIANNE
Co-authored-by: Tarek Chouaki
Co-authored-by: Sebastian Hörl
---
 CHANGELOG.md                    |  2 +
 data/census/cleaned.py          | 18 +++++++-
 data/hts/egt/cleaned.py         | 21 ++++++++++
 data/hts/egt/filtered.py        | 13 ++++--
 data/hts/entd/cleaned.py        | 11 +++++
 data/hts/entd/filtered.py       |  2 +-
 data/hts/entd/raw.py            |  2 +-
 data/spatial/urban_type.py      | 73 +++++++++++++++++++++++++++++++++
 docs/population.md              | 28 +++++++++++++
 synthesis/population/matched.py | 40 ++++++++++++++----
 tests/test_determinism.py       | 12 +++++-
 tests/test_pipeline.py          | 37 +++++++++++++----
 tests/testdata.py               | 46 ++++++++++++++++-----
 13 files changed, 269 insertions(+), 36 deletions(-)
 create mode 100644 data/spatial/urban_type.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97c4914f..d5fca840 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@

 **Under development**

+- feat: make statistical matching attribute list configurable
+- feat: add urban type classification (unité urbaine)
 - feat: functionality to make use of INSEE population projection data
 - update: don't remove households with people not living/studying in Île-de-France anymore to be more consistent with other use cases
 - fix bug where always one household_id existed twice

diff --git a/data/census/cleaned.py b/data/census/cleaned.py
index 79e6c1cd..789d0adb 100644
--- a/data/census/cleaned.py
+++ b/data/census/cleaned.py
@@ -13,6 +13,9 @@ def configure(context):
     context.stage("data.census.raw")
     context.stage("data.spatial.codes")

+    if context.config("use_urban_type", False):
+        context.stage("data.spatial.urban_type")
+
 def execute(context):
     df = context.stage("data.census.raw")

@@ -96,7 +99,7 @@ def execute(context):
     # Consumption units
     df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")

-    return df[[
+    df = df[[
         "person_id", "household_id", "weight",
         "iris_id", "commune_id", "departement_id",
         "age", "sex", "couple",
         "studies", "number_of_vehicles", "household_size",
         "consumption_units", "socioprofessional_class"
     ]]
+
+    if context.config("use_urban_type"):
+        df_urban_type = context.stage("data.spatial.urban_type")[[
+            "commune_id", "urban_type"
+        ]]
+
+        # Impute urban type
+        df = pd.merge(df, df_urban_type, on = "commune_id", how = "left")
+        df.loc[df["commune_id"] == "undefined", "urban_type"] = "none"
+        df["commune_id"] = df["commune_id"].astype("category")
+        assert ~np.any(df["urban_type"].isna())
+
+    return df

diff --git a/data/hts/egt/cleaned.py b/data/hts/egt/cleaned.py
index ec7530af..197da72b 100644
--- a/data/hts/egt/cleaned.py
+++ b/data/hts/egt/cleaned.py
@@ -10,6 +10,9 @@ def configure(context):
     context.stage("data.hts.egt.raw")

+    if context.config("use_urban_type", False):
+        context.stage("data.spatial.urban_type")
+
 INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]

 PURPOSE_MAP = {
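For orientation, the census hunk above left-joins `urban_type` onto each person via `commune_id` and then explicitly resolves undefined communes to `"none"`. A minimal standalone sketch of that imputation step (toy data, not taken from the pipeline):

```python
import pandas as pd

# Toy stand-ins for the census output and the data.spatial.urban_type stage
df = pd.DataFrame({
    "person_id": [1, 2, 3],
    "commune_id": ["75101", "13201", "undefined"],
})

df_urban_type = pd.DataFrame({
    "commune_id": ["75101", "13201"],
    "urban_type": ["central_city", "central_city"],
})

# The left join keeps every census row; unknown communes come back as NaN ...
df = pd.merge(df, df_urban_type, on = "commune_id", how = "left")

# ... which is then resolved explicitly for undefined communes
df.loc[df["commune_id"] == "undefined", "urban_type"] = "none"

assert not df["urban_type"].isna().any()
print(df)
```

The assertion in the patch only passes because every NaN produced by the join has been filled; persons in a known commune that is missing from the urban type file would still trip it.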
@@ -111,6 +114,24 @@ def execute(context):
     df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
     df_households["income_class"] = df_households["income_class"].astype(int)

+    # Impute urban type
+    if context.config("use_urban_type"):
+        df_urban_type = context.stage("data.spatial.urban_type")[[
+            "commune_id", "urban_type"
+        ]]
+
+        # Household municipality
+        df_households["commune_id"] = df_households["RESCOMM"].astype("category")
+        df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
+        assert np.all(~df_persons["commune_id"].isna())
+
+        # Impute urban type
+        df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
+        df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")
+
+        df_households = df_households.drop(columns = ["commune_id"])
+        df_persons = df_persons.drop(columns = ["commune_id"])
+
     # Trip purpose
     df_trips["following_purpose"] = "other"
     df_trips["preceding_purpose"] = "other"

diff --git a/data/hts/egt/filtered.py b/data/hts/egt/filtered.py
index 62cf1b8f..c2286055 100644
--- a/data/hts/egt/filtered.py
+++ b/data/hts/egt/filtered.py
@@ -12,7 +12,6 @@ def configure(context):

 def execute(context):
     df_codes = context.stage("data.spatial.codes")
-    assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense

     df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

@@ -39,9 +38,15 @@ def execute(context):
     df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

     # Finish up
-    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]]
-    df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]]
-    df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]]
+    household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
+    df_households = df_households[household_columns]
+
+    person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
+    if "urban_type" in df_persons: person_columns.append("urban_type")
+    df_persons = df_persons[person_columns]
+
+    trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
+    df_trips = df_trips[trip_columns]

     hts.check(df_households, df_persons, df_trips)

diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py
index 840d1b96..a8814c94 100644
--- a/data/hts/entd/cleaned.py
+++ b/data/hts/entd/cleaned.py
@@ -115,6 +115,17 @@ def execute(context):
     df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category")
     df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category")

+    # Clean urban type
+    df_households["urban_type"] = df_households["numcom_UU2010"].replace({
+        "B": "suburb",
+        "C": "central_city",
+        "I": "isolated_city",
+        "R": "none"
+    })
+
+    assert np.all(~df_households["urban_type"].isna())
+    df_households["urban_type"] = df_households["urban_type"].astype("category")
+
     # Clean employment
     df_persons["employed"] = df_persons["SITUA"].isin([1, 2])
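The ENTD hunk above maps the raw `numcom_UU2010` codes to the pipeline's labels; note that ENTD encodes municipalities outside any unité urbaine as `R`, while the INSEE UU file itself uses `H` (see `data/spatial/urban_type.py` below) — both end up as the label `none`. A quick standalone check of the `replace` call (toy series, not from the survey):

```python
import pandas as pd

codes = pd.Series(["B", "C", "I", "R", "B"])

urban_type = codes.replace({
    "B": "suburb",
    "C": "central_city",
    "I": "isolated_city",
    "R": "none"
}).astype("category")

# Every raw code is covered, so no NaN survives the mapping
assert not urban_type.isna().any()
print(sorted(urban_type.cat.categories))
```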
diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py
index d7c8acf7..9fc4793c 100644
--- a/data/hts/entd/filtered.py
+++ b/data/hts/entd/filtered.py
@@ -33,7 +33,7 @@ def execute(context):
     df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

     # Finish up
-    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]]
+    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
     df_persons = df_persons[hts.PERSON_COLUMNS]
     df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]]

diff --git a/data/hts/entd/raw.py b/data/hts/entd/raw.py
index 27f525bc..f4bdd91a 100644
--- a/data/hts/entd/raw.py
+++ b/data/hts/entd/raw.py
@@ -14,7 +14,7 @@ Q_TCM_MENAGE_COLUMNS = [
     "NPERS", "PONDV1",
     "TrancheRevenuMensuel",
-    "DEP", "idENT_MEN", "RG"
+    "DEP", "idENT_MEN", "RG", "numcom_UU2010"
 ]

 Q_INDIVIDU_COLUMNS = [

diff --git a/data/spatial/urban_type.py b/data/spatial/urban_type.py
new file mode 100644
index 00000000..7e5c0c26
--- /dev/null
+++ b/data/spatial/urban_type.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import os
+import zipfile
+import numpy as np
+
+# START Monkey patching openpyxl to parse INSEE file
+from openpyxl.styles.colors import WHITE, RGB
+__old_rgb_set__ = RGB.__set__
+
+def __rgb_set_fixed__(self, instance, value):
+    try:
+        __old_rgb_set__(self, instance, value)
+    except ValueError as e:
+        if e.args[0] == 'Colors must be aRGB hex values':
+            __old_rgb_set__(self, instance, WHITE)
+
+RGB.__set__ = __rgb_set_fixed__
+# END Monkey patching openpyxl
+
+# Loads the input data for the urban type (unité urbaine)
+
+def configure(context):
+    context.stage("data.spatial.municipalities")
+
+    context.config("data_path")
+    context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")
+
+def execute(context):
+    with zipfile.ZipFile("{}/{}".format(
+        context.config("data_path"), context.config("urban_type_path"))) as archive:
+        assert len(archive.filelist) == 1
+        with archive.open(archive.filelist[0]) as f:
+            df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)
+
+    df = df[["CODGEO", "STATUT_2017"]].copy()
+    df = df.set_axis(["commune_id", "urban_type"], axis = "columns")
+
+    # Cities that have districts are not detailed in the UU file; only the whole city is mentioned.
+    # However, the municipalities file details the districts with their respective INSEE codes.
+    cities_with_districts = {"75056": [str(75101 + i) for i in range(20)], # Paris
+                             "69123": [str(69001 + i) for i in range(9)], # Lyon
+                             "13055": [str(13201 + i) for i in range(15)]} # Marseilles
+
+    # Replace each line of the UU file corresponding to a city with districts by multiple lines, one per district
+    for city_code in cities_with_districts:
+        base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
+        replacement_codes = cities_with_districts[city_code]
+
+        df = pd.concat([df, pd.DataFrame({
+            "commune_id": replacement_codes,
+            "urban_type": [base_type] * len(replacement_codes)
+        })])
+
+    df = df[~df["commune_id"].isin(cities_with_districts.keys())]
+
+    # Clean unités urbaines
+    df["urban_type"] = df["urban_type"].replace({"B": "suburb", "C": "central_city", "I": "isolated_city", "H": "none"})
+    assert np.all(~df["urban_type"].isna())
+    df["urban_type"] = df["urban_type"].astype("category")
+
+    df_municipalities = context.stage("data.spatial.municipalities")
+    requested_communes = set(df_municipalities["commune_id"].unique())
+    df = df[df["commune_id"].isin(requested_communes)]
+
+    assert len(df["commune_id"].unique()) == len(df)
+
+    return df
+
+def validate(context):
+    if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))):
+        raise RuntimeError("Urban type data is not available")
+
+    return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path")))
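The district handling in the new stage above replaces the single UU row for Paris, Lyon, and Marseilles by one row per arrondissement, since the municipalities file works at arrondissement level. The same logic, isolated as a toy example (only Paris shown, hypothetical frame):

```python
import pandas as pd

df = pd.DataFrame({
    "commune_id": ["75056", "91477"],
    "urban_type": ["central_city", "suburb"],
})

cities_with_districts = {
    "75056": [str(75101 + i) for i in range(20)],  # Paris arrondissements
}

for city_code, district_codes in cities_with_districts.items():
    # Every district inherits the urban type of the whole city
    base_type = df.loc[df["commune_id"] == city_code, "urban_type"].iloc[0]
    df = pd.concat([df, pd.DataFrame({
        "commune_id": district_codes,
        "urban_type": base_type,
    })])

# Drop the city-level rows once the district rows exist
df = df[~df["commune_id"].isin(cities_with_districts.keys())]

assert "75056" not in set(df["commune_id"])
assert (df[df["commune_id"].str.startswith("751")]["urban_type"] == "central_city").all()
```

The final uniqueness assertion in the stage (`len(df["commune_id"].unique()) == len(df)`) guards exactly this step: it would fail if a district code ever appeared in both the UU file and the expansion.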
available") + + return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))) diff --git a/docs/population.md b/docs/population.md index 666d1b1a..8717434c 100644 --- a/docs/population.md +++ b/docs/population.md @@ -313,3 +313,31 @@ config: # [...] projection_scenario: 00_central ``` + +### Urban type + +The pipeline allows to work with INSEE's urban type classification (unité urbaine) that distinguishes municipalities in *center cities*, *suburbs*, *isolated cities*, and unclassified ones. To impute the data (currently only for some HTS), activate it via the configuration: + +```yaml +config: + # [...] + use_urban_type: true +``` + +In order to make use of it for activity chain matching, you can set a custom list of matching attributes like so: + +```yaml +config: + # [...] + matching_attributes: ["urban_type", "*default*"] +``` + +The `*default*` trigger will be replaced by the default list of matching attributes. + +Note that not all HTS implement the urban type, so matching may not work with some implementations. Most of them, however, contain the data, we just need to update the code to read them in. + +To make use of the urban type, the following data is needed: +- [Download the urban type data from INSEE](https://www.insee.fr/fr/information/4802589). The pipeline is currently compatible with the 2023 data set (referencing 2020 boundaries). +- Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip` + +Then, you should be able to run the pipeline with the configuration explained above. diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 586de567..18647e72 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -19,10 +19,16 @@ "entd": data.hts.entd.cleaned.calculate_income_class, } +DEFAULT_MATCHING_ATTRIBUTES = [ + "sex", "any_cars", "age_class", "socioprofessional_class", + "departement_id" +] + def configure(context): context.config("processes") context.config("random_seed") context.config("matching_minimum_observations", 20) + context.config("matching_attributes", DEFAULT_MATCHING_ATTRIBUTES) context.stage("synthesis.population.sampled") context.stage("synthesis.population.income") @@ -112,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ progress.update(np.count_nonzero(unassigned_mask)) + if np.count_nonzero(unassigned_mask) > 0: + raise RuntimeError("Some target observations could not be matched. 
@@ -165,27 +174,40 @@ def execute(context):
     df_target = context.stage("synthesis.population.sampled")

+    columns = context.config("matching_attributes")
+
+    try:
+        default_index = columns.index("*default*")
+        columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
+    except ValueError: pass
+
     # Define matching attributes
     AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000]
-    df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
-    df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)
+
+    if "age_class" in columns:
+        df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
+        df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)

-    if "income_class" in df_source:
+    if "income_class" in columns:
         df_income = context.stage("synthesis.population.income")[["household_id", "household_income"]]
         df_target = pd.merge(df_target, df_income)
         df_target["income_class"] = INCOME_CLASS[hts](df_target)

-    df_target["any_cars"] = df_target["number_of_vehicles"] > 0
-    df_source["any_cars"] = df_source["number_of_vehicles"] > 0
-
-    columns = ["sex", "any_cars", "age_class", "socioprofessional_class"]
-    if "income_class" in df_source: columns += ["income_class"]
-    columns += ["departement_id"]
+    if "any_cars" in columns:
+        df_target["any_cars"] = df_target["number_of_vehicles"] > 0
+        df_source["any_cars"] = df_source["number_of_vehicles"] > 0

     # Perform statistical matching
     df_source = df_source.rename(columns = { "person_id": "hts_id" })

+    for column in columns:
+        if column not in df_source:
+            raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))
+
+        if column not in df_target:
+            raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))
+
     df_assignment, levels = parallel_statistical_matching(
         context,
         df_source, "hts_id", "person_weight",

diff --git a/tests/test_determinism.py b/tests/test_determinism.py
index ba02a407..bf43e5ed 100644
--- a/tests/test_determinism.py
+++ b/tests/test_determinism.py
@@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir):
         regions = [10, 11], sampling_rate = 1.0, hts = "entd",
         random_seed = 1000, processes = 1,
         secloc_maximum_iterations = 10,
-        maven_skip_tests = True
+        maven_skip_tests = True,
+        matching_attributes = [
+            "sex", "any_cars", "age_class", "socioprofessional_class",
+            "income_class", "departement_id"
+        ]
     )

     stages = [
@@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir):
         regions = [10, 11], sampling_rate = 1.0, hts = "entd",
         random_seed = 1000, processes = 1,
         secloc_maximum_iterations = 10,
-        maven_skip_tests = True
+        maven_skip_tests = True,
+        matching_attributes = [
+            "sex", "any_cars", "age_class", "socioprofessional_class",
+            "income_class", "departement_id"
+        ]
     )

     stages = [
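In the `execute` hunk of `synthesis/population/matched.py` above, `age_class` is derived with `np.digitize` against fixed boundaries; with `right = True`, each boundary value still falls into the lower class. A quick standalone check of the binning (ages chosen for illustration):

```python
import numpy as np

AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000]

ages = np.array([10, 14, 15, 44, 45, 74, 75])
classes = np.digitize(ages, AGE_BOUNDARIES, right = True)

# 14 still belongs to class 0, 15 opens class 1, and so on
assert classes.tolist() == [0, 0, 1, 2, 3, 4, 5]
```

Since both source and target are binned with the same call, the classes line up exactly during matching.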
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 430a277f..fa8fddd8 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -2,6 +2,7 @@
 import os
 import hashlib
 from . import testdata
+import pandas as pd

 def test_data(tmpdir):
     data_path = str(tmpdir.mkdir("data"))
     testdata.create(data_path)
@@ -34,7 +35,7 @@ def test_data(tmpdir):
     assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path)
     assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path)

-def run_population(tmpdir, hts, mode_choice):
+def run_population(tmpdir, hts, update = {}):
     data_path = str(tmpdir.mkdir("data"))
     testdata.create(data_path)
@@ -45,9 +46,9 @@ def run_population(tmpdir, hts, mode_choice):
         regions = [10, 11], sampling_rate = 1.0, hts = hts,
         random_seed = 1000, processes = 1,
         secloc_maximum_iterations = 10,
-        maven_skip_tests = True,
-        mode_choice = mode_choice
+        maven_skip_tests = True
     )
+    config.update(update)

     stages = [
         dict(descriptor = "synthesis.output"),
@@ -62,11 +63,33 @@ def run_population(tmpdir, hts, mode_choice):
     assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path)
     assert os.path.isfile("%s/ile_de_france_meta.json" % output_path)

+    assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";"))
+    assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";"))
+    assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";"))
+
 def test_population_with_entd(tmpdir):
-    run_population(tmpdir, "entd", False)
+    run_population(tmpdir, "entd")
+
+def test_population_with_egt(tmpdir):
+    run_population(tmpdir, "egt")

 def test_population_with_mode_choice(tmpdir):
-    run_population(tmpdir, "entd", True)
+    run_population(tmpdir, "entd", { "mode_choice": True })
+
+def test_population_with_urban_type(tmpdir):
+    run_population(tmpdir, "entd", {
+        "use_urban_type": True,
+        "matching_attributes": [
+            "urban_type", "*default*"
+        ],
+        "matching_minimum_observations": 5
+    })

-#def test_population_with_egt(tmpdir):
-#    run_population(tmpdir, "entd") # TODO: Fix this!
+def test_population_with_urban_type_and_egt(tmpdir):
+    run_population(tmpdir, "egt", {
+        "use_urban_type": True,
+        "matching_attributes": [
+            "urban_type", "*default*"
+        ],
+        "matching_minimum_observations": 5
+    })
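The reworked `run_population` helper above replaces the dedicated `mode_choice` flag by a generic override dictionary that is merged over the base configuration, so each test variant only states what differs. A small sketch of that composition (values mirror the tests, configuration keys abridged):

```python
# Base configuration shared by all population tests
config = dict(
    regions = [10, 11], sampling_rate = 1.0, hts = "entd",
    random_seed = 1000, processes = 1
)

# Test-specific overrides are merged on top, as run_population does
config.update({
    "use_urban_type": True,
    "matching_attributes": ["urban_type", "*default*"],
    "matching_minimum_observations": 5
})

assert config["use_urban_type"] and config["hts"] == "entd"
```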
diff --git a/tests/testdata.py b/tests/testdata.py
index 59e5a1cb..1db0d225 100644
--- a/tests/testdata.py
+++ b/tests/testdata.py
@@ -301,7 +301,7 @@ def create(output_path):
                 "De 1 000", "De 1 200", "De 1 500", "De 1800",
                 "De 2 000", "De 2 500", "De 3 000",
                 "De 4 000", "De 6 000", "10 000"
-            ])
+            ]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4]
         ))

         for person_index in range(HTS_HOUSEHOLD_MEMBERS):
@@ -388,8 +388,9 @@ def create(output_path):
         trips = []
     )

+    person_index = 0
     for household_index in range(HTS_HOUSEHOLDS):
-        household_id = household_index
+        household_id = household_index * 1000 + 50
         municipality = random.choice(df["municipality"].unique())
         region = df[df["municipality"] == municipality]["region"].values[0]
@@ -402,8 +403,7 @@
             MNP = 3, REVENU = random.randint(12)
         ))

-        for person_index in range(HTS_HOUSEHOLD_MEMBERS):
-            person_id = household_id * 1000 + person_index
+        for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1):
             studies = random.random_sample() < 0.3

             data["persons"].append(dict(
@@ -421,7 +421,7 @@
             work_region = df[df["municipality"] == work_municipality]["region"].values[0]
             work_department = df[df["municipality"] == work_municipality]["department"].values[0]

-            purpose = 21 if studies else 11
+            purpose = 4 if studies else 2
             mode = random.choice([1, 2, 3, 5, 7])

             origin_hour = 8
             origin_minute = 0
@@ -429,7 +429,7 @@
             if person_index % 100 == 0:
                 # Testing proper diffusion of plan times
-                orign_hour = 0
+                origin_hour = 0
                 origin_minute = 12

             data["trips"].append(dict(
@@ -442,18 +442,27 @@
             data["trips"].append(dict(
                 NQUEST = household_id, NP = person_id,
-                ND = 1, ORDEP = work_department, DESTDEP = home_department,
+                ND = 2, ORDEP = work_department, DESTDEP = home_department,
                 ORH = 8, ORM = 0, DESTH = 9, DESTM = 0, ORCOMM = work_municipality,
                 DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
-                DESTMOT_H9 = 31, ORMOT_H9 = purpose
+                DESTMOT_H9 = 5, ORMOT_H9 = purpose
             ))

             data["trips"].append(dict(
                 NQUEST = household_id, NP = person_id,
-                ND = 2, ORDEP = home_department, DESTDEP = home_department,
+                ND = 3, ORDEP = home_department, DESTDEP = home_department,
                 ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality,
                 DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
-                DESTMOT_H9 = 1, ORMOT_H9 = 31
+                DESTMOT_H9 = 1, ORMOT_H9 = 5
             ))

+            # Tail
+            data["trips"].append(dict(
+                NQUEST = household_id, NP = person_id,
+                ND = 4, ORDEP = home_department, DESTDEP = home_department,
+                ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality,
+                DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
+                DESTMOT_H9 = 5, ORMOT_H9 = 1
+            ))

     os.mkdir("%s/egt_2010" % output_path)
@@ -657,7 +666,22 @@ def create(output_path):
     df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'})
-
+    # Data set: Urban type
+    print("Creating urban type ...")
+    df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" })
+    df_urban_type = df_urban_type.drop_duplicates()
+    df_urban_type["STATUT_2017"] = [["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))]
+
+    df_urban_type = pd.concat([df_urban_type, pd.DataFrame({
+        "CODGEO": ["75056", "69123", "13055"],
+        "STATUT_2017": ["C", "C", "C"]
+    })])
+
+    os.mkdir("%s/urban_type" % output_path)
+    with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive:
+        with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f:
+            df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False)
+
     # Data set: OSM
     # We add a road grid of 500m
     print("Creating OSM ...")
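The urban type fixture above writes the sheet with `startrow = 5` precisely so that `data.spatial.urban_type` can read it back with `skiprows = 5`, mimicking the five header rows of the real INSEE file. A standalone round-trip check of that convention (hypothetical file name, written to the working directory):

```python
import zipfile
import pandas as pd

df_out = pd.DataFrame({ "CODGEO": ["75101", "91477"], "STATUT_2017": ["C", "B"] })

# Write the sheet into a zip, offset by five rows like the INSEE layout
with zipfile.ZipFile("UU2020_au_01-01-2023.zip", "w") as archive:
    with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f:
        df_out.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False)

# Read it back exactly the way the pipeline stage does
with zipfile.ZipFile("UU2020_au_01-01-2023.zip") as archive:
    with archive.open(archive.filelist[0]) as f:
        df_in = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)

# The five skipped rows match the five-row offset used when writing
assert df_in["CODGEO"].tolist() == ["75101", "91477"]
```

If the two offsets ever drift apart, the stage's column selection on `CODGEO`/`STATUT_2017` fails immediately, which makes the convention easy to verify in tests.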