feat: use urban typology for activity chain matching (#209)

* alpha version

* alpha version with scripts add...

* fix: properly handling cities with districts

* config file update

* clean branch from extra config files for interlab use

* cleanup

* further cleanup

* make matching attributes configurable

* monkey patching openpyxl to read excel sheet

* make configurable

* add test data to ENTD

* add documentation

* update tests

* testing and egt

* update docs

---------

Co-authored-by: Arthur BURIANNE <arthur.burianne@irtsystemx.fr>
Co-authored-by: Tarek Chouaki <tarek.chouaki@irt-systemx.fr>
Co-authored-by: Sebastian Hörl <hoerl.sebastian@gmail.com>
4 people committed Mar 18, 2024
1 parent 3c9b137 commit 43af03e
Showing 13 changed files with 269 additions and 36 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,8 @@

**Under development**

- feat: make statistical matching attribute list configurable
- feat: add urban type classification (unité urbaine)
- feat: functionality to make use of INSEE population projection data
- update: don't remove households with people not living/studying in Île-de-France anymore, to be more consistent with other use cases
- fix bug where one household_id always existed twice
18 changes: 17 additions & 1 deletion data/census/cleaned.py
@@ -13,6 +13,9 @@ def configure(context):
    context.stage("data.census.raw")
    context.stage("data.spatial.codes")

    if context.config("use_urban_type", False):
        context.stage("data.spatial.urban_type")

def execute(context):
    df = context.stage("data.census.raw")

@@ -96,11 +99,24 @@ def execute(context):
    # Consumption units
    df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")

-    return df[[
+    df = df[[
        "person_id", "household_id", "weight",
        "iris_id", "commune_id", "departement_id",
        "age", "sex", "couple",
        "commute_mode", "employed",
        "studies", "number_of_vehicles", "household_size",
        "consumption_units", "socioprofessional_class"
    ]]

    if context.config("use_urban_type"):
        df_urban_type = context.stage("data.spatial.urban_type")[[
            "commune_id", "urban_type"
        ]]

        # Impute urban type
        df = pd.merge(df, df_urban_type, on = "commune_id", how = "left")
        df.loc[df["commune_id"] == "undefined", "urban_type"] = "none"
        df["commune_id"] = df["commune_id"].astype("category")
        assert ~np.any(df["urban_type"].isna())

    return df
21 changes: 21 additions & 0 deletions data/hts/egt/cleaned.py
@@ -10,6 +10,9 @@
def configure(context):
    context.stage("data.hts.egt.raw")

    if context.config("use_urban_type", False):
        context.stage("data.spatial.urban_type")

INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]

PURPOSE_MAP = {
@@ -111,6 +114,24 @@ def execute(context):
    df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
    df_households["income_class"] = df_households["income_class"].astype(int)

    # Impute urban type
    if context.config("use_urban_type"):
        df_urban_type = context.stage("data.spatial.urban_type")[[
            "commune_id", "urban_type"
        ]]

        # Household municipality
        df_households["commune_id"] = df_households["RESCOMM"].astype("category")
        df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
        assert np.all(~df_persons["commune_id"].isna())

        # Impute urban type onto persons
        df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
        df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")

        df_households = df_households.drop(columns = ["commune_id"])
        df_persons = df_persons.drop(columns = ["commune_id"])

    # Trip purpose
    df_trips["following_purpose"] = "other"
    df_trips["preceding_purpose"] = "other"
13 changes: 9 additions & 4 deletions data/hts/egt/filtered.py
@@ -12,7 +12,6 @@ def configure(context):

def execute(context):
    df_codes = context.stage("data.spatial.codes")
-    assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense

    df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

@@ -39,9 +38,15 @@
    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

    # Finish up
-    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]]
-    df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]]
-    df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]]
+    household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
+    df_households = df_households[household_columns]
+
+    person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
+    if "urban_type" in df_persons: person_columns.append("urban_type")
+    df_persons = df_persons[person_columns]
+
+    trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
+    df_trips = df_trips[trip_columns]

    hts.check(df_households, df_persons, df_trips)

11 changes: 11 additions & 0 deletions data/hts/entd/cleaned.py
@@ -115,6 +115,17 @@ def execute(context):
    df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category")
    df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category")

    # Clean urban type
    df_households["urban_type"] = df_households["numcom_UU2010"].replace({
        "B": "suburb",
        "C": "central_city",
        "I": "isolated_city",
        "R": "none"
    })

    assert np.all(~df_households["urban_type"].isna())
    df_households["urban_type"] = df_households["urban_type"].astype("category")

    # Clean employment
    df_persons["employed"] = df_persons["SITUA"].isin([1, 2])
2 changes: 1 addition & 1 deletion data/hts/entd/filtered.py
@@ -33,7 +33,7 @@ def execute(context):
    df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

    # Finish up
-    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"]]
+    df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
    df_persons = df_persons[hts.PERSON_COLUMNS]
    df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]]

2 changes: 1 addition & 1 deletion data/hts/entd/raw.py
@@ -14,7 +14,7 @@

Q_TCM_MENAGE_COLUMNS = [
    "NPERS", "PONDV1", "TrancheRevenuMensuel",
-    "DEP", "idENT_MEN", "RG"
+    "DEP", "idENT_MEN", "RG", "numcom_UU2010"
]

Q_INDIVIDU_COLUMNS = [
73 changes: 73 additions & 0 deletions data/spatial/urban_type.py
@@ -0,0 +1,73 @@
import pandas as pd
import os
import zipfile
import numpy as np

# START Monkey patching openpyxl to parse INSEE file
from openpyxl.styles.colors import WHITE, RGB
__old_rgb_set__ = RGB.__set__

def __rgb_set_fixed__(self, instance, value):
    try:
        __old_rgb_set__(self, instance, value)
    except ValueError as e:
        if e.args[0] == 'Colors must be aRGB hex values':
            __old_rgb_set__(self, instance, WHITE)

RGB.__set__ = __rgb_set_fixed__
# END Monkey patching openpyxl

# Loads the input data for the urban type (unité urbaine)

def configure(context):
    context.stage("data.spatial.municipalities")

    context.config("data_path")
    context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")

def execute(context):
    with zipfile.ZipFile("{}/{}".format(
        context.config("data_path"), context.config("urban_type_path"))) as archive:
        assert len(archive.filelist) == 1
        with archive.open(archive.filelist[0]) as f:
            df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)

    df = df[["CODGEO", "STATUT_2017"]].copy()
    df = df.set_axis(["commune_id", "urban_type"], axis = "columns")

    # Cities that have districts are not detailed in the UU file, only the whole city is mentioned
    # However, the municipalities file details the districts with their respective INSEE codes
    cities_with_districts = {
        "75056": [str(75101 + i) for i in range(20)], # Paris
        "69123": [str(69001 + i) for i in range(9)],  # Lyon
        "13055": [str(13201 + i) for i in range(15)]  # Marseille
    }

    # Replace each line of the UU file corresponding to a city with districts
    # by multiple lines, one per district
    for city_code in cities_with_districts:
        base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
        replacement_codes = cities_with_districts[city_code]

        df = pd.concat([df, pd.DataFrame({
            "commune_id": replacement_codes,
            "urban_type": [base_type] * len(replacement_codes)
        })])

    df = df[~df["commune_id"].isin(cities_with_districts.keys())]

    # Clean unités urbaines
    df["urban_type"] = df["urban_type"].replace({
        "B": "suburb", "C": "central_city", "I": "isolated_city", "H": "none"
    })
    assert np.all(~df["urban_type"].isna())
    df["urban_type"] = df["urban_type"].astype("category")

    df_municipalities = context.stage("data.spatial.municipalities")
    requested_communes = set(df_municipalities["commune_id"].unique())
    df = df[df["commune_id"].isin(requested_communes)]

    assert len(df["commune_id"].unique()) == len(df)

    return df

def validate(context):
    if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))):
        raise RuntimeError("Urban type data is not available")

    return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path")))
28 changes: 28 additions & 0 deletions docs/population.md
@@ -313,3 +313,31 @@ config:
  # [...]
  projection_scenario: 00_central
```

### Urban type

The pipeline can make use of INSEE's urban type classification (*unité urbaine*), which distinguishes municipalities as *central cities*, *suburbs*, *isolated cities*, and unclassified ones. To impute the data (currently only available for some HTS), activate it via the configuration:

```yaml
config:
  # [...]
  use_urban_type: true
```

In order to make use of it for activity chain matching, you can set a custom list of matching attributes like so:

```yaml
config:
  # [...]
  matching_attributes: ["urban_type", "*default*"]
```

The `*default*` trigger will be replaced by the default list of matching attributes (`sex`, `any_cars`, `age_class`, `socioprofessional_class`, `departement_id`).
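
For illustration, a minimal sketch of this expansion, mirroring the logic added to `synthesis/population/matched.py` further down in this commit:

```python
DEFAULT_MATCHING_ATTRIBUTES = [
    "sex", "any_cars", "age_class", "socioprofessional_class",
    "departement_id"
]

# The configured attribute list, e.g. from the yaml example above
columns = ["urban_type", "*default*"]

# Replace the *default* placeholder in-place by the default attributes
try:
    index = columns.index("*default*")
    columns[index:index + 1] = DEFAULT_MATCHING_ATTRIBUTES
except ValueError:
    pass  # no placeholder configured, use the list as-is

# columns == ["urban_type", "sex", "any_cars", "age_class",
#             "socioprofessional_class", "departement_id"]
```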

Note that not all HTS implementations provide the urban type yet, so matching may not work with all of them. Most of the surveys contain the data, however; the code only needs to be updated to read it in.

To make use of the urban type, the following data is needed:
- [Download the urban type data from INSEE](https://www.insee.fr/fr/information/4802589). The pipeline is currently compatible with the 2023 data set (referencing 2020 boundaries).
- Put the downloaded *zip* file into `data/urban_type`, so you will have the file `data/urban_type/UU2020_au_01-01-2023.zip`. The expected location can be adjusted, as shown below.
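
The archive location relative to `data_path` can be changed via the `urban_type_path` option; the default, as declared in the new `data/spatial/urban_type.py` stage, corresponds to:

```yaml
config:
  # [...]
  urban_type_path: urban_type/UU2020_au_01-01-2023.zip
```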

Then, you should be able to run the pipeline with the configuration explained above.
40 changes: 31 additions & 9 deletions synthesis/population/matched.py
@@ -19,10 +19,16 @@
    "entd": data.hts.entd.cleaned.calculate_income_class,
}

DEFAULT_MATCHING_ATTRIBUTES = [
    "sex", "any_cars", "age_class", "socioprofessional_class",
    "departement_id"
]

def configure(context):
    context.config("processes")
    context.config("random_seed")
    context.config("matching_minimum_observations", 20)
    context.config("matching_attributes", DEFAULT_MATCHING_ATTRIBUTES)

    context.stage("synthesis.population.sampled")
    context.stage("synthesis.population.income")
@@ -112,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ

    progress.update(np.count_nonzero(unassigned_mask))

    if np.count_nonzero(unassigned_mask) > 0:
        raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?")

    assert np.count_nonzero(unassigned_mask) == 0
    assert np.count_nonzero(assigned_indices == -1) == 0

@@ -165,27 +174,40 @@ def execute(context):

    df_target = context.stage("synthesis.population.sampled")

+    columns = context.config("matching_attributes")
+
+    try:
+        default_index = columns.index("*default*")
+        columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
+    except ValueError: pass
+
    # Define matching attributes
    AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000]
-    df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
-    df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)
+    if "age_class" in columns:
+        df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
+        df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)

-    if "income_class" in df_source:
+    if "income_class" in columns:
        df_income = context.stage("synthesis.population.income")[["household_id", "household_income"]]

        df_target = pd.merge(df_target, df_income)
        df_target["income_class"] = INCOME_CLASS[hts](df_target)

-    df_target["any_cars"] = df_target["number_of_vehicles"] > 0
-    df_source["any_cars"] = df_source["number_of_vehicles"] > 0
-
-    columns = ["sex", "any_cars", "age_class", "socioprofessional_class"]
-    if "income_class" in df_source: columns += ["income_class"]
-    columns += ["departement_id"]
+    if "any_cars" in columns:
+        df_target["any_cars"] = df_target["number_of_vehicles"] > 0
+        df_source["any_cars"] = df_source["number_of_vehicles"] > 0

    # Perform statistical matching
    df_source = df_source.rename(columns = { "person_id": "hts_id" })

+    for column in columns:
+        if not column in df_source:
+            raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))
+
+        if not column in df_target:
+            raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))
+
    df_assignment, levels = parallel_statistical_matching(
        context,
        df_source, "hts_id", "person_weight",
12 changes: 10 additions & 2 deletions tests/test_determinism.py
@@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir):
        regions = [10, 11], sampling_rate = 1.0, hts = "entd",
        random_seed = 1000, processes = 1,
        secloc_maximum_iterations = 10,
-        maven_skip_tests = True
+        maven_skip_tests = True,
+        matching_attributes = [
+            "sex", "any_cars", "age_class", "socioprofessional_class",
+            "income_class", "departement_id"
+        ]
    )

    stages = [
@@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir):
        regions = [10, 11], sampling_rate = 1.0, hts = "entd",
        random_seed = 1000, processes = 1,
        secloc_maximum_iterations = 10,
-        maven_skip_tests = True
+        maven_skip_tests = True,
+        matching_attributes = [
+            "sex", "any_cars", "age_class", "socioprofessional_class",
+            "income_class", "departement_id"
+        ]
    )

    stages = [