From 5e0d2f6315337145cefc828b99862a047f3fd989 Mon Sep 17 00:00:00 2001 From: Pablo Arriagada <63430031+paarriagadap@users.noreply.github.com> Date: Thu, 23 Jan 2025 12:05:15 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=8A=20Consolidate=20table=20on=20livin?= =?UTF-8?q?g=20conditions=20vs.=20GDP=20per=20capita=20(#3853)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 📊 Consolidate table on living conditions vs. GDP per capita * :construction: garden * :construction: filter the right indicators * :scroll: explain the goal of the doc * :sparkles: data done * :sparkles: make csv in garden step * :bug: suggestions by Tuna * :bug: typo in column --- dag/main.yml | 15 ++ .../2025-01-16/gdppc_vs_living_conditions.py | 231 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 etl/steps/data/garden/growth/2025-01-16/gdppc_vs_living_conditions.py diff --git a/dag/main.yml b/dag/main.yml index 943a083f3623..0e57e437c654 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -739,6 +739,21 @@ steps: data://grapher/imf/2024-11-25/world_economic_outlook: - data://garden/imf/2024-11-25/world_economic_outlook + # GDP per capita vs. 
"""
This is to create a dataset that contains the GDP per capita and living conditions of countries.

This data is used to create the static chart "How is life at different levels of GDP per capita?", available in this article: https://ourworldindata.org/global-economic-inequality-introduction

Including this in the ETL facilitates creating new versions of the data in the future.

"""

import owid.catalog.processing as pr
from owid.catalog import Dataset, Table
from structlog import get_logger

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Initialize logger
log = get_logger()

# Define most recent year, so we don't process projections
MOST_RECENT_YEAR = 2024


def run(dest_dir: str) -> None:
    """Build the `gdppc_vs_living_conditions` garden dataset.

    Loads one indicator per source dataset, keeps the latest observation per
    country (at most 10 years old), attaches population and OWID continent,
    and saves the result (also as CSV, for use in the static chart).
    """
    #
    # Load inputs.
    #
    # Load garden datasets (one per living-conditions indicator, plus
    # population and regions for the final enrichment steps).
    ds_wdi = paths.load_dataset("wdi")
    ds_un_wpp = paths.load_dataset("un_wpp")
    ds_igme = paths.load_dataset("igme")
    ds_mortality = paths.load_dataset("mortality_database")
    ds_wash = paths.load_dataset("who")
    ds_unwto = paths.load_dataset("unwto")
    ds_pwt = paths.load_dataset("penn_world_table")
    ds_edstats = paths.load_dataset("edstats")
    ds_unesco = paths.load_dataset("education_sdgs")
    ds_happiness = paths.load_dataset("happiness")
    ds_population = paths.load_dataset("population")
    ds_regions = paths.load_dataset("regions")

    # Read the relevant table from each dataset.
    tb_wdi = ds_wdi.read("wdi")
    tb_un_wpp = ds_un_wpp.read("life_expectancy")
    tb_igme = ds_igme.read("igme")
    tb_mortality = ds_mortality.read("mortality_database")
    tb_wash = ds_wash.read("who")
    tb_unwto = ds_unwto.read("unwto")
    tb_pwt = ds_pwt.read("penn_world_table")
    tb_edstats = ds_edstats.read("edstats")
    tb_unesco = ds_unesco.read("education_sdgs")
    tb_happiness = ds_happiness.read("happiness")

    #
    # Process data.
    #
    # Select only the necessary columns and dimensions from the tables.

    # WDI: GDP per capita (constant PPP), physicians, and electricity access.
    tb_wdi = tb_wdi[["country", "year", "ny_gdp_pcap_pp_kd", "sh_med_phys_zs", "eg_elc_accs_zs"]].rename(
        columns={
            "ny_gdp_pcap_pp_kd": "gdp_per_capita",
            "sh_med_phys_zs": "physicians_per_1000_people",
            "eg_elc_accs_zs": "access_to_electricity",
        }
    )

    # UN WPP: life expectancy at birth (both sexes, estimates only — no projections).
    tb_un_wpp = tb_un_wpp[
        (tb_un_wpp["sex"] == "all") & (tb_un_wpp["age"] == 0) & (tb_un_wpp["variant"] == "estimates")
    ].reset_index(drop=True)
    tb_un_wpp = tb_un_wpp[["country", "year", "life_expectancy"]]

    # IGME: under-five mortality rate (total, per 100 live births).
    tb_igme = tb_igme[
        (tb_igme["indicator"] == "Under-five mortality rate")
        & (tb_igme["sex"] == "Total")
        & (tb_igme["wealth_quintile"] == "Total")
        & (tb_igme["unit_of_measure"] == "Deaths per 100 live births")
    ].reset_index(drop=True)
    tb_igme = tb_igme[["country", "year", "obs_value"]].rename(columns={"obs_value": "child_mortality_rate"})

    # Mortality Database: age-standardized death rate from maternal conditions.
    tb_mortality = tb_mortality[
        (tb_mortality["sex"] == "Both sexes")
        & (tb_mortality["age_group"] == "all ages")
        & (tb_mortality["cause"] == "Maternal conditions")
        & (tb_mortality["icd10_codes"] == "O00-O99")
    ].reset_index(drop=True)
    tb_mortality = tb_mortality[
        ["country", "year", "age_standardized_death_rate_per_100_000_standard_population"]
    ].rename(columns={"age_standardized_death_rate_per_100_000_standard_population": "maternal_death_rate"})

    # WHO (WASH): share with access to improved drinking water (total residence).
    tb_wash = tb_wash[tb_wash["residence"] == "Total"].reset_index(drop=True)
    tb_wash = tb_wash[["country", "year", "wat_imp"]].rename(columns={"wat_imp": "access_to_improved_drinking_water"})

    # UNWTO: outbound tourist departures per 1,000 people.
    tb_unwto = tb_unwto[["country", "year", "out_tour_departures_ovn_vis_tourists_per_1000"]].rename(
        columns={"out_tour_departures_ovn_vis_tourists_per_1000": "tourist_departures_per_1000_people"}
    )

    # Penn World Table: average annual working hours per worker.
    tb_pwt = tb_pwt[["country", "year", "avh"]].rename(columns={"avh": "average_working_hours"})

    # Edstats: learning-adjusted years of school and harmonized test scores.
    tb_edstats = tb_edstats[["country", "year", "learning_adjusted_years_of_school", "harmonized_test_scores"]]

    # UNESCO: adult literacy rate (population 15+, both sexes).
    tb_unesco = tb_unesco[
        ["country", "year", "adult_literacy_rate__population_15plus_years__both_sexes__pct__lr_ag15t99"]
    ].rename(
        columns={"adult_literacy_rate__population_15plus_years__both_sexes__pct__lr_ag15t99": "adult_literacy_rate"}
    )

    # Happiness: Cantril ladder score.
    tb_happiness = tb_happiness[["country", "year", "cantril_ladder_score"]].rename(
        columns={"cantril_ladder_score": "happiness_score"}
    )

    # Merge all the tables on country-year, keeping all rows from every source.
    tb = pr.multi_merge(
        [tb_wdi, tb_un_wpp, tb_igme, tb_mortality, tb_wash, tb_unwto, tb_pwt, tb_edstats, tb_unesco, tb_happiness],
        on=["country", "year"],
        how="outer",
    )

    # Add population (used for sizing points in the chart).
    tb = geo.add_population_to_table(tb=tb, ds_population=ds_population, warn_on_missing_countries=False)

    # Collapse to one row per country, keeping each indicator's latest value.
    tb = select_most_recent_data(tb)

    # Attach the OWID continent of each country.
    tb = add_regions_columns(tb, ds_regions)

    tb = tb.format(["country"], short_name="gdppc_vs_living_conditions")

    #
    # Save outputs.
    #
    # Create a new garden dataset, borrowing default metadata from the WDI
    # dataset, and additionally export it as CSV for the static chart.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_wdi.metadata, formats=["csv"]
    )

    # Save changes in the new garden dataset.
    ds_garden.save()


def select_most_recent_data(tb: Table) -> Table:
    """
    Select the most recent data for each indicator and country in the table.

    For each indicator independently: drop missing values, cap years at
    MOST_RECENT_YEAR (to exclude projections), keep only observations at most
    10 years old, and take the latest remaining year per country. The per-
    indicator results are then outer-merged on country, so the year of each
    value may differ across indicators; the year column itself is dropped.
    """

    # Sort newest-first so that groupby(...).first() below picks the latest year.
    tb = tb.sort_values(by=["country", "year"], ascending=False).reset_index(drop=True)

    # Define the columns that are indicators (the columns that are not country or year)
    indicators = tb.columns.difference(["country", "year"]).tolist()

    tb_list = []

    for indicator in indicators:
        tb_indicator = tb[["country", "year", indicator]].copy()

        # Drop rows with missing values
        tb_indicator = tb_indicator.dropna(subset=[indicator]).reset_index(drop=True)

        # Define latest year in the dataset
        latest_year = tb_indicator["year"].max()

        if latest_year > MOST_RECENT_YEAR:
            log.warning(
                f"Indicator {indicator} has data until year {latest_year}, which is later than {MOST_RECENT_YEAR}. We keep only data until {MOST_RECENT_YEAR}."
            )

        # Drop rows with year higher than MOST_RECENT_YEAR
        tb_indicator = tb_indicator[tb_indicator["year"] <= MOST_RECENT_YEAR].reset_index(drop=True)

        # Select all the rows where the data is at most 10 years old (MOST_RECENT_YEAR - 10)
        tb_indicator = tb_indicator[tb_indicator["year"] >= MOST_RECENT_YEAR - 10].reset_index(drop=True)

        # For each country, select the row with the latest year (rows are sorted newest-first).
        tb_indicator = tb_indicator.groupby("country").first().reset_index()

        # Calculate latest year again and earliest year
        latest_year = tb_indicator["year"].max()
        earliest_year = tb_indicator["year"].min()

        log.info(f"The indicator {indicator} ranges between {earliest_year} and {latest_year}.")

        # Drop year column
        tb_indicator = tb_indicator.drop(columns=["year"])

        tb_list.append(tb_indicator)

    tb = pr.multi_merge(tb_list, on=["country"], how="outer")

    return tb


def add_regions_columns(tb: Table, ds_regions: Dataset) -> Table:
    """
    Add region columns to the table.

    Maps each country to its OWID continent (North America, South America,
    Europe, Africa, Asia, Oceania) via the regions dataset, and drops rows
    whose country belongs to none of those continents (e.g. aggregates).
    """

    tb_regions = geo.create_table_of_regions_and_subregions(ds_regions=ds_regions)

    # Explode the regions table to have one row per country
    tb_regions = tb_regions.explode("members").reset_index(drop=False)

    # Select OWID regions
    tb_regions = tb_regions[
        tb_regions["region"].isin(["North America", "South America", "Europe", "Africa", "Asia", "Oceania"])
    ].reset_index(drop=True)

    # Merge the regions table with the table
    tb = pr.merge(
        tb,
        tb_regions,
        left_on="country",
        right_on="members",
        how="left",
    )

    # Delete the members column
    tb = tb.drop(columns=["members"])

    # Keep only the rows where region is not missing
    tb = tb.dropna(subset=["region"]).reset_index(drop=True)

    return tb