From fe9f6de70f01825df3f9f429acb2e74d05696468 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 17 Dec 2024 20:10:11 +0100 Subject: [PATCH 01/15] =?UTF-8?q?=F0=9F=93=8A=20effective=20fertility=20ra?= =?UTF-8?q?te?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 203d4080ebea74116135c6a454bf9fbd934576c4 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 17 Dec 2024 22:11:23 +0100 Subject: [PATCH 02/15] wip --- dag/demography.yml | 17 +++++- .../efr_malani_jacob.countries.json | 2 + .../efr_malani_jacob.excluded_countries.json | 2 + .../2024-12-17/efr_malani_jacob.meta.yml | 58 +++++++++++++++++++ .../demography/2024-12-17/efr_malani_jacob.py | 37 ++++++++++++ .../demography/2024-12-17/efr_malani_jacob.py | 28 +++++++++ 6 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json create mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json create mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml create mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py create mode 100644 etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py diff --git a/dag/demography.yml b/dag/demography.yml index 9fc85049335..77b65326aa1 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -263,7 +263,7 @@ steps: - data://garden/hmd/2024-12-01/hmd data://grapher/hmd/2024-12-03/hmd_country: - data://garden/hmd/2024-12-03/hmd_country - + # Patternal ages (Kai & Klüssner) data://meadow/demography/2024-12-05/paternal_ages: - snapshot://demography/2024-12-04/paternal_ages.rdata @@ -271,11 +271,22 @@ steps: - data://meadow/demography/2024-12-05/paternal_ages data://grapher/demography/2024-12-05/paternal_ages: - data://garden/demography/2024-12-05/paternal_ages - + + # + # Effective Fertility Rate (Malani & Jacob) + # + 
data://garden/demography/2024-12-17/efr_malani_jacob: + - data://garden/demography/2024-12-03/life_tables + - data://garden/un/2024-12-02/un_wpp_lt + - data://garden/un/2024-07-12/un_wpp + - data://garden/hmd/2024-12-01/hmd + - data://garden/hmd/2024-11-19/hfd + data://grapher/demography/2024-12-17/efr_malani_jacob: + - data://garden/demography/2024-12-17/efr_malani_jacob + ######################################################################## # OTHERS ######################################################################## - # Wittgenstein Centre (Projections) data://meadow/demography/2024-12-06/wittgenstein_human_capital_proj: - snapshot://demography/2024-12-06/wittgenstein_human_capital.zip diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json @@ -0,0 +1,2 @@ +{ +} diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json new file mode 100644 index 00000000000..0d4f101c7a3 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json @@ -0,0 +1,2 @@ +[ +] diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml new file mode 100644 index 00000000000..9ec97ed28e5 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -0,0 +1,58 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Fertility Rate + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + efr_malani_jacob: + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description_short: Short description of testing variable. + # description_processing: Description of processing of testing variable. + # description_key: List of key points about the indicator. + # description_from_producer: Description of testing variable from producer. + # processing_level: minor + # type: + # sort: + # presentation: + # attribution: + # attribution_short: + # faqs: + # grapher_config: + # title_public: + # title_variant: + # topic_tags: + # display: + # name: Testing variable + # numDecimalPlaces: 0 + # tolerance: 0 + # color: + # conversionFactor: 1 + # description: + # entityAnnotationsMap: Test annotation + # includeInTable: + # isProjection: false + # unit: arbitrary units + # shortUnit: au + # tableDisplay: + # hideAbsoluteChange: + # hideRelativeChange: + # yearIsDay: false + # zeroDay: + # roundingMode: + # numSignificantFigures: + # + {} + diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py new file mode 100644 index 00000000000..c7b6fc0415d --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -0,0 +1,37 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("un_wpp_lt") + + # Read table from meadow dataset. 
+ tb = ds_meadow.read("efr_malani_jacob") + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py new file mode 100644 index 00000000000..8fb82651c03 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("efr_malani_jacob") + + # Read table from garden dataset. + tb = ds_garden.read("efr_malani_jacob", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() From aaf3d549f9d6c45a1436eca86b48efa17b802d0c Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 17 Dec 2024 23:05:48 +0100 Subject: [PATCH 03/15] wip --- .../garden/demography/2024-12-17/efr_malani_jacob.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index c7b6fc0415d..35ea2e1eab3 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -12,10 +12,12 @@ def run(dest_dir: str) -> None: # Load inputs. # # Load meadow dataset. - ds_meadow = paths.load_dataset("un_wpp_lt") + ds_un = paths.load_dataset("un_wpp_lt") + ds_hmd = paths.load_dataset("hmd") + ds_hfd = paths.load_dataset("hfd") # Read table from meadow dataset. - tb = ds_meadow.read("efr_malani_jacob") + tb = ds_un.read("efr_malani_jacob") # # Process data. @@ -30,7 +32,9 @@ def run(dest_dir: str) -> None: # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + dest_dir, + tables=[tb], + check_variables_metadata=True, ) # Save changes in the new garden dataset. 
From fddcf27beeccffbece108a754878ffcfc8e00f96 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 17 Dec 2024 23:26:01 +0100 Subject: [PATCH 04/15] wip: LT projections by WPP --- snapshots/un/2024-12-02/un_wpp_lt.py | 37 ++++++++++++++----- .../un/2024-12-02/un_wpp_lt_proj_all.csv.dvc | 36 ++++++++++++++++++ .../un/2024-12-02/un_wpp_lt_proj_f.csv.dvc | 36 ++++++++++++++++++ .../un/2024-12-02/un_wpp_lt_proj_m.csv.dvc | 36 ++++++++++++++++++ 4 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 snapshots/un/2024-12-02/un_wpp_lt_proj_all.csv.dvc create mode 100644 snapshots/un/2024-12-02/un_wpp_lt_proj_f.csv.dvc create mode 100644 snapshots/un/2024-12-02/un_wpp_lt_proj_m.csv.dvc diff --git a/snapshots/un/2024-12-02/un_wpp_lt.py b/snapshots/un/2024-12-02/un_wpp_lt.py index 41b239039cc..79c9bac2ce0 100644 --- a/snapshots/un/2024-12-02/un_wpp_lt.py +++ b/snapshots/un/2024-12-02/un_wpp_lt.py @@ -5,16 +5,20 @@ To download this files: 1. Go to the CSV Format section of UN WPP page: https://population.un.org/wpp/Download/Standard/CSV/ - 2. Download the Life Tables ZIP files with the estimates (1950-2021): + 2. 
Download the Life Tables ZIP files with the estimates (1950-2023): - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Both_1950-2023.csv.gz - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Female_1950-2023.csv.gz - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Male_1950-2023.csv.gz + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Both_2024-2100.csv.gz + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Female_2024-2100.csv.gz + - https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Life_Table_Complete_Medium_Male_2024-2100.csv.gz 3. Run the snapshot script and wait for it to be ingested into S3: python snapshots/un/2023-10-02/un_wpp_lt.py --path-to-file-all /path/WPP2022_Life_Table_Complete_Medium_Both_1950-2021.zip --path-to-file-f path/WPP2022_Life_Table_Complete_Medium_Female_1950-2021.zip --path-to-file-m path/WPP2022_Life_Table_Complete_Medium_Male_1950-2021.zip """ from pathlib import Path +from typing import Optional import click @@ -26,21 +30,36 @@ @click.command() @click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") -@click.option("--path-to-file-all", prompt=True, type=str, help="Path to local data file (both sexes).") -@click.option("--path-to-file-f", prompt=True, type=str, help="Path to local data file (female).") -@click.option("--path-to-file-m", prompt=True, type=str, help="Path to local data file (male).") -def main(path_to_file_all: str, path_to_file_f: str, path_to_file_m: str, upload: bool) -> None: +@click.option("--path-to-file-all", "-a", type=str, help="Path to local data file (both sexes).") +@click.option("--path-to-file-f", 
"-f", type=str, help="Path to local data file (female).") +@click.option("--path-to-file-m", "-m", type=str, help="Path to local data file (male).") +@click.option("--path-to-file-proj-all", "-pa", type=str, help="Path to local data file (projections, both sexes).") +@click.option("--path-to-file-proj-f", "-pf", type=str, help="Path to local data file (projections, female).") +@click.option("--path-to-file-proj-m", "-pm", type=str, help="Path to local data file (projections, male).") +def main( + path_to_file_all: Optional[str], + path_to_file_f: Optional[str], + path_to_file_m: Optional[str], + path_to_file_proj_all: Optional[str], + path_to_file_proj_f: Optional[str], + path_to_file_proj_m: Optional[str], + upload: bool, +) -> None: snaps = [ ("un_wpp_lt_all", path_to_file_all), # ALL ("un_wpp_lt_f", path_to_file_f), # FEMALE ("un_wpp_lt_m", path_to_file_m), # MALE + ("un_wpp_lt_proj_all", path_to_file_proj_all), # PROJECTIONS, ALL + ("un_wpp_lt_proj_f", path_to_file_proj_f), # PROJECTIONS, FEMALE + ("un_wpp_lt_proj_m", path_to_file_proj_m), # PROJECTIONS, MALE ] for snap_props in snaps: - # Create a new snapshot. - snap = Snapshot(f"un/{SNAPSHOT_VERSION}/{snap_props[0]}.csv") - # Copy local data file to snapshots data folder, add file to DVC and upload to S3. - snap.create_snapshot(filename=snap_props[1], upload=upload) + if snap_props[1] is not None: + # Create a new snapshot. + snap = Snapshot(f"un/{SNAPSHOT_VERSION}/{snap_props[0]}.csv") + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=snap_props[1], upload=upload) if __name__ == "__main__": diff --git a/snapshots/un/2024-12-02/un_wpp_lt_proj_all.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_proj_all.csv.dvc new file mode 100644 index 00000000000..d1033312bed --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_proj_all.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas. + + For each revision, any new, recent, and historical, information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or areas from 1950 to today + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is more than 15 years ago. 
+ date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Medium projections, Both sexes) + description_snapshot: |- + Provides single-age life tables up to age 100 for both sexes projected to 2024-2100 using Medium scenario. It contains a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. 
+ attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-17 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ +outs: + - md5: 708b52307dd4e2e347deeb6ccba6d0ea + size: 203785578 + path: un_wpp_lt_proj_all.csv diff --git a/snapshots/un/2024-12-02/un_wpp_lt_proj_f.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_proj_f.csv.dvc new file mode 100644 index 00000000000..1ec3008a544 --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_proj_f.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas. + + For each revision, any new, recent, and historical, information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or areas from 1950 to today + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). 
For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is more than 15 years ago. + date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Medium projections, Female) + description_snapshot: |- + Provides single-age life tables up to age 100 for females projected to 2024-2100 using Medium scenario. It contains a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. 
+ attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-17 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ +outs: + - md5: 5d04540b7db63a277bebb2f060f1161d + size: 203000798 + path: un_wpp_lt_proj_f.csv diff --git a/snapshots/un/2024-12-02/un_wpp_lt_proj_m.csv.dvc b/snapshots/un/2024-12-02/un_wpp_lt_proj_m.csv.dvc new file mode 100644 index 00000000000..86dc3b987bf --- /dev/null +++ b/snapshots/un/2024-12-02/un_wpp_lt_proj_m.csv.dvc @@ -0,0 +1,36 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Population Prospects + description: |- + The World Population Prospects 2024 is the 28th edition of the official estimates and projections of the global population published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality, and international migration for 237 countries or areas. + + For each revision, any new, recent, and historical, information that has become available from population censuses, vital registration of births and deaths, and household surveys is considered to produce consistent time series of population estimates for each country or areas from 1950 to today + + For the estimation period between 1950 and 2023, data from 1,910 censuses were considered in the present evaluation, which is 79 more than the 2022 revision. In some countries, population registers based on administrative data systems provide the necessary information. Population data from censuses or registers referring to 2019 or later were available for 114 countries or areas, representing 48 per cent of the 237 countries or areas included in this analysis (and 54 per cent of the world population). 
For 43 countries or areas, the most recent available population count was from the period 2014-2018, and for another 57 locations from the period 2009-2013. For the remaining 23 countries or areas, the most recent available census data were from before 2009, that is more than 15 years ago. + date_published: 2024-07-11 + title_snapshot: World Population Prospects - Life Tables (Medium projections, Male) + description_snapshot: |- + Provides single-age life tables up to age 100 for males projected to 2024-2100 using Medium scenario. It contains a set of values showing the mortality experience of a hypothetical group of infants born at the same time and subject throughout their lifetime to the specific mortality rates of a given year. The following series are provided: age-specific mortality rates (mx), probabilities of dying (qx), probabilities of surviving (px), number surviving (lx), number dying (dx), number of person-years lived (Lx), survivorship ratios (Sx), cumulative stationary population (Tx), average remaining life expectancy (ex) and average number of years lived (ax). + + # Citation + producer: United Nations + citation_full: |- + United Nations, Department of Economic and Social Affairs, Population Division (2024). World Population Prospects 2024, Online Edition. 
+ attribution: UN, World Population Prospects (2024) + attribution_short: UN WPP + + # Files + url_main: https://population.un.org/wpp/Download/ + date_accessed: 2024-12-17 + + # License + license: + name: CC BY 3.0 IGO + url: https://population.un.org/wpp/Download/Standard/MostUsed/ +outs: + - md5: e30f9037f5c2af6ddb72bacd18cf3e78 + size: 204285181 + path: un_wpp_lt_proj_m.csv From 68b61fb0784b4d1b5763f3d8191c0750afb6b930 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 18 Dec 2024 17:56:09 +0100 Subject: [PATCH 05/15] add projections --- dag/demography.yml | 3 ++ .../garden/un/2024-12-02/un_wpp_lt.meta.yml | 4 +- .../data/garden/un/2024-12-02/un_wpp_lt.py | 43 +++++++++++++------ .../data/meadow/un/2024-12-02/un_wpp_lt.py | 5 ++- 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/dag/demography.yml b/dag/demography.yml index 77b65326aa1..d0e8e54c212 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -145,6 +145,9 @@ steps: - snapshot://un/2024-12-02/un_wpp_lt_m.csv - snapshot://un/2024-12-02/un_wpp_lt_all.csv - snapshot://un/2024-12-02/un_wpp_lt_f.csv + - snapshot://un/2024-12-02/un_wpp_lt_proj_m.csv + - snapshot://un/2024-12-02/un_wpp_lt_proj_all.csv + - snapshot://un/2024-12-02/un_wpp_lt_proj_f.csv data://garden/un/2024-12-02/un_wpp_lt: - data://meadow/un/2024-12-02/un_wpp_lt diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml index c7b3d363bb8..d1cc6955062 100644 --- a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml +++ b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml @@ -24,7 +24,7 @@ dataset: # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ tables: - un_wpp_lt: + un_wpp_lt: &table_metadata variables: central_death_rate: title: Central death rate @@ -102,3 +102,5 @@ tables: <%- else -%> It refers to the remaining life expectancy for people who have already survived to the given 
age. <%- endif -%> + + un_wpp_lt_proj: *table_metadata diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py index 21b1a4d92ba..29af30caabd 100644 --- a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py +++ b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py @@ -12,12 +12,7 @@ "time": "year", "agegrpstart": "age", } -COLUMNS_INDEX = [ - "location", - "year", - "sex", - "age", -] +COLUMNS_INDEX = ["location", "year", "sex", "age", "variant"] COLUMNS_INDICATORS = [ "central_death_rate", "probability_of_death", @@ -30,6 +25,8 @@ "life_expectancy", "average_survival_length", ] +# Year threshold for projections +YEAR_PROJ_START = 2024 def run(dest_dir: str) -> None: @@ -43,12 +40,16 @@ def run(dest_dir: str) -> None: paths.log.info("load tables, concatenate.") tb = pr.concat( [ - ds_meadow["un_wpp_lt_all"].reset_index(), - ds_meadow["un_wpp_lt_f"].reset_index(), - ds_meadow["un_wpp_lt_m"].reset_index(), + ds_meadow.read("un_wpp_lt_all"), + ds_meadow.read("un_wpp_lt_f"), + ds_meadow.read("un_wpp_lt_m"), + ds_meadow.read("un_wpp_lt_proj_all"), + ds_meadow.read("un_wpp_lt_proj_f"), + ds_meadow.read("un_wpp_lt_proj_m"), ], short_name=paths.short_name, - ).reset_index() + ignore_index=True, + ) # # Process data. @@ -64,7 +65,7 @@ def run(dest_dir: str) -> None: # DTypes tb = tb.astype( { - "age": str, + "age": "string", } ) @@ -79,21 +80,35 @@ def run(dest_dir: str) -> None: # Harmonize country names. paths.log.info("harmonise country names.") - tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path, country_col="location") + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + country_col="location", + ) # Harmonize sex sex tb["sex"] = tb["sex"].map({"Total": "total", "Male": "male", "Female": "female"}) assert tb["sex"].notna().all(), "NaNs detected after mapping sex values!" 
+ # Historical and Projection-only tables + tb_hist = tb.loc[tb["year"] < YEAR_PROJ_START] + tb_future = tb.loc[tb["year"] >= YEAR_PROJ_START] + # Set index - tb = tb.set_index(COLUMNS_INDEX, verify_integrity=True)[COLUMNS_INDICATORS] + tables = [ + tb_hist.format(COLUMNS_INDEX, short_name="un_wpp_lt"), + tb_future.format(COLUMNS_INDEX, short_name="un_wpp_lt_proj"), + ] # # Save outputs. # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. diff --git a/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py index 9535957eedc..9b1799650d4 100644 --- a/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py +++ b/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py @@ -31,6 +31,9 @@ def run(dest_dir: str) -> None: "un_wpp_lt_all", # ALL "un_wpp_lt_f", # FEMALE "un_wpp_lt_m", # MALE + "un_wpp_lt_proj_all", # PROJECTIONS, ALL + "un_wpp_lt_proj_f", # PROJECTIONS, FEMALE + "un_wpp_lt_proj_m", # PROJECTIONS, MALE ] tables = [] @@ -53,7 +56,7 @@ def run(dest_dir: str) -> None: tb["LocTypeName"].isin(["Geographic region", "Income group", "Country/Area", "World", "Development group"]) ] # Set index - tb = tb.format(["location", "time", "sex", "agegrp", "loctypename"]) + tb = tb.format(["location", "time", "sex", "agegrp", "loctypename", "variant"]) # Add to tables list tables.append(tb) From 95f45f9734481965da46377d2db81b165d6fb412 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 19 Dec 2024 17:55:02 +0100 Subject: [PATCH 06/15] wip --- dag/demography.yml | 2 - .../demography/2024-12-17/efr_malani_jacob.py | 44 ++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/dag/demography.yml b/dag/demography.yml index 
3acd9cf2449..74c4b6b10b4 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -279,11 +279,9 @@ steps: # Effective Fertility Rate (Malani & Jacob) # data://garden/demography/2024-12-17/efr_malani_jacob: - - data://garden/demography/2024-12-03/life_tables - data://garden/un/2024-12-02/un_wpp_lt - data://garden/un/2024-07-12/un_wpp - data://garden/hmd/2024-12-01/hmd - - data://garden/hmd/2024-11-19/hfd data://grapher/demography/2024-12-17/efr_malani_jacob: - data://garden/demography/2024-12-17/efr_malani_jacob diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index 35ea2e1eab3..f0e8cc3eb07 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -1,21 +1,63 @@ """Load a meadow dataset and create a garden dataset.""" +import pandas as pd +from owid.catalog import processing as pr + from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +COLUMNS_UN = ["location", "year", "age", "sex", "probability_of_survival"] + def run(dest_dir: str) -> None: # # Load inputs. # # Load meadow dataset. 
- ds_un = paths.load_dataset("un_wpp_lt") + ds_un_lt = paths.load_dataset("un_wpp_lt") + ds_un_wpp = paths.load_dataset("un_wpp") + ds_hmd = paths.load_dataset("hmd") ds_hfd = paths.load_dataset("hfd") + # Load tables + tb_un = ds_un_lt.read("un_wpp_lt") + tb_un_proj = ds_un_lt.read("un_wpp_lt_proj") + + # Concatenate + tb_un = pr.concat([tb_un, tb_un_proj], ignore_index=True) + + # Filter 'total' and 'female' + tb_un = tb_un.loc[tb_un["sex"].isin(["total", "female"]), COLUMNS_UN] + + # Dtypes + tb_un["age"] = tb_un["age"].str.replace("100+", "100").astype("UInt16") + + # Scale + tb_un["probability_of_survival"] /= 100 + + # Cumulative product + # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age. + # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1). + # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we create year_born. + # After that, we just do the cumulative product for each year_born. + # Note that for the cumulative product to make sense, we need to first sort table by age! + # Step 1: Create year_born + tb_un["year_born"] = tb_un["year"] - tb_un["age"] + # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows) + tb_un = tb_un.loc[(tb_un["year_born"] >= 1950) & (tb_un["year_born"] <= 2023)] + # Step 3: Sort by age + tb_un = tb_un.sort_values(["location", "year_born", "sex", "age"]) + # Step 4: Estimate cumulative survival probability + tb_un["cumulative_survival"] = tb_un.groupby(["location", "sex", "year_born"])["probability_of_survival"].cumprod() + # Step 5: Keep only years of interest (15-65), further reduction of 50% rows + tb_un = tb_un.loc[(tb_un["age"] >= 15) & (tb_un["age"] <= 65)] + # Step 6: Drop columns + tb_un = tb_un.drop(columns=["year_born"]) + # Read table from meadow dataset. 
tb = ds_un.read("efr_malani_jacob") From fae53b314f3dcc67b4821db227b2a9130646b653 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 19 Dec 2024 23:46:30 +0100 Subject: [PATCH 07/15] wip --- .../efr_malani_jacob.countries.json | 2 - .../efr_malani_jacob.excluded_countries.json | 2 - .../2024-12-17/efr_malani_jacob.meta.yml | 104 ++++++++------ .../demography/2024-12-17/efr_malani_jacob.py | 133 ++++++++++++------ .../demography/2024-12-17/efr_malani_jacob.py | 4 +- 5 files changed, 155 insertions(+), 90 deletions(-) delete mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json delete mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json deleted file mode 100644 index 2c63c085104..00000000000 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json +++ /dev/null @@ -1,2 +0,0 @@ -{ -} diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json deleted file mode 100644 index 0d4f101c7a3..00000000000 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json +++ /dev/null @@ -1,2 +0,0 @@ -[ -] diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index 9ec97ed28e5..6b2b8dc953d 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -1,58 +1,76 @@ # NOTE: To learn more about the fields, hover over their names. 
definitions: common: + description_key: [] presentation: + grapher_config: none topic_tags: - Fertility Rate - # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 - tables: - efr_malani_jacob: + un: variables: - # testing_variable: - # title: Testing variable title - # unit: arbitrary units - # short_unit: au - # description_short: Short description of testing variable. - # description_processing: Description of processing of testing variable. - # description_key: List of key points about the indicator. - # description_from_producer: Description of testing variable from producer. - # processing_level: minor - # type: - # sort: - # presentation: - # attribution: - # attribution_short: - # faqs: - # grapher_config: - # title_public: - # title_variant: - # topic_tags: - # display: - # name: Testing variable - # numDecimalPlaces: 0 - # tolerance: 0 - # color: - # conversionFactor: 1 - # description: - # entityAnnotationsMap: Test annotation - # includeInTable: - # isProjection: false - # unit: arbitrary units - # shortUnit: au - # tableDisplay: - # hideAbsoluteChange: - # hideRelativeChange: - # yearIsDay: false - # zeroDay: - # roundingMode: - # numSignificantFigures: - # - {} + efr_repr: + title: Reproductive Effective Fertility rate (scaled by sex ratio) + description_short: |- + The number of daughters that live long enough to reproduce, between ages 15 and 49. This focuses on daughters, not all children because only females reproduce. Because a child need not live until age 49 to reproduce, we approximate efr_r by taking the average of efr over all reproductive ages (15-49). + unit: "children per women" + description_processing: |- + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. 
+ + We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age. + + The Reproductive Effective Fertility rate (EFR) is the average of the EFR over all reproductive ages (15-49). + + Note that the Reproductive Effective Fertility rate (EFR) is an approximation of the number of daughters, so it uses the total fertility rate of female children, or equivalently, the TFR weighted by the sex ratio at birth. + + So we have that: EFR_repr = (TFR * mean(EFR)) / (1 + SRB), where SRB is the male-to-female ratio and the mean is taken over all reproductive ages (15-49). + + This indicator is scaled by the sex ratio to allow easy comparability with the Total Fertility Rate (TFR) and the Labor Effective Fertility rate (EFR_labor). + + Read more details in the author's paper: https://www.nber.org/papers/w33175 + + efr_labor: + title: Labor Effective Fertility rate + description_short: |- + The number of children born in a year who will live long enough to earn labor income. This is approximated this by taking the average of Effective Fertility rate (EFR) over all working ages (15-65). + unit: "children per women" + description_processing: |- + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age age from 0 to 65. E.g. the probability of a person born in 2000 to reach age 15, 16, 17, ..., 65. + + We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a women that will live long enough to reach that age. + + The Labor Effective Fertility rate (EFR) is the average of the EFR over all labor ages (15-65). 
+ + So we have that: EFR_labor = mean(EFR) = TFR * mean(cumulative survival probability), where the mean is taken over all labor ages (15-65). + + Read more details in the authors' paper: https://www.nber.org/papers/w33175 + + cumulative_survival_repr: + title: Cumulative survival probability to reproductive age + description_short: |- + The probability that a person born in a given year will live long enough to reach reproductive age (15-49). + description_processing: |- + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. + + This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc. + + Read more details in the authors' paper: https://www.nber.org/papers/w33175 + unit: "" + + cumulative_survival_labor: + title: Cumulative survival probability to labor age + description_short: |- + The probability that a person born in a given year will live long enough to reach labor age (15-65). + description_processing: |- + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 65. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 65. + + This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc.
+ Read more details in the author's paper: https://www.nber.org/papers/w33175 + unit: "" diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index f0e8cc3eb07..36a48b1e5a4 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -1,6 +1,5 @@ """Load a meadow dataset and create a garden dataset.""" -import pandas as pd from owid.catalog import processing as pr from etl.data_helpers import geo @@ -9,7 +8,14 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) -COLUMNS_UN = ["location", "year", "age", "sex", "probability_of_survival"] +COLUMNS_UN = ["country", "year", "age", "sex", "probability_of_survival"] +# Years +YEAR_UN_START = 1950 +YEAR_UN_END = 2023 +# Ages +AGE_LAB_START = 15 +AGE_REPR_END = 49 +AGE_LAB_END = 65 def run(dest_dir: str) -> None: @@ -20,64 +26,109 @@ def run(dest_dir: str) -> None: ds_un_lt = paths.load_dataset("un_wpp_lt") ds_un_wpp = paths.load_dataset("un_wpp") - ds_hmd = paths.load_dataset("hmd") - ds_hfd = paths.load_dataset("hfd") + # ds_hmd = paths.load_dataset("hmd") + # ds_hfd = paths.load_dataset("hfd") # Load tables tb_un = ds_un_lt.read("un_wpp_lt") tb_un_proj = ds_un_lt.read("un_wpp_lt_proj") + tb_tfr = ds_un_wpp.read("fertility_rate") - # Concatenate - tb_un = pr.concat([tb_un, tb_un_proj], ignore_index=True) + # Estimate cumulative survival in UN LT tables + tb_un = estimate_un_cum_survival( + tb=tb_un, + tb_proj=tb_un_proj, + ) - # Filter 'total' and 'female' - tb_un = tb_un.loc[tb_un["sex"].isin(["total", "female"]), COLUMNS_UN] + # Filter TFR table + tb_tfr = tb_tfr.loc[ + (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), + ["country", "year", "fertility_rate"], + ] - # Dtypes - tb_un["age"] = tb_un["age"].str.replace("100+", "100").astype("UInt16") + # Add TFR 
+ tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") - # Scale - tb_un["probability_of_survival"] /= 100 + # Estimate EFR + tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] - # Cumulative product - # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age. - # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1). - # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we create year_born. - # After that, we just do the cumulative product for each year_born. - # Note that for the cumulative product to make sense, we need to first sort table by age! - # Step 1: Create year_born - tb_un["year_born"] = tb_un["year"] - tb_un["age"] - # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows) - tb_un = tb_un.loc[(tb_un["year_born"] >= 1950) & (tb_un["year_born"] <= 2023)] - # Step 3: Sort by age - tb_un = tb_un.sort_values(["location", "year_born", "sex", "age"]) - # Step 4: Estimate cumulative survival probability - tb_un["cumulative_survival"] = tb_un.groupby(["location", "sex", "year_born"])["probability_of_survival"].cumprod() - # Step 5: Keep only years of interest (15-65), further reduction of 50% rows - tb_un = tb_un.loc[(tb_un["age"] >= 15) & (tb_un["age"] <= 65)] - # Step 6: Drop columns - tb_un = tb_un.drop(columns=["year_born"]) + # Estimate metrics + ## EFR-labor: Average number of daughters that make it to the reproductive age (15-49) + ## EFR-reproductive: Average number of kids that make it to the labour age (15-65) + ## Cum survival prob, labor: Probability of a girl to survive to the reproductive age (15-49) + ## Cum survival prob, reproductive: Probability of a kid to survive to the labor age (15-65) + tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] + tb_un = 
tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() - # Read table from meadow dataset. - tb = ds_un.read("efr_malani_jacob") + # Pivot + tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() - # - # Process data. - # - tb = geo.harmonize_countries( - df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path - ) - tb = tb.format(["country", "year"]) + def rename_col(colname): + mapping = { + "female": "repr", + "total": "labor", + } + + if colname[1] == "": + return colname[0] + else: + return f"{colname[0]}_{mapping.get(colname[1])}" + + tb_un.columns = [rename_col(col) for col in tb_un.columns] + + # Format + tb_un = tb_un.format(["country", "year"], short_name="un") + + tables = [ + tb_un, + ] - # # Save outputs. # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( dest_dir, - tables=[tb], + tables=tables, check_variables_metadata=True, ) # Save changes in the new garden dataset. ds_garden.save() + + +def estimate_un_cum_survival(tb, tb_proj): + # Concatenate + tb = pr.concat([tb, tb_proj], ignore_index=True) + + # Rename columns + tb = tb.rename(columns={"location": "country"}) + + # Filter 'total' and 'female' + tb = tb.loc[tb["sex"].isin(["total", "female"]), COLUMNS_UN] + + # Dtypes + tb["age"] = tb["age"].str.replace("100+", "100").astype("UInt16") + + # Scale + tb["probability_of_survival"] /= 100 + + # Cumulative product + # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age. + # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1). + # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we create year_born. + # After that, we just do the cumulative product for each year_born. 
+ # Note that for the cumulative product to make sense, we need to first sort table by age! + # Step 1: Replace year with "cohort year" + tb["year"] = tb["year"] - tb["age"] + # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows) + tb = tb.loc[(tb["year"] >= YEAR_UN_START) & (tb["year"] <= YEAR_UN_END)] + # Step 3: Sort by age, so we can do the cumulative product later + tb = tb.sort_values(["country", "sex", "year", "age"], ignore_index=True) + # Step 4: Estimate cumulative survival probability + tb["cumulative_survival"] = tb.groupby(["country", "sex", "year"])["probability_of_survival"].cumprod() + # Step 5: Keep only years of interest (15-65), further reduction of 65% rows (aggregate -83%) + tb = tb.loc[(tb["age"] >= AGE_LAB_START) & (tb["age"] <= AGE_LAB_END)] + # # Step 6: Drop columns + # tb = tb.drop(columns=["year_born"]) + + return tb diff --git a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py index 8fb82651c03..f805b3f04b9 100644 --- a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py @@ -14,14 +14,14 @@ def run(dest_dir: str) -> None: ds_garden = paths.load_dataset("efr_malani_jacob") # Read table from garden dataset. - tb = ds_garden.read("efr_malani_jacob", reset_index=False) + tables = list(ds_garden) # # Save outputs. # # Create a new grapher dataset with the same metadata as the garden dataset. ds_grapher = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata ) # Save changes in the new grapher dataset. 
From 6c6d0c01afe6cc6b1cefc8cf4f0a33de34fa4336 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 19 Dec 2024 23:53:26 +0100 Subject: [PATCH 08/15] wip --- .../garden/demography/2024-12-17/efr_malani_jacob.meta.yml | 3 +-- .../data/garden/demography/2024-12-17/efr_malani_jacob.py | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index 6b2b8dc953d..bf8cf8a1203 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -1,9 +1,7 @@ # NOTE: To learn more about the fields, hover over their names. definitions: common: - description_key: [] presentation: - grapher_config: none topic_tags: - Fertility Rate @@ -11,6 +9,7 @@ definitions: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 + title: "Effective Fertility Rates (Malani and Jacob)" tables: un: diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index 36a48b1e5a4..86a2c896c85 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -2,7 +2,6 @@ from owid.catalog import processing as pr -from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. 
@@ -30,9 +29,9 @@ def run(dest_dir: str) -> None: # ds_hfd = paths.load_dataset("hfd") # Load tables - tb_un = ds_un_lt.read("un_wpp_lt") - tb_un_proj = ds_un_lt.read("un_wpp_lt_proj") - tb_tfr = ds_un_wpp.read("fertility_rate") + tb_un = ds_un_lt.read("un_wpp_lt", reset_metadata="keep_origins") + tb_un_proj = ds_un_lt.read("un_wpp_lt_proj", reset_metadata="keep_origins") + tb_tfr = ds_un_wpp.read("fertility_rate", reset_metadata="keep_origins") # Estimate cumulative survival in UN LT tables tb_un = estimate_un_cum_survival( From fdf1e8bb05c5b24358ff58cbdac59205d5c6ade2 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 19 Dec 2024 23:57:14 +0100 Subject: [PATCH 09/15] add clarification to the name --- .../demography/2024-12-17/efr_malani_jacob.meta.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index bf8cf8a1203..88626f997e6 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -15,7 +15,7 @@ tables: un: variables: efr_repr: - title: Reproductive Effective Fertility rate (scaled by sex ratio) + title: Reproductive Effective Fertility rate (scaled by sex ratio), using UN data description_short: |- The number of daughters that live long enough to reproduce, between ages 15 and 49. This focuses on daughters, not all children because only females reproduce. Because a child need not live until age 49 to reproduce, we approximate efr_r by taking the average of efr over all reproductive ages (15-49). 
unit: "children per women" @@ -35,7 +35,7 @@ tables: Read more details in the author's paper: https://www.nber.org/papers/w33175 efr_labor: - title: Labor Effective Fertility rate + title: Labor Effective Fertility rate, using UN data description_short: |- The number of children born in a year who will live long enough to earn labor income. This is approximated this by taking the average of Effective Fertility rate (EFR) over all working ages (15-65). unit: "children per women" @@ -51,7 +51,7 @@ tables: Read more details in the author's paper: https://www.nber.org/papers/w33175 cumulative_survival_repr: - title: Cumulative survival probability to reproductive age + title: Cumulative survival probability to reproductive age, using UN data description_short: |- The probability that a person born in a given year will live long enough to reach reproductive age (15-49). description_processing: |- @@ -63,7 +63,7 @@ tables: unit: "" cumulative_survival_labor: - title: Cumulative survival probability to labor age + title: Cumulative survival probability to labor age, using UN data description_short: |- The probability that a person born in a given year will live long enough to reach labor age (15-65). 
description_processing: |- From 1863d0ba201f1339a43d4bde2d43312b5f829e27 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 00:24:53 +0100 Subject: [PATCH 10/15] wip --- .../2024-12-17/efr_malani_jacob.meta.yml | 4 +- .../demography/2024-12-17/efr_malani_jacob.py | 91 ++++++++++++------- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index 88626f997e6..62525770764 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -17,7 +17,7 @@ tables: efr_repr: title: Reproductive Effective Fertility rate (scaled by sex ratio), using UN data description_short: |- - The number of daughters that live long enough to reproduce, between ages 15 and 49. This focuses on daughters, not all children because only females reproduce. Because a child need not live until age 49 to reproduce, we approximate efr_r by taking the average of efr over all reproductive ages (15-49). + The number of children who live long enough to reproduce, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). unit: "children per women" description_processing: |- For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. @@ -37,7 +37,7 @@ tables: efr_labor: title: Labor Effective Fertility rate, using UN data description_short: |- - The number of children born in a year who will live long enough to earn labor income. This is approximated this by taking the average of Effective Fertility rate (EFR) over all working ages (15-65). + The number of children who live long enough to earn labor income, per woman. 
This number is dependent on the survival of children to working age (between 15 and 65 years old). unit: "children per women" description_processing: |- For a given cohort year, we estimate the cumulative survival probability for a person to reach each age age from 0 to 65. E.g. the probability of a person born in 2000 to reach age 15, 16, 17, ..., 65. diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index 86a2c896c85..c40939754f4 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" +from owid.catalog import Origin from owid.catalog import processing as pr from etl.helpers import PathFinder, create_dataset @@ -16,6 +17,14 @@ AGE_REPR_END = 49 AGE_LAB_END = 65 +# Additional origin metadata of the paper +origin = Origin( + producer="Malani and Jacob", + title="A New Measure of Surviving Children that Sheds Light on Long-term Trends in Fertility", + citation_full="Malani, A., & Jacob, A. (2024). A New Measure of Surviving Children that Sheds Light on Long-term Trends in Fertility.
https://doi.org/10.3386/w33175", + date_published="2024-11-01", # type: ignore +) + def run(dest_dir: str) -> None: # @@ -39,45 +48,16 @@ def run(dest_dir: str) -> None: tb_proj=tb_un_proj, ) - # Filter TFR table - tb_tfr = tb_tfr.loc[ - (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), - ["country", "year", "fertility_rate"], - ] - - # Add TFR - tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") - - # Estimate EFR - tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] - - # Estimate metrics - ## EFR-labor: Average number of daughters that make it to the reproductive age (15-49) - ## EFR-reproductive: Average number of kids that make it to the labour age (15-65) - ## Cum survival prob, labor: Probability of a girl to survive to the reproductive age (15-49) - ## Cum survival prob, reproductive: Probability of a kid to survive to the labor age (15-65) - tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] - tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() - - # Pivot - tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() - - def rename_col(colname): - mapping = { - "female": "repr", - "total": "labor", - } - - if colname[1] == "": - return colname[0] - else: - return f"{colname[0]}_{mapping.get(colname[1])}" - - tb_un.columns = [rename_col(col) for col in tb_un.columns] + # Add EFR + tb_un = estimate_un_efr(tb_un, tb_tfr) # Format tb_un = tb_un.format(["country", "year"], short_name="un") + # Add extra origin + tb_un.efr_repr.metadata.origins = [origin] + tb_un.efr_repr.metadata.origins + + # Build list of tables tables = [ tb_un, ] @@ -131,3 +111,44 @@ def estimate_un_cum_survival(tb, tb_proj): # tb = tb.drop(columns=["year_born"]) return tb + + +def estimate_un_efr(tb_un, tb_tfr): + # Filter TFR table + tb_tfr = tb_tfr.loc[ + 
(tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), + ["country", "year", "fertility_rate"], + ] + + # Add TFR + tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") + + # Estimate EFR + tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] + + # Estimate metrics + ## EFR-reproductive: Average number of daughters that make it to the reproductive age (15-49) + ## EFR-labor: Average number of kids that make it to the labour age (15-65) + ## Cum survival prob, reproductive: Probability of a girl to survive to the reproductive age (15-49) + ## Cum survival prob, labor: Probability of a kid to survive to the labor age (15-65) + tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] + tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() + + # Pivot + tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() + + # Rename columns + def rename_col(colname): + mapping = { + "female": "repr", + "total": "labor", + } + + if colname[1] == "": + return colname[0] + else: + return f"{colname[0]}_{mapping.get(colname[1])}" + + tb_un.columns = [rename_col(col) for col in tb_un.columns] + + return tb_un From 916354eff0b3428b9b9d3f1dbe3753cbbe28d47f Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 00:29:42 +0100 Subject: [PATCH 11/15] improve origins --- .../data/garden/demography/2024-12-17/efr_malani_jacob.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index c40939754f4..35cd6069300 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -23,6 +23,14 @@ title="A New Measure of Surviving Children that Sheds Light on
Long-term Trends in Fertility", citation_full="Malani, A., & Jacob, A. (2024). A New Measure of Surviving Children that Sheds Light on Long-term Trends in Fertility. https://doi.org/10.3386/w33175", date_published="2024-11-01", # type: ignore + description=""" +The world has experienced a dramatic decline in total fertility rate (TFR) since the Industrial Revolution. Yet the consequences of this decline flow not merely from a reduction in births, but from a reduction in the number of surviving children. Authors propose a new measure of the number of surviving children per female, which authors call the effective fertility rate (EFR). EFR can be approximated as the product of TFR and the probability of survival. Moreover, TFR changes can be decomposed into changes that preserve EFR and those that change EFR. Authors specialized EFR to measure the number of daughters that survive to reproduce (reproductive EFR) and the number of children that survive to become workers (labor EFR). + +Authors use three data sets to shed light on EFR over time across locations. First, authors use data from 165 countries between 1950-2019 to show that one-third of the global decline in TFR during this period did not change labor EFR, suggesting that a substantial portion of fertility decline merely compensated for higher survival rates. Focusing on the change in labor EFR, at least 40% of variation cannot be explained by economic factors such as income, prices, education levels, structural transformation, and urbanization, leaving room for explanations like cultural change. Second, using historical demographic data on European countries since 1750, authors find that there was dramatic fluctuation in labor EFR in Europe around each of the World Wars, a phenomenon that is distinct from the demographic transition. However, prior to that fluctuation, EFRs were remarkably constant, even as European countries were undergoing demographic transitions.
Indeed, even when EFRs fell below 2 after 1975, we find that EFRs remained stable rather than continuing to decline. Third, data from the US since 1800 reveal that, despite great differences in mortality rates, Black and White populations have remarkably similar numbers of surviving children over time. + + +""", + url_main="https://www.nber.org/papers/w33175", ) From b21ce2e6f8d5d24bc712c1e045897a36a420dd46 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 15:07:02 +0100 Subject: [PATCH 12/15] metadata edits --- .../demography/2024-12-17/efr_malani_jacob.meta.yml | 8 ++++---- .../data/garden/demography/2024-12-17/efr_malani_jacob.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index 62525770764..a27eb6b6ef5 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -15,7 +15,7 @@ tables: un: variables: efr_repr: - title: Reproductive Effective Fertility rate (scaled by sex ratio), using UN data + title: Reproductive Effective Fertility rate (scaled by sex ratio) description_short: |- The number of children who live long enough to reproduce, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). unit: "children per women" @@ -35,7 +35,7 @@ tables: Read more details in the author's paper: https://www.nber.org/papers/w33175 efr_labor: - title: Labor Effective Fertility rate, using UN data + title: Labor Effective Fertility rate description_short: |- The number of children who live long enough to earn labor income, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). 
unit: "children per women" @@ -51,7 +51,7 @@ tables: Read more details in the author's paper: https://www.nber.org/papers/w33175 cumulative_survival_repr: - title: Cumulative survival probability to reproductive age, using UN data + title: Cumulative survival probability to reproductive age description_short: |- The probability that a person born in a given year will live long enough to reach reproductive age (15-49). description_processing: |- @@ -63,7 +63,7 @@ tables: unit: "" cumulative_survival_labor: - title: Cumulative survival probability to labor age, using UN data + title: Cumulative survival probability to labor age description_short: |- The probability that a person born in a given year will live long enough to reach labor age (15-65). description_processing: |- diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index 35cd6069300..d36331acf7c 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -63,7 +63,8 @@ def run(dest_dir: str) -> None: tb_un = tb_un.format(["country", "year"], short_name="un") # Add extra origin - tb_un.efr_repr.metadata.origins = [origin] + tb_un.efr_repr.metadata.origins + for col in tb_un.columns: + tb_un[col].metadata.origins = [origin] + tb_un[col].metadata.origins # Build list of tables tables = [ From e03524a71b7abea487a46f39bb94f8d94c535d6c Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 18:56:26 +0100 Subject: [PATCH 13/15] wip --- .../2024-12-17/efr_malani_jacob.meta.yml | 14 +- .../demography/2024-12-17/efr_malani_jacob.py | 223 +++++++++++++----- 2 files changed, 179 insertions(+), 58 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index a27eb6b6ef5..f05fdec3da7 100644 --- 
a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -12,7 +12,7 @@ dataset: title: "Effective Fertility Rates (Malani and Jacob)" tables: - un: + efr_malani_jacob: variables: efr_repr: title: Reproductive Effective Fertility rate (scaled by sex ratio) @@ -20,10 +20,12 @@ tables: The number of children who live long enough to reproduce, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). unit: "children per women" description_processing: |- - For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. We have used HMD data for years before 1950, and UN's for years after 1950 (including). We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age. + For years before 1950, we have used HMD data, which does not provide TFR values. Instead, we have used an approximation of the TFR based on births and female population (in reproductive ages), as suggested by Jacob and Malani (2024). + The Reproductive Effective Fertility rate (EFR) is the average of the EFR over all reproductive ages (15-49). 
Note that the Reproductive Effective Fertility rate (EFR) is an approximation of the number of daughters, so it uses the total fertility rate of female children, or equivalently, the TFR weighted by the sex ratio at birth. @@ -40,10 +42,12 @@ tables: The number of children who live long enough to earn labor income, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). unit: "children per women" description_processing: |- - For a given cohort year, we estimate the cumulative survival probability for a person to reach each age age from 0 to 65. E.g. the probability of a person born in 2000 to reach age 15, 16, 17, ..., 65. + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age age from 0 to 65. E.g. the probability of a person born in 2000 to reach age 15, 16, 17, ..., 65. We have used HMD data for years before 1950, and UN's for years after 1950 (including). We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a women that will live long enough to reach that age. + For years before 1950, we have used HMD data, which does not provide TFR values. Instead, we have used an approximation of the TFR based on births and female population (in reproductive ages), as suggested by Jacob and Malani (2024). + The Labor Effective Fertility rate (EFR) is the average of the EFR over all labor ages (15-65). So we have that: EFR_labor = (TFR * mean(EFR)), where the mean is taken over all labor ages (15-65). @@ -55,7 +59,7 @@ tables: description_short: |- The probability that a person born in a given year will live long enough to reach reproductive age (15-49). 
description_processing: |- - For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. We have used HMD data for years before 1950, and UN's for years after 1950 (including). This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc. @@ -67,7 +71,7 @@ tables: description_short: |- The probability that a person born in a given year will live long enough to reach labor age (15-65). description_processing: |- - For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 65. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 65. + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 65. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 65. We have used HMD data for years before 1950, and UN's for years after 1950 (including). This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc. 
diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index d36331acf7c..f8ad6bb9da2 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" +import numpy as np from owid.catalog import Origin from owid.catalog import processing as pr @@ -9,6 +10,7 @@ paths = PathFinder(__file__) COLUMNS_UN = ["country", "year", "age", "sex", "probability_of_survival"] +COLUMNS_HMD = ["country", "year", "age", "sex", "probability_of_death"] # Years YEAR_UN_START = 1950 YEAR_UN_END = 2023 @@ -32,44 +34,169 @@ """, url_main="https://www.nber.org/papers/w33175", ) +# Extrapolate data for UK nations from UK & Ireland +UK_NATION_EXTRAPOLATION = { + "England and Wales": "United Kingdom", + "Scotland": "United Kingdom", + "Northern Ireland": "Ireland", +} +# Countries that are in HMD but not in UN +COUNTRIES_NOT_IN_UN = ["West Germany", "East Germany", "Taiwan"] + + +def _clean_un_table(tb): + """Basic cleaning of UN table.""" + # Rename columns + tb = tb.rename(columns={"location": "country"}) + + # Filter 'total' and 'female', select relevant columns + tb = tb.loc[tb["sex"].isin(["total", "female"]), COLUMNS_UN] + + # Dtypes + tb["age"] = tb["age"].str.replace("100+", "100").astype("UInt16") + + # Scale + tb["probability_of_survival"] /= 100 + + return tb + + +def _clean_hmd_table(tb): + """Basic cleaning of HMD table""" + # Filter 'total' and 'female', 'period' life tables, relevant columns + tb = tb.loc[tb["sex"].isin(["total", "female"]) & (tb["type"] == "period"), COLUMNS_HMD] + + # Dtypes + tb = tb.loc[~tb["age"].str.contains("-")] + tb["age"] = tb["age"].str.replace("110+", "110").astype("UInt16") + + # Scale + tb["probability_of_survival"] = 1 - tb["probability_of_death"] / 100 + tb = tb.drop(columns=["probability_of_death"]) + + 
return tb + + +def combine_un_hmd(tb_un, tb_hmd): + """Combine UN and HMD tables. + + We use this function to combine survival probabilities and TFR time-series. + """ + # Keep old years (we use UN for post-1950) + tb_hmd = tb_hmd.loc[tb_hmd["year"] < YEAR_UN_START] + + # Drop countries not covered by UN + tb_hmd = tb_hmd.loc[~tb_hmd["country"].isin(COUNTRIES_NOT_IN_UN)] + ## sanity check + countries_hmd = set(tb_hmd["country"].unique()) + countries_un = set(tb_un["country"].unique()) + countries_unexpected = { + c for c in countries_hmd if (c not in countries_un) and (c not in UK_NATION_EXTRAPOLATION.keys()) + } + assert ( + countries_unexpected == set() + ), f"There should be no country ({countries_unexpected}) in HMD that is not in UN" + + # UK nation adaptations (extrapolate data from UK & Ireland) + tb_extra = [] + for nation, country in UK_NATION_EXTRAPOLATION.items(): + tb_extra.append(tb_un.loc[tb_un["country"] == country].assign(country=nation)) + # Combine + tb = pr.concat([tb_un, tb_hmd, *tb_extra], ignore_index=True) + # sanity check + cols = list({"country", "year", "age", "sex"}.intersection(tb.columns)) + _ = tb.format(cols) + + return tb + + +def get_tfr_estimation(tb_b, tb_p): + ## Get total births + tb_b = tb_b.loc[tb_b["sex"] == "total", ["country", "year", "births"]] + + ## Get female population aged 15-49 + ages = {f"{a}-{a+4}" for a in range(15, 50, 5)} + tb_p = tb_p.loc[(tb_p["sex"] == "female") & tb_p.age.isin(ages)] + ## sanity check + x = tb_p.groupby(["country", "year"]).agg({"age": ("unique", "nunique")}) + x.columns = ["set", "nun"] + assert x.nun.unique() == 7, "There should be 7 unique age groups for each country-year" + ## Aggregate and get population for women 15-49 years old + tb_p = tb_p.groupby(["country", "year"], as_index=False)["population"].sum() + + ## Merge + tb_appr = tb_b.merge(tb_p, on=["country", "year"], validate="1:1") + + ## Approximate TFR = 35 * births / population(females in reproductive age) + 
tb_appr["fertility_rate"] = 35 * tb_appr["births"] / tb_appr["population"] + + ## Drop unnecessary columns + tb_appr = tb_appr.drop(columns=["births", "population"]) + + return tb_appr def run(dest_dir: str) -> None: - # - # Load inputs. - # # Load meadow dataset. ds_un_lt = paths.load_dataset("un_wpp_lt") ds_un_wpp = paths.load_dataset("un_wpp") + ds_hmd = paths.load_dataset("hmd") - # ds_hmd = paths.load_dataset("hmd") - # ds_hfd = paths.load_dataset("hfd") + # + # 1/ Estimate cumulative survival probabilities + # # Load tables tb_un = ds_un_lt.read("un_wpp_lt", reset_metadata="keep_origins") tb_un_proj = ds_un_lt.read("un_wpp_lt_proj", reset_metadata="keep_origins") + tb_hmd = ds_hmd.read("life_tables", reset_metadata="keep_origins") tb_tfr = ds_un_wpp.read("fertility_rate", reset_metadata="keep_origins") - # Estimate cumulative survival in UN LT tables - tb_un = estimate_un_cum_survival( - tb=tb_un, - tb_proj=tb_un_proj, - ) + # Prepare UN table + tb_un = _clean_un_table(tb_un) + tb_un_proj = _clean_un_table(tb_un_proj) + tb_un = pr.concat([tb_un, tb_un_proj], ignore_index=True) + # Prepare HMD data + tb_hmd = _clean_hmd_table(tb_hmd) + # Combine HMD and UN data (survival probabilities) + tb = combine_un_hmd(tb_un, tb_hmd) + + # Estimate cumulative survival probabilities + tb = estimate_cum_survival(tb=tb) + + # + # 2/ Estimate EFR + # + + # TFR approximation + ## Get total births + tb_b = ds_hmd.read("births") + ## Get population + tb_p = ds_hmd.read("population") + ## Combine + tb_tfr_apr = get_tfr_estimation(tb_b, tb_p) + + # Load HFD data + tb_tfr = tb_tfr.loc[ + (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), + ["country", "year", "fertility_rate"], + ] + + # Combine HMD and UN data (tfr) + tb_tfr = combine_un_hmd(tb_tfr, tb_tfr_apr) # Add EFR - tb_un = estimate_un_efr(tb_un, tb_tfr) + tb = estimate_efr(tb, tb_tfr) # Format - tb_un = tb_un.format(["country", "year"], short_name="un") + tb = 
tb.format(["country", "year"], short_name="efr_malani_jacob") # Add extra origin - for col in tb_un.columns: - tb_un[col].metadata.origins = [origin] + tb_un[col].metadata.origins + for col in tb.columns: + tb[col].metadata.origins = [origin] + tb[col].metadata.origins # Build list of tables - tables = [ - tb_un, - ] + tables = [tb] # Save outputs. # @@ -84,67 +211,49 @@ def run(dest_dir: str) -> None: ds_garden.save() -def estimate_un_cum_survival(tb, tb_proj): - # Concatenate - tb = pr.concat([tb, tb_proj], ignore_index=True) - - # Rename columns - tb = tb.rename(columns={"location": "country"}) - - # Filter 'total' and 'female' - tb = tb.loc[tb["sex"].isin(["total", "female"]), COLUMNS_UN] - - # Dtypes - tb["age"] = tb["age"].str.replace("100+", "100").astype("UInt16") - - # Scale - tb["probability_of_survival"] /= 100 - +def estimate_cum_survival(tb): # Cumulative product # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age. # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1). # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we create year_born. # After that, we just do the cumulative product for each year_born. # Note that for the cumulative product to make sense, we need to first sort table by age! + # Step 0: Save min year. i.e. 
year to start recording + tb["year_min"] = tb.groupby("country")["year"].transform("min") # Step 1: Replace year with "cohort year" tb["year"] = tb["year"] - tb["age"] - # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows) - tb = tb.loc[(tb["year"] >= YEAR_UN_START) & (tb["year"] <= YEAR_UN_END)] + # Step 2: We only estimate the cumulative survival probability for people born between year_min* and 2023 (reduction of 50% rows) + # year_min is the first year for which the source reported data (e.g. 1950 for most UN-only countries, varies for HMD countries) + tb = tb.loc[(tb["year"] >= tb["year_min"]) & (tb["year"] <= YEAR_UN_END)] + assert ( + tb[tb["year"] == tb["year_min"]].groupby("country").age.min().max() == 0 + ), "There should be age zero for starting year of each country!" # Step 3: Sort by age, so we can do the cumulative product later tb = tb.sort_values(["country", "sex", "year", "age"], ignore_index=True) # Step 4: Estimate cumulative survival probability tb["cumulative_survival"] = tb.groupby(["country", "sex", "year"])["probability_of_survival"].cumprod() - # Step 5: Keep only years of interest (15-65), further reduction of 65% rows (aggregate -83%) + # Step 5: Keep only years of interest (15-65), further reduction of 50% rows (aggregate -50%) tb = tb.loc[(tb["age"] >= AGE_LAB_START) & (tb["age"] <= AGE_LAB_END)] - # # Step 6: Drop columns - # tb = tb.drop(columns=["year_born"]) - return tb -def estimate_un_efr(tb_un, tb_tfr): - # Filter TFR table - tb_tfr = tb_tfr.loc[ - (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), - ["country", "year", "fertility_rate"], - ] - +def estimate_efr(tb, tb_tfr): # Add TFR - tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") + tb = tb.merge(tb_tfr, on=["country", "year"], validate="m:1") # Estimate EFR - tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] + 
tb["efr"] = tb["fertility_rate"] * tb["cumulative_survival"] # Estimate metrics ## EFR-labor: Average number of daughters that make it to the reproductive age (15-49) ## EFR-reproductive: Average number of kids that make it to the labour age (15-65) ## Cum survival prob, labor: Probability of a girl to survive to the reproductive age (15-49) ## Cum survival prob, reproductive: Probability of a kid to survive to the labor age (15-65) - tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] - tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() + tb = tb.loc[(tb["age"] <= AGE_REPR_END) | (tb["sex"] == "total")] + tb = tb.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() # Pivot - tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() + tb = tb.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() # Rename columns def rename_col(colname): @@ -158,6 +267,14 @@ def rename_col(colname): else: return f"{colname[0]}_{mapping.get(colname[1])}" - tb_un.columns = [rename_col(col) for col in tb_un.columns] + tb.columns = [rename_col(col) for col in tb.columns] - return tb_un + # Check inf values + x = tb[tb["efr_repr"].isin([np.inf, -np.inf])] + assert len(x) == 4 + x = tb[tb["efr_labor"].isin([np.inf, -np.inf])] + assert len(x) == 4 + + # Replace inf with NA + tb[["efr_repr", "efr_labor"]] = tb[["efr_repr", "efr_labor"]].replace([np.inf, -np.inf], np.nan) + return tb From 1654fc21aebf26dc026efee26c4e60ca7a4e65b4 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 19:20:18 +0100 Subject: [PATCH 14/15] add distribution indicator --- .../2024-12-17/efr_malani_jacob.meta.yml | 18 +++++- .../demography/2024-12-17/efr_malani_jacob.py | 60 +++++++++++++------ 2 files changed, 59 insertions(+), 19 deletions(-) diff --git 
a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index f05fdec3da7..bb33bb20c37 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -12,7 +12,7 @@ dataset: title: "Effective Fertility Rates (Malani and Jacob)" tables: - efr_malani_jacob: + aggregated: variables: efr_repr: title: Reproductive Effective Fertility rate (scaled by sex ratio) @@ -77,3 +77,19 @@ tables: Read more details in the author's paper: https://www.nber.org/papers/w33175 unit: "" + + distribution: + variables: + efr: + title: Effective Fertility rate, distribution by age (year << birth_year >>) + description_short: |- + The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age. + unit: "children per women" + description_processing: |- + For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 100. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on. We have used HMD data for years before 1950, and UN's for years after 1950 (including). + + We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age. + + For years before 1950, we have used HMD data, which does not provide TFR values. Instead, we have used an approximation of the TFR based on births and female population (in reproductive ages), as suggested by Jacob and Malani (2024). 
+ + Read more details in the author's paper: https://www.nber.org/papers/w33175 diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index f8ad6bb9da2..a2e717bd0bf 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -42,6 +42,7 @@ } # Countries that are in HMD but not in UN COUNTRIES_NOT_IN_UN = ["West Germany", "East Germany", "Taiwan"] +YEARS_EFR_DISTR = [1950, 2023] def _clean_un_table(tb): @@ -150,7 +151,6 @@ def run(dest_dir: str) -> None: tb_un = ds_un_lt.read("un_wpp_lt", reset_metadata="keep_origins") tb_un_proj = ds_un_lt.read("un_wpp_lt_proj", reset_metadata="keep_origins") tb_hmd = ds_hmd.read("life_tables", reset_metadata="keep_origins") - tb_tfr = ds_un_wpp.read("fertility_rate", reset_metadata="keep_origins") # Prepare UN table tb_un = _clean_un_table(tb_un) @@ -168,35 +168,54 @@ def run(dest_dir: str) -> None: # 2/ Estimate EFR # - # TFR approximation + # 2.1 TFR approximation from HFD ## Get total births - tb_b = ds_hmd.read("births") + tb_b = ds_hmd.read("births", reset_metadata="keep_origins") ## Get population - tb_p = ds_hmd.read("population") + tb_p = ds_hmd.read("population", reset_metadata="keep_origins") ## Combine tb_tfr_apr = get_tfr_estimation(tb_b, tb_p) - # Load HFD data + # 2.2 Load TFR from UN + tb_tfr = ds_un_wpp.read("fertility_rate", reset_metadata="keep_origins") tb_tfr = tb_tfr.loc[ (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), ["country", "year", "fertility_rate"], ] - # Combine HMD and UN data (tfr) + # 2.3 TFR: Combine HMD and UN data tb_tfr = combine_un_hmd(tb_tfr, tb_tfr_apr) - # Add EFR + # 2.4 Get EFR distribution (for each age) tb = estimate_efr(tb, tb_tfr) - # Format - tb = tb.format(["country", "year"], short_name="efr_malani_jacob") - - # Add extra origin - for col in 
tb.columns: - tb[col].metadata.origins = [origin] + tb[col].metadata.origins - - # Build list of tables - tables = [tb] + # + # 3/ Create output tables + + # 3.1 Distribution indicators (EFR(age) in YEARS_EFR_DISTR) + tb_efr = tb.loc[tb["year"].isin(YEARS_EFR_DISTR) & (tb["sex"] == "total"), ["country", "year", "age", "efr"]] + tb_efr = tb_efr.rename(columns={"year": "birth_year"}) + + # 3.2 Obtain labor and reproductive EFRs + ## Keep only years of interest (15-65), further reduction of 50% rows (aggregate -50%) + tb_agg = tb.loc[(tb["age"] >= AGE_LAB_START) & (tb["age"] <= AGE_LAB_END)] + tb_agg = aggregate_efr(tb=tb_agg) + + # 3.3 Format + tb_agg = tb_agg.format(["country", "year"], short_name="aggregated") + tb_efr = tb_efr.format(["country", "age", "birth_year"], short_name="distribution") + + # 3.4 Add extra origin + for col in tb_agg.columns: + tb_agg[col].metadata.origins = [origin] + tb_agg[col].metadata.origins + for col in tb_efr.columns: + tb_efr[col].metadata.origins = [origin] + tb_efr[col].metadata.origins + + # 3.5 Build list of tables + tables = [ + tb, + tb_agg, + ] # Save outputs. 
# @@ -232,8 +251,8 @@ def estimate_cum_survival(tb): tb = tb.sort_values(["country", "sex", "year", "age"], ignore_index=True) # Step 4: Estimate cumulative survival probability tb["cumulative_survival"] = tb.groupby(["country", "sex", "year"])["probability_of_survival"].cumprod() - # Step 5: Keep only years of interest (15-65), further reduction of 50% rows (aggregate -50%) - tb = tb.loc[(tb["age"] >= AGE_LAB_START) & (tb["age"] <= AGE_LAB_END)] + # Step 6: Drop unnecessary columns + tb = tb.drop(columns=["year_min"]) return tb @@ -244,6 +263,11 @@ def estimate_efr(tb, tb_tfr): # Estimate EFR tb["efr"] = tb["fertility_rate"] * tb["cumulative_survival"] + return tb + + +def aggregate_efr(tb): + """Estimate labor and reproductive EFRs.""" # Estimate metrics ## EFR-labor: Average number of daughters that make it to the reproductive age (15-49) ## EFR-reproductive: Average number of kids that make it to the labour age (15-65) From e0f56fa7382ed33792a907e2b81fd880c3494d8d Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 19:56:52 +0100 Subject: [PATCH 15/15] more years --- .../garden/demography/2024-12-17/efr_malani_jacob.py | 5 +++-- .../grapher/demography/2024-12-17/efr_malani_jacob.py | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index a2e717bd0bf..d21f5b5052d 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -42,7 +42,8 @@ } # Countries that are in HMD but not in UN COUNTRIES_NOT_IN_UN = ["West Germany", "East Germany", "Taiwan"] -YEARS_EFR_DISTR = [1950, 2023] +# Add more years if you need distribution-indicators for a particular year. 
+YEARS_EFR_DISTR = [1925, 1950, 1975, 2000, 2023] def _clean_un_table(tb): @@ -213,7 +214,7 @@ def run(dest_dir: str) -> None: # 3.5 Build list of tables tables = [ - tb, + tb_efr, tb_agg, ] diff --git a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py index f805b3f04b9..0a8f587fe8d 100644 --- a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py @@ -14,7 +14,14 @@ def run(dest_dir: str) -> None: ds_garden = paths.load_dataset("efr_malani_jacob") # Read table from garden dataset. - tables = list(ds_garden) + tables = [ + ds_garden.read("aggregated", reset_index=False), + ds_garden.read("distribution", reset_index=False).rename_index_names( + { + "age": "year", + } + ), + ] # # Save outputs.