From db6867e72d9406a3f385bf3e6065809a69ee2b07 Mon Sep 17 00:00:00 2001 From: Marigold Date: Mon, 18 Dec 2023 09:53:57 +0100 Subject: [PATCH] :bar_chart: migrate fertility rate and GDP to ETL --- dag/migrated.yml | 8 +++ .../2019-05-25/fertility_rate.meta.yml | 13 ++++ .../gapminder/2019-05-25/fertility_rate.py | 23 +++++++ .../garden/wb/2017-04-16/world_gdp.meta.yml | 12 ++++ .../data/garden/wb/2017-04-16/world_gdp.py | 23 +++++++ .../gapminder/2019-05-25/fertility_rate.py | 26 ++++++++ .../data/grapher/wb/2017-04-16/world_gdp.py | 26 ++++++++ ...d_bank_plus_maddison__2017_config.json.dvc | 6 +- ...ank_plus_maddison__2017_values.feather.dvc | 6 +- ...ected_gapminder__v12__2017_config.json.dvc | 6 +- ...ed_gapminder__v12__2017_values.feather.dvc | 6 +- .../2019-05-25/fertility_rate.feather.dvc | 64 +++++++++++++++++++ .../gapminder/2019-05-25/fertility_rate.py | 50 +++++++++++++++ snapshots/wb/2017-04-16/world_gdp.feather.dvc | 18 ++++++ snapshots/wb/2017-04-16/world_gdp.py | 54 ++++++++++++++++ 15 files changed, 329 insertions(+), 12 deletions(-) create mode 100644 etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.meta.yml create mode 100644 etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py create mode 100644 etl/steps/data/garden/wb/2017-04-16/world_gdp.meta.yml create mode 100644 etl/steps/data/garden/wb/2017-04-16/world_gdp.py create mode 100644 etl/steps/data/grapher/gapminder/2019-05-25/fertility_rate.py create mode 100644 etl/steps/data/grapher/wb/2017-04-16/world_gdp.py create mode 100644 snapshots/gapminder/2019-05-25/fertility_rate.feather.dvc create mode 100644 snapshots/gapminder/2019-05-25/fertility_rate.py create mode 100644 snapshots/wb/2017-04-16/world_gdp.feather.dvc create mode 100644 snapshots/wb/2017-04-16/world_gdp.py diff --git a/dag/migrated.yml b/dag/migrated.yml index c17f475a0eb..a75f987a32f 100644 --- a/dag/migrated.yml +++ b/dag/migrated.yml @@ -32,3 +32,11 @@ steps: - 
data://garden/iucn/2022-12-08/threatened_and_evaluated_species data://garden/iucn/2022-12-08/threatened_and_evaluated_species: - snapshot://iucn/2022-12-08/threatened_and_evaluated_species.feather + data://grapher/gapminder/2019-05-25/fertility_rate: + - data://garden/gapminder/2019-05-25/fertility_rate + data://garden/gapminder/2019-05-25/fertility_rate: + - snapshot://gapminder/2019-05-25/fertility_rate.feather + data://grapher/wb/2017-04-16/world_gdp: + - data://garden/wb/2017-04-16/world_gdp + data://garden/wb/2017-04-16/world_gdp: + - snapshot://wb/2017-04-16/world_gdp.feather diff --git a/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.meta.yml b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.meta.yml new file mode 100644 index 00000000000..fb5d3fb0d9f --- /dev/null +++ b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.meta.yml @@ -0,0 +1,13 @@ +dataset: + title: Fertility Rate (Selected Gapminder, v12) (2017) +tables: + fertility_rate: + variables: + fertility_rate__select_gapminder__v12__2017: + title: Fertility rate (Select Gapminder, v12) (2017) + unit: children per woman + display: + name: Fertility rate + description: Total fertility rate represents the number of children that would be born to a woman if she were to live + to the end of her childbearing years and bear children in accordance with age-specific fertility rates of the specified + year. diff --git a/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py new file mode 100644 index 00000000000..a4968d65180 --- /dev/null +++ b/etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py @@ -0,0 +1,23 @@ +"""Load snapshot and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load data from snapshot. 
+ # + snap = paths.load_snapshot() + tb = snap.read().set_index(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the snapshot. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/wb/2017-04-16/world_gdp.meta.yml b/etl/steps/data/garden/wb/2017-04-16/world_gdp.meta.yml new file mode 100644 index 00000000000..cbc2d8a9976 --- /dev/null +++ b/etl/steps/data/garden/wb/2017-04-16/world_gdp.meta.yml @@ -0,0 +1,12 @@ +dataset: + title: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017) +tables: + world_gdp: + variables: + world_gdp_in_2011_int_dollar__owid_based_on_world_bank__and__maddison__2017: + title: World GDP in 2011 Int.$ (OWID based on World Bank & Maddison (2017)) + unit: int.-$ + short_unit: $ + display: + numDecimalPlaces: 0 + unit: international-$ in 2011 prices diff --git a/etl/steps/data/garden/wb/2017-04-16/world_gdp.py b/etl/steps/data/garden/wb/2017-04-16/world_gdp.py new file mode 100644 index 00000000000..a4968d65180 --- /dev/null +++ b/etl/steps/data/garden/wb/2017-04-16/world_gdp.py @@ -0,0 +1,23 @@ +"""Load snapshot and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load data from snapshot. + # + snap = paths.load_snapshot() + tb = snap.read().set_index(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the snapshot. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/grapher/gapminder/2019-05-25/fertility_rate.py b/etl/steps/data/grapher/gapminder/2019-05-25/fertility_rate.py new file mode 100644 index 00000000000..7342084428a --- /dev/null +++ b/etl/steps/data/grapher/gapminder/2019-05-25/fertility_rate.py @@ -0,0 +1,26 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset() + + # Read table from garden dataset. + tb = ds_garden["fertility_rate"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/wb/2017-04-16/world_gdp.py b/etl/steps/data/grapher/wb/2017-04-16/world_gdp.py new file mode 100644 index 00000000000..e267f9f83c6 --- /dev/null +++ b/etl/steps/data/grapher/wb/2017-04-16/world_gdp.py @@ -0,0 +1,26 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset() + + # Read table from garden dataset. + tb = ds_garden["world_gdp"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json.dvc b/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json.dvc index 9b9a0be67dd..9e70090098e 100644 --- a/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json.dvc +++ b/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json.dvc @@ -2,7 +2,7 @@ meta: source: name: Our World in Data catalog backport url: https://owid.cloud/admin/datasets/393 - date_accessed: 2023-08-10 07:51:10.903473 + date_accessed: 2023-12-18 08:51:10.041539 publication_date: latest published_by: Our World in Data catalog backport name: Grapher metadata for @@ -10,7 +10,7 @@ meta: description: '' wdir: ../../../data/snapshots/backport/latest outs: -- md5: fec4d98fd5ca13082abafc76b86a26aa - size: 2307 +- md5: 576d8edddb47c338949704b1195f9ab8 + size: 2320 path: dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json diff --git a/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather.dvc b/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather.dvc index 4a15ef47500..95cd572a8c7 100644 --- a/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather.dvc +++ b/snapshots/backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather.dvc @@ -2,14 +2,14 @@ meta: source: name: Our World in Data catalog backport url: https://owid.cloud/admin/datasets/393 - date_accessed: 2023-08-10 07:51:43.657146 + date_accessed: 2023-12-18 
08:51:19.058056 publication_date: latest published_by: Our World in Data catalog backport name: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017) description: '' wdir: ../../../data/snapshots/backport/latest outs: -- md5: b68bc2ee4faebabe881e34c670882ad6 - size: 7170 +- md5: f79db210e3a7fc9c5262d491edd32e33 + size: 7826 path: dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather diff --git a/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json.dvc b/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json.dvc index 173f7362c38..dd96ab0946c 100644 --- a/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json.dvc +++ b/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json.dvc @@ -2,13 +2,13 @@ meta: source: name: Our World in Data catalog backport url: https://owid.cloud/admin/datasets/4132 - date_accessed: 2023-08-10 07:51:28.217495 + date_accessed: 2023-12-18 08:46:37.900758 publication_date: latest published_by: Our World in Data catalog backport name: Grapher metadata for dataset_4132_fertility_rate__selected_gapminder__v12__2017 description: '' wdir: ../../../data/snapshots/backport/latest outs: -- md5: cfd06346be488f16a4434610171f380b - size: 3932 +- md5: 9dfa632754bb109296e90e3f69514aff + size: 3945 path: dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json diff --git a/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather.dvc b/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather.dvc index fde82cd85cb..f8e93fa6b66 100644 --- a/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather.dvc +++ 
b/snapshots/backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather.dvc @@ -2,13 +2,13 @@ meta: source: name: Our World in Data catalog backport url: https://owid.cloud/admin/datasets/4132 - date_accessed: 2023-08-10 07:51:58.704001 + date_accessed: 2023-12-18 08:46:47.411987 publication_date: latest published_by: Our World in Data catalog backport name: Fertility Rate (Selected Gapminder, v12) (2017) description: '' wdir: ../../../data/snapshots/backport/latest outs: -- md5: 67300677b53c79834d38e4d057b06577 - size: 121386 +- md5: aaf0381967e890beadbdbc736bd783f7 + size: 271458 path: dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather diff --git a/snapshots/gapminder/2019-05-25/fertility_rate.feather.dvc b/snapshots/gapminder/2019-05-25/fertility_rate.feather.dvc new file mode 100644 index 00000000000..bab475de678 --- /dev/null +++ b/snapshots/gapminder/2019-05-25/fertility_rate.feather.dvc @@ -0,0 +1,64 @@ +meta: + source: + name: Fertility Rate (Selected Gapminder, v12) (2017) + description: >- + Data is that of version 12 of Gapminder, the latest version as of 2019. This + is the full fertility rate dataset published + by Gapminder. + + + Gapminder's sources and methodology are well-documented in its dataset at: https://www.gapminder.org/data/ + + + It notes its data sources during three key periods of time: + + + — 1800 to 1950 (and in some cases also years after 1950): Gapminder v6 which + were compiled and documented by Mattias + Lindgren. + + + — 1950 to 2014: In most cases we use the latest UN estimates from World Population + Prospects 2017 published in the file + with Annually interpolated demographic indicators, called WPP2017_INT_F01_ANNUAL_DEMOGRAPHIC_INDICATORS.xlsx + , accessed + on September 2, 2017. + + + — 2015 – 2099: We use the UN forecast of future fertility rate in all countries, + called median fertility variant. + + + Version 12 of the dataset extends back to the year 1800. 
Version 6 of Gapminder's + fertility series includes data for + a few countries further back than 1800. We have included more historic data from + Version 6 for Finland, the United Kingdom + and Sweden. All data from 1800 onwards is from Version 12; data from pre-1800 + is from Version 6. + + + There are significant uncertainties in data for many countries pre-1950. To + develop full series back to 1800 for all + countries, Gapminder combines published estimates within the academic literature + and national statistics, with their + own guesstimates and extrapolations for countries without published estimates. + This series presents the selective Gapminder + dataset: we have removed data points which were estimated by Gapminder with + high uncertainty and instead only include + those from published sources or the United Nations dataset. + + + We also publish the full dataset from Gapminder for users looking for a complete + series. However, we should highlight + that some of these estimates have a high degree of uncertainty. 
This dataset + can be accessed here: https://ourworldindata.org/grapher/fertility-rate-complete-gapminder + url: https://drive.google.com/drive/folders/1i30LyIcvbLa800Q1ZFrPGlNHIw9ymYGm + date_accessed: 2019-05-25 + published_by: Gapminder + name: Fertility Rate (Selected Gapminder, v12) (2017) + description: '' +wdir: ../../../data/snapshots/gapminder/2019-05-25 +outs: +- md5: 07302fc652ae3018457e339e7fde539a + size: 135234 + path: fertility_rate.feather diff --git a/snapshots/gapminder/2019-05-25/fertility_rate.py b/snapshots/gapminder/2019-05-25/fertility_rate.py new file mode 100644 index 00000000000..099ac75360b --- /dev/null +++ b/snapshots/gapminder/2019-05-25/fertility_rate.py @@ -0,0 +1,50 @@ +from pathlib import Path + +import click +import pandas as pd + +from etl.backport_helpers import long_to_wide +from etl.snapshot import Snapshot, SnapshotMeta + +SNAPSHOT_NAMESPACE = Path(__file__).parent.parent.name +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Load backported snapshot. + snap_values = Snapshot("backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather") + snap_values.pull() + snap_config = Snapshot("backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json") + snap_config.pull() + + # Create snapshot metadata for the new file + meta = SnapshotMeta(**snap_values.metadata.to_dict()) + meta.namespace = SNAPSHOT_NAMESPACE + meta.version = SNAPSHOT_VERSION + meta.short_name = "fertility_rate" + meta.fill_from_backport_snapshot(snap_config.path) + meta.save() + + # Create a new snapshot. + snap = Snapshot(meta.uri) + + # Convert from long to wide format. + df = long_to_wide(pd.read_feather(snap_values.path)) + + # Copy file to the new snapshot. 
+ snap.path.parent.mkdir(parents=True, exist_ok=True) + df.reset_index().to_feather(snap.path) + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wb/2017-04-16/world_gdp.feather.dvc b/snapshots/wb/2017-04-16/world_gdp.feather.dvc new file mode 100644 index 00000000000..d221e784083 --- /dev/null +++ b/snapshots/wb/2017-04-16/world_gdp.feather.dvc @@ -0,0 +1,18 @@ +meta: + source: + name: Our World In Data based on World Bank & Maddison (2017) + description: 'The data presented here from 1990 onwards is from the World Bank. + It is total global GDP in 2011 international-$ as published here: http://data.worldbank.org/indicator/NY.GDP.MKTP.PP.KD + (accessed on April 16, 2017). Data earlier than 1990 is backwards extended from + the World Bank observation for 1990 based on the growth rates implied by Maddison + data. The Maddison data is published here: http://www.ggdc.net/maddison/oriindex.htm' + url: Please see additional information for links + date_accessed: 2017-04-16 + published_by: New Maddison Project Database and World Bank + name: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017) + description: '' +wdir: ../../../data/snapshots/wb/2017-04-16 +outs: +- md5: 8987b0657951b975b5bfc9d71306078d + size: 4050 + path: world_gdp.feather diff --git a/snapshots/wb/2017-04-16/world_gdp.py b/snapshots/wb/2017-04-16/world_gdp.py new file mode 100644 index 00000000000..f82f7eb391b --- /dev/null +++ b/snapshots/wb/2017-04-16/world_gdp.py @@ -0,0 +1,54 @@ +from pathlib import Path + +import click +import pandas as pd + +from etl.backport_helpers import long_to_wide +from etl.snapshot import Snapshot, SnapshotMeta + +SNAPSHOT_NAMESPACE = Path(__file__).parent.parent.name +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> 
None: + # Load backported snapshot. + snap_values = Snapshot( + "backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather" + ) + snap_values.pull() + snap_config = Snapshot( + "backport/latest/dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json" + ) + snap_config.pull() + + # Create snapshot metadata for the new file + meta = SnapshotMeta(**snap_values.metadata.to_dict()) + meta.namespace = SNAPSHOT_NAMESPACE + meta.version = SNAPSHOT_VERSION + meta.short_name = "world_gdp" + meta.fill_from_backport_snapshot(snap_config.path) + meta.save() + + # Create a new snapshot. + snap = Snapshot(meta.uri) + + # Convert from long to wide format. + df = long_to_wide(pd.read_feather(snap_values.path)) + + # Copy file to the new snapshot. + snap.path.parent.mkdir(parents=True, exist_ok=True) + df.reset_index().to_feather(snap.path) + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main()