Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 migrate fertility rate and GDP to ETL #2111

Merged
merged 1 commit into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dag/migrated.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,11 @@ steps:
- data://garden/iucn/2022-12-08/threatened_and_evaluated_species
data://garden/iucn/2022-12-08/threatened_and_evaluated_species:
- snapshot://iucn/2022-12-08/threatened_and_evaluated_species.feather
data://grapher/gapminder/2019-05-25/fertility_rate:
- data://garden/gapminder/2019-05-25/fertility_rate
data://garden/gapminder/2019-05-25/fertility_rate:
- snapshot://gapminder/2019-05-25/fertility_rate.feather
data://grapher/wb/2017-04-16/world_gdp:
- data://garden/wb/2017-04-16/world_gdp
data://garden/wb/2017-04-16/world_gdp:
- snapshot://wb/2017-04-16/world_gdp.feather
13 changes: 13 additions & 0 deletions etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
dataset:
title: Fertility Rate (Selected Gapminder, v12) (2017)
tables:
fertility_rate:
variables:
fertility_rate__select_gapminder__v12__2017:
title: Fertility rate (Select Gapminder, v12) (2017)
unit: children per woman
display:
name: Fertility rate
description: Total fertility rate represents the number of children that would be born to a woman if she were to live
to the end of her childbearing years and bear children in accordance with age-specific fertility rates of the specified
year.
23 changes: 23 additions & 0 deletions etl/steps/data/garden/gapminder/2019-05-25/fertility_rate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Load snapshot and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset for this step from its snapshot.

    The snapshot is read, the result is indexed by ``country`` and ``year``, and a
    dataset is created via ``create_dataset`` with the snapshot's metadata as the
    default metadata, then saved.

    Args:
        dest_dir: Destination passed on to ``create_dataset``.
    """
    # Read the snapshot and index the resulting table by country and year.
    snapshot = paths.load_snapshot()
    table = snapshot.read()
    table = table.set_index(["country", "year"])

    # Assemble the garden dataset (defaulting to the snapshot's metadata) and save it.
    garden_dataset = create_dataset(
        dest_dir,
        tables=[table],
        default_metadata=snapshot.metadata,
    )
    garden_dataset.save()
12 changes: 12 additions & 0 deletions etl/steps/data/garden/wb/2017-04-16/world_gdp.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
dataset:
title: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017)
tables:
world_gdp:
variables:
world_gdp_in_2011_int_dollar__owid_based_on_world_bank__and__maddison__2017:
title: World GDP in 2011 Int.$ (OWID based on World Bank & Maddison (2017))
unit: int.-$
short_unit: $
display:
numDecimalPlaces: 0
unit: international-$ in 2011 prices
23 changes: 23 additions & 0 deletions etl/steps/data/garden/wb/2017-04-16/world_gdp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Load snapshot and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset for this step from its snapshot.

    Loads the step's snapshot, indexes the table read from it by ``country`` and
    ``year``, and saves a dataset created with the snapshot's metadata as the
    default metadata.

    Args:
        dest_dir: Destination passed on to ``create_dataset``.
    """
    # Load the snapshot; its metadata is reused below as the dataset default.
    snapshot = paths.load_snapshot()
    snapshot_metadata = snapshot.metadata

    # Read the data and index it by country and year.
    indexed_table = snapshot.read().set_index(["country", "year"])

    # Create the garden dataset and save it.
    garden_dataset = create_dataset(dest_dir, tables=[indexed_table], default_metadata=snapshot_metadata)
    garden_dataset.save()
26 changes: 26 additions & 0 deletions etl/steps/data/grapher/gapminder/2019-05-25/fertility_rate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the grapher dataset for this step from its garden dataset.

    The ``fertility_rate`` table is taken from the loaded garden dataset and placed
    in a new dataset that uses the garden dataset's metadata as its default
    metadata; the new dataset is then saved.

    Args:
        dest_dir: Destination passed on to ``create_dataset``.
    """
    # Load the garden dataset and pick out the table to publish.
    garden = paths.load_dataset()
    table = garden["fertility_rate"]

    # Create the grapher dataset with the garden metadata as default, then save it.
    grapher = create_dataset(
        dest_dir,
        tables=[table],
        default_metadata=garden.metadata,
    )
    grapher.save()
26 changes: 26 additions & 0 deletions etl/steps/data/grapher/wb/2017-04-16/world_gdp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the grapher dataset for this step from its garden dataset.

    Takes the ``world_gdp`` table from the loaded garden dataset, creates a new
    dataset with the garden dataset's metadata as the default metadata, and saves
    it.

    Args:
        dest_dir: Destination passed on to ``create_dataset``.
    """
    # Load the garden dataset together with the metadata to carry over.
    garden = paths.load_dataset()
    garden_metadata = garden.metadata

    # Only the "world_gdp" table goes into the grapher dataset.
    tables = [garden["world_gdp"]]

    # Create and save the grapher dataset.
    grapher = create_dataset(dest_dir, tables=tables, default_metadata=garden_metadata)
    grapher.save()
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ meta:
source:
name: Our World in Data catalog backport
url: https://owid.cloud/admin/datasets/393
date_accessed: 2023-08-10 07:51:10.903473
date_accessed: 2023-12-18 08:51:10.041539
publication_date: latest
published_by: Our World in Data catalog backport
name: Grapher metadata for
dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017
description: ''
wdir: ../../../data/snapshots/backport/latest
outs:
- md5: fec4d98fd5ca13082abafc76b86a26aa
size: 2307
- md5: 576d8edddb47c338949704b1195f9ab8
size: 2320
path:
dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@ meta:
source:
name: Our World in Data catalog backport
url: https://owid.cloud/admin/datasets/393
date_accessed: 2023-08-10 07:51:43.657146
date_accessed: 2023-12-18 08:51:19.058056
publication_date: latest
published_by: Our World in Data catalog backport
name: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017)
description: ''
wdir: ../../../data/snapshots/backport/latest
outs:
- md5: b68bc2ee4faebabe881e34c670882ad6
size: 7170
- md5: f79db210e3a7fc9c5262d491edd32e33
size: 7826
path:
dataset_393_world_gdp_in_2011_int_dollar__owid_based_on_world_bank_plus_maddison__2017_values.feather
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ meta:
source:
name: Our World in Data catalog backport
url: https://owid.cloud/admin/datasets/4132
date_accessed: 2023-08-10 07:51:28.217495
date_accessed: 2023-12-18 08:46:37.900758
publication_date: latest
published_by: Our World in Data catalog backport
name: Grapher metadata for dataset_4132_fertility_rate__selected_gapminder__v12__2017
description: ''
wdir: ../../../data/snapshots/backport/latest
outs:
- md5: cfd06346be488f16a4434610171f380b
size: 3932
- md5: 9dfa632754bb109296e90e3f69514aff
size: 3945
path: dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ meta:
source:
name: Our World in Data catalog backport
url: https://owid.cloud/admin/datasets/4132
date_accessed: 2023-08-10 07:51:58.704001
date_accessed: 2023-12-18 08:46:47.411987
publication_date: latest
published_by: Our World in Data catalog backport
name: Fertility Rate (Selected Gapminder, v12) (2017)
description: ''
wdir: ../../../data/snapshots/backport/latest
outs:
- md5: 67300677b53c79834d38e4d057b06577
size: 121386
- md5: aaf0381967e890beadbdbc736bd783f7
size: 271458
path: dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather
64 changes: 64 additions & 0 deletions snapshots/gapminder/2019-05-25/fertility_rate.feather.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
meta:
source:
name: Fertility Rate (Selected Gapminder, v12) (2017)
description: >-
Data is that of version 12 of Gapminder, the latest version as of 2019. This
is the full fertility rate dataset published
by Gapminder.


Gapminder's sources and methodology are well-documented in its dataset at: https://www.gapminder.org/data/


It notes its data sources during three key periods of time:


— 1800 to 1950 (and in some cases also years after 1950): Gapminder v6 which
were compiled and documented by Mattias
Lindgren.


— 1950 to 2014: In most cases we use the latest UN estimates from World Population
Prospects 2017 published in the file
with Annually interpolated demographic indicators, called WPP2017_INT_F01_ANNUAL_DEMOGRAPHIC_INDICATORS.xlsx
, accessed
on September 2, 2017.


— 2015 – 2099: We use the UN forecast of future fertility rate in all countries,
called median fertility variant.


Version 12 of the dataset extends back to the year 1800. Version 6 of Gapminder's
fertility series includes data for
a few countries further than 1800. We have included more historic data from
Version 6 for Finland, the United Kingdom
and Sweden. All data from 1800 onwards is from Version 12; data from pre-1800
is from Version 6.


There are significant uncertainties in data for many countries pre-1950. To
develop full series back to 1800 for all
countries, Gapminder combines published estimates within the academic literature
and national statistics, with their
own guesstimates and extrapolations for countries without published estimates.
This series presents the selective Gapminder
dataset: we have removed data points which were estimated by Gapminder with
high uncertainty and instead only include
those from published sources or the United Nations dataset.


We also publish the full dataset from Gapminder for users looking for a complete
series. However, we should highlight
that some of these estimates have a high degree of uncertainty. This dataset
can be accessed here: https://ourworldindata.org/grapher/fertility-rate-complete-gapminder
url: https://drive.google.com/drive/folders/1i30LyIcvbLa800Q1ZFrPGlNHIw9ymYGm
date_accessed: 2019-05-25
published_by: Gapminder
name: Fertility Rate (Selected Gapminder, v12) (2017)
description: ''
wdir: ../../../data/snapshots/gapminder/2019-05-25
outs:
- md5: 07302fc652ae3018457e339e7fde539a
size: 135234
path: fertility_rate.feather
50 changes: 50 additions & 0 deletions snapshots/gapminder/2019-05-25/fertility_rate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from pathlib import Path

import click
import pandas as pd

from etl.backport_helpers import long_to_wide
from etl.snapshot import Snapshot, SnapshotMeta

# Namespace and version are derived from this script's location on disk:
# the parent directory's name is the version, and the grandparent directory's
# name is the namespace.
SNAPSHOT_NAMESPACE = Path(__file__).parent.parent.name
SNAPSHOT_VERSION = Path(__file__).parent.name


# CLI entry point: turns the backported fertility-rate snapshot into a regular
# snapshot under this script's namespace and version.
# (Kept as comments rather than a docstring so the command's --help output is unchanged.)
@click.command()
@click.option(
    "--upload/--skip-upload",
    default=True,
    type=bool,
    help="Upload dataset to Snapshot",
)
def main(upload: bool) -> None:
    # Pull the backported values file, then the backported config file.
    backport_values = Snapshot("backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_values.feather")
    backport_values.pull()
    backport_config = Snapshot("backport/latest/dataset_4132_fertility_rate__selected_gapminder__v12__2017_config.json")
    backport_config.pull()

    # Start from the values snapshot's metadata and re-target it at the new snapshot.
    inherited_fields = backport_values.metadata.to_dict()
    meta = SnapshotMeta(**inherited_fields)
    meta.namespace = SNAPSHOT_NAMESPACE
    meta.version = SNAPSHOT_VERSION
    meta.short_name = "fertility_rate"
    meta.fill_from_backport_snapshot(backport_config.path)
    meta.save()

    # The new snapshot is addressed by the URI of the metadata just saved.
    new_snapshot = Snapshot(meta.uri)

    # Reshape the backported values from long to wide format.
    long_df = pd.read_feather(backport_values.path)
    wide_df = long_to_wide(long_df)

    # Make sure the target directory exists, then write the data to the new snapshot's path.
    target_dir = new_snapshot.path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    flat_df = wide_df.reset_index()
    flat_df.to_feather(new_snapshot.path)

    # Add file to DVC and upload to S3.
    new_snapshot.dvc_add(upload=upload)


if __name__ == "__main__":
    main()
18 changes: 18 additions & 0 deletions snapshots/wb/2017-04-16/world_gdp.feather.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
meta:
source:
name: Our World In Data based on World Bank & Maddison (2017)
description: 'The data presented here from 1990 onwards is from the World Bank.
It is total global GDP in 2011 international-$ as published here: http://data.worldbank.org/indicator/NY.GDP.MKTP.PP.KD
(accessed on April 16, 2017). Data earlier than 1990 is backwards extended from
the World Bank observation for 1990 based on the growth rates implied by Maddison
data. The Maddison data is published here: http://www.ggdc.net/maddison/oriindex.htm'
url: Please see additional information for links
date_accessed: 2017-04-16
published_by: New Maddison Project Database and World Bank
name: World GDP in 2011 int $ – OWID based on World Bank + Maddison (2017)
description: ''
wdir: ../../../data/snapshots/wb/2017-04-16
outs:
- md5: 8987b0657951b975b5bfc9d71306078d
size: 4050
path: world_gdp.feather
Loading