From da1595fc653c1d2abeb2cc19e18535c9ab14184f Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:46:26 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=8A=20Marriages=20and=20Divorces:=20OE?= =?UTF-8?q?CD=20Family=20Database?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dag/families.yml | 10 ++ dag/main.yml | 1 + .../2024-12-30/family_database.countries.json | 52 ++++++++ .../oecd/2024-12-30/family_database.meta.yml | 118 ++++++++++++++++++ .../garden/oecd/2024-12-30/family_database.py | 57 +++++++++ .../oecd/2024-12-30/family_database.py | 28 +++++ .../meadow/oecd/2024-12-30/family_database.py | 36 ++++++ .../oecd/2024-12-30/family_database.csv.dvc | 27 ++++ snapshots/oecd/2024-12-30/family_database.py | 22 ++++ 9 files changed, 351 insertions(+) create mode 100644 dag/families.yml create mode 100644 etl/steps/data/garden/oecd/2024-12-30/family_database.countries.json create mode 100644 etl/steps/data/garden/oecd/2024-12-30/family_database.meta.yml create mode 100644 etl/steps/data/garden/oecd/2024-12-30/family_database.py create mode 100644 etl/steps/data/grapher/oecd/2024-12-30/family_database.py create mode 100644 etl/steps/data/meadow/oecd/2024-12-30/family_database.py create mode 100644 snapshots/oecd/2024-12-30/family_database.csv.dvc create mode 100644 snapshots/oecd/2024-12-30/family_database.py diff --git a/dag/families.yml b/dag/families.yml new file mode 100644 index 000000000000..c117e1ce7ce7 --- /dev/null +++ b/dag/families.yml @@ -0,0 +1,10 @@ +steps: + # + # OECD Family Database + # + data://meadow/oecd/2024-12-30/family_database: + - snapshot://oecd/2024-12-30/family_database.csv + data://garden/oecd/2024-12-30/family_database: + - data://meadow/oecd/2024-12-30/family_database + data://grapher/oecd/2024-12-30/family_database: + - data://garden/oecd/2024-12-30/family_database diff --git a/dag/main.yml b/dag/main.yml index 
d8be36af51dc..61b2f4b5e617 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -784,3 +784,4 @@ include: - dag/tourism.yml - dag/migration.yml - dag/equality.yml + - dag/families.yml diff --git a/etl/steps/data/garden/oecd/2024-12-30/family_database.countries.json b/etl/steps/data/garden/oecd/2024-12-30/family_database.countries.json new file mode 100644 index 000000000000..e20dea56ba40 --- /dev/null +++ b/etl/steps/data/garden/oecd/2024-12-30/family_database.countries.json @@ -0,0 +1,52 @@ +{ + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Belgium": "Belgium", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Canada": "Canada", + "Chile": "Chile", + "Colombia": "Colombia", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Estonia": "Estonia", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Greece": "Greece", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Japan": "Japan", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Malta": "Malta", + "Mexico": "Mexico", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Poland": "Poland", + "Portugal": "Portugal", + "Romania": "Romania", + "Russia": "Russia", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "United Kingdom": "United Kingdom", + "United States": "United States", + "China (People's Republic of)": "China", + "Korea": "South Korea", + "T\u00fcrkiye": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/oecd/2024-12-30/family_database.meta.yml b/etl/steps/data/garden/oecd/2024-12-30/family_database.meta.yml new file mode 100644 index 
000000000000..5bf4ca4e801a --- /dev/null +++ b/etl/steps/data/garden/oecd/2024-12-30/family_database.meta.yml @@ -0,0 +1,118 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Marriages & Divorces + display: + numDecimalPlaces: 1 + + description_producer_empl: &description_producer_empl + Employment rates for women (15–64-year-olds) with at least one child aged 0-14, with ‘children’ defined as any children aged 0-14 inclusive who live in the same household as the woman and who are reported as the child of the woman (including both biological children and step or adoptive children). Women with children who do not live in the same household are generally not included, nor are women with children aged 15 and over regardless of whether or not the child lives in the same household and/or is dependent on the mother. Exceptions to this definition are Canada, Korea and the United States, where children aged 0-17 are included. For Australia and Japan, data cover all women aged 15 and over, and for Korea married women aged 15-54. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + family_database: + variables: + child_poverty_rate: + title: Child poverty rate + unit: "%" + short_unit: "%" + description_short: The percentage of children under 18 living in households with incomes below the poverty line. + description_from_producer: The child relative income poverty rate, defined as the percentage of children (0-17 year-olds) with an equivalised household disposable income (i.e. an income after taxes and transfers adjusted for household size) below the poverty threshold. The poverty threshold is set here at 50% of the median disposable income in each country. 
+ + crude_divorce_rate__divorces_per_1000_people: + title: Crude divorce rate (divorces per 1,000 people) + unit: "per 1,000 people" + short_unit: "per 1,000 people" + description_short: Number of divorces during a given year per 1,000 people. + description_from_producer: The crude divorce rate is defined as the number of divorces during a given year per 1,000 people. + + crude_marriage_rate__marriages_per_1000_people: + title: Crude marriage rate (marriages per 1,000 people) + unit: "per 1,000 people" + short_unit: "per 1,000 people" + description_short: Number of marriages during a given year per 1,000 people. + description_from_producer: The crude marriage rate is defined as the number of marriages during a given year per 1,000 people. + + employment_rates__pct__for_all_mothers__15_64_year_olds__with_at_least_one_child_under_15: + title: Employment rates (%), for all mothers (15-64 year-olds) with at least one child under 15 + unit: "%" + short_unit: "%" + description_short: The percentage of mothers aged 15-64 with at least one child under 15 who are employed. + + employment_rates__pct__for_partnered_mothers__15_64_year_olds__with_at_least_one_child_under_15: + title: Employment rates (%), for partnered mothers (15-64 year-olds) with at least one child under 15 + unit: "%" + short_unit: "%" + description_short: The percentage of partnered mothers aged 15-64 with at least one child under 15 who are employed. + description_from_producer: *description_producer_empl + + employment_rates__pct__for_sole_parent_mothers__15_64_year_olds__with_at_least_one_child_under_15: + title: Employment rates (%), for sole parent mothers (15-64 year-olds) with at least one child under 15 + unit: "%" + short_unit: "%" + description_short: The percentage of sole parent mothers aged 15-64 with at least one child under 15 who are employed. 
+ description_from_producer: *description_producer_empl + + length_of_paid_maternity__parental_and_home_care_leave_available_to_mothers_in_weeks: + title: Length of paid maternity, parental and home care leave available to mothers + unit: "weeks" + short_unit: "" + description_short: The number of weeks of paid maternity, parental and home care leave available to mothers. + + + length_of_paid_paternity_and_parental_leave_reserved_for_fathers_in_weeks: + title: Length of paid paternity and parental leave reserved for fathers + unit: "weeks" + short_unit: "" + description_short: The number of weeks of paid paternity and parental leave reserved for fathers. + + proportion__pct__of_children__aged_0_14__that_live_in_households_where_all_adults_are_in_employment__working: + title: Proportion of children aged 0-14 that live in households where all adults are in employment + unit: "%" + short_unit: "%" + description_short: The percentage of children aged 0-14 that live in households where all adults are in employment. + + proportion__pct__of_children__aged_0_17__living_in_other_types_of_household: + title: Proportion of children aged 0-17 living in other types of household + unit: "%" + short_unit: "%" + description_short: The percentage of children aged 0-17 living in other types of household. + + proportion__pct__of_children__aged_0_17__living_with_a_single_parent: + title: Proportion of children aged 0-17 living with a single parent + unit: "%" + short_unit: "%" + description_short: The percentage of children aged 0-17 living with a single parent. + + proportion__pct__of_children__aged_0_17__living_with_two_parents: + title: Proportion of children aged 0-17 living with two parents + unit: "%" + short_unit: "%" + description_short: The percentage of children aged 0-17 living with two parents. 
def run(dest_dir: str) -> None:
    """Build the garden ``family_database`` dataset from its meadow counterpart.

    The meadow table is read in long form, country names are harmonized with the
    step's country mapping file, and the table is pivoted so that each indicator
    becomes its own column. Only the indicators listed below are kept. The result
    is indexed by country and year and written to ``dest_dir``.
    """
    # Identifier columns shared by the pivot, the column selection and the final index.
    key_columns = ("country", "year")

    # Indicator labels (as they appear in the "indicator" column) retained in garden.
    selected_indicators = [
        "Child poverty rate",
        "Crude divorce rate (divorces per 1000 people)",
        "Crude marriage rate (marriages per 1000 people)",
        "Employment rates (%) for all mothers (15-64 year olds) with at least one child under 15",
        "Employment rates (%) for partnered mothers (15-64 year olds) with at least one child under 15",
        "Employment rates (%) for sole-parent mothers (15-64 year olds) with at least one child under 15",
        "Length of paid maternity, parental and home care leave available to mothers in weeks",
        "Length of paid paternity and parental leave reserved for fathers in weeks",
        "Proportion (%) of children (aged 0-14) that live in households where all adults are in employment (working)",
        "Proportion (%) of children (aged 0-17) living in 'other' types of household",
        "Proportion (%) of children (aged 0-17) living with a single parent",
        "Proportion (%) of children (aged 0-17) living with two parents",
        "Proportion (%) of children aged 0-2 enrolled in formal childcare and pre-school",
        "Share of births outside of marriage (% of all births)",
        "Total public social expenditure on families as a % of GDP",
    ]

    #
    # Load inputs.
    #
    meadow_dataset = paths.load_dataset("family_database")
    table = meadow_dataset.read("family_database")

    #
    # Process data.
    #
    table = geo.harmonize_countries(df=table, countries_file=paths.country_mapping_path)

    # Long -> wide: one row per (country, year), one column per indicator.
    wide = table.pivot(index=list(key_columns), columns="indicator", values="value")
    wide = wide.reset_index()

    # Drop every indicator that is not explicitly selected, then set the index.
    wide = wide[[*key_columns, *selected_indicators]]
    wide = wide.format(list(key_columns))

    #
    # Save outputs.
    #
    # The garden dataset inherits the meadow dataset's metadata by default.
    garden_dataset = create_dataset(
        dest_dir,
        tables=[wide],
        check_variables_metadata=True,
        default_metadata=meadow_dataset.metadata,
    )
    garden_dataset.save()
def run(dest_dir: str) -> None:
    """Create the grapher ``family_database`` dataset from the garden dataset.

    The garden table is passed through unchanged; only the dataset wrapper is
    rebuilt in ``dest_dir``, using the garden dataset's metadata as the default.
    """
    #
    # Load inputs.
    #
    garden_dataset = paths.load_dataset("family_database")

    # reset_index=False: read the table with the index it was stored with.
    table = garden_dataset.read("family_database", reset_index=False)

    #
    # Save outputs.
    #
    grapher_dataset = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=garden_dataset.metadata,
    )
    grapher_dataset.save()
+ tb = tb.format(["country", "year", "indicator"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/oecd/2024-12-30/family_database.csv.dvc b/snapshots/oecd/2024-12-30/family_database.csv.dvc new file mode 100644 index 000000000000..713aef287495 --- /dev/null +++ b/snapshots/oecd/2024-12-30/family_database.csv.dvc @@ -0,0 +1,27 @@ +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: OECD Family Database + description: |- + The OECD Family Database provides cross-national indicators on family outcomes and family policies across the OECD countries, its enhanced engagement partners and EU member states. It includes 70 indicators under four main dimensions: (i) structure of families, (ii) labour market position of families, (iii) public policies for families and children and (iv) child outcomes. + date_published: 2024-03-27 + + # Citation + producer: OECD + citation_full: |- + OECD (2024). OECD Family Database. 
@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    """Create the OECD Family Database snapshot for this script's version folder.

    Args:
        upload: Forwarded to ``Snapshot.create_snapshot``; set by the
            ``--upload/--skip-upload`` flag (defaults to uploading).
    """
    # The snapshot is addressed as <namespace>/<version>/<file name>.
    snapshot_uri = f"oecd/{SNAPSHOT_VERSION}/family_database.csv"

    # Per the original step comment: downloads the data from the source, adds the file to DVC
    # and uploads it to S3.
    Snapshot(snapshot_uri).create_snapshot(upload=upload)