From 15dd9266c3b5a8f7169038d79aa42846eb1aff8f Mon Sep 17 00:00:00 2001 From: Marigold Date: Tue, 17 Dec 2024 09:46:32 +0100 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20Add=20reset=5Fmetadata=20paramt?= =?UTF-8?q?er=20to=20.load?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 862081b19e1c50a96736a2c2a2cd60c8f89733d7 Mon Sep 17 00:00:00 2001 From: Marigold Date: Tue, 17 Dec 2024 09:47:04 +0100 Subject: [PATCH 2/5] wip --- .../2024-12-16/malnutrition.meta.yml | 34 +++++++++++ .../malnutrition/2024-12-16/malnutrition.py | 56 +++++++++++++++++++ lib/catalog/owid/catalog/datasets.py | 16 +++++- 3 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml create mode 100644 etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py diff --git a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml new file mode 100644 index 00000000000..4688dee7907 --- /dev/null +++ b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml @@ -0,0 +1,34 @@ +dataset: + title: Number of stunted, wasted and underweight children + description: | + Data on the prevalence of malnourishment is taken from the World Bank's World Development Indicators. It is measured in three metrics: i) Prevalence of stunting, height for age (% of children under 5); ii) Prevalence of wasting, weight for height (% of children under 5); iii) Prevalence of underweight, weight for age (% of children under 5) +tables: + malnutrition: + variables: + number_of_stunted_children: + title: Number of stunted children + description_short: Prevalence of stunting is the percentage of children under age 5 whose height for age is more than two standard deviations below the median for the international reference population ages 0-59 months. For children up to two years old height is measured by recumbent length. For older children height is measured by stature while standing. The data are based on the WHO's new child growth standards released in 2006. + unit: stunted children + short_unit: "" + display: + numDecimalPlaces: 0 + # presentation: + # title_public: Number of stunted children + number_of_underweight_children: + title: Number of underweight children + description_short: Prevalence of underweight children is the percentage of children under age 5 whose weight for age is more than two standard deviations below the median for the international reference population ages 0-59 months. The data are based on the WHO's child growth standards released in 2006. + unit: underweight children + short_unit: "" + display: + numDecimalPlaces: 0 + # presentation: + # title_public: Number of underweight children + number_of_wasted_children: + title: Number of wasted children + description_short: Prevalence of wasting is the proportion of children under age 5 whose weight for height is more than two standard deviations below the median for the international reference population ages 0-59. 
+ unit: wasted children + short_unit: "" + display: + numDecimalPlaces: 0 + # presentation: + # title_public: Number of wasted children diff --git a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py new file mode 100644 index 00000000000..f684b5723ed --- /dev/null +++ b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py @@ -0,0 +1,56 @@ +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +COLUMNS = { + "sh_sta_stnt_me_zs": "number_of_stunted_children", + "sh_sta_maln_zs": "number_of_underweight_children", + "sh_sta_wast_zs": "number_of_wasted_children", +} + + +def run(dest_dir: str) -> None: + paths.log.info("start") + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("wdi") + ds_population = paths.load_dataset("un_wpp") + + # Read table from meadow dataset. + tb = ds_meadow["wdi"].reset_index() + tb = tb[["country", "year"] + list(COLUMNS.keys())] + # Get the under-five population data. + tb_population = ds_population.read("population", reset_metadata=True) + tb_under_five = tb_population[ + (tb_population["age"] == "0-4") & (tb_population["sex"] == "all") & (tb_population["variant"] == "estimates") + ].drop(columns=["population_change", "population_density"]) + # Merge the two datasets. + tb = pr.merge(tb, tb_under_five, on=["country", "year"]) + + # Calculate the number of malnourished children. + for col in COLUMNS.keys(): + tb[COLUMNS[col]] = ((tb[col] / 100) * tb["population"]).round(0).astype("Int64") + + # Drop the columns that are no longer needed. + tb = tb.drop(columns=list(COLUMNS.keys()) + ["population", "sex", "age", "variant"]) + tb = tb.dropna(subset=[COLUMNS[col] for col in COLUMNS.keys()], how="all") + # Format + tb = tb.format(["country", "year"], short_name="malnutrition") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("end") diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index 4c2452ab7a5..d393f19d4bd 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -21,7 +21,7 @@ from owid.repack import to_safe_types from . import tables, utils -from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta +from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta, VariableMeta from .processing_log import disable_processing_log from .properties import metadata_property @@ -155,7 +155,13 @@ def add( table_filename = join(self.path, table.metadata.checked_name + f".{format}") table.to(table_filename, repack=repack) - def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> tables.Table: + def read( + self, + name: str, + reset_index: bool = True, + safe_types: bool = True, + reset_metadata: bool = False, + ) -> tables.Table: """Read dataset's table from disk. Alternative to ds[table_name], but with more options to optimize the reading. @@ -163,6 +169,8 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> large datasets with multi-indexes much faster. 
:param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow]. This can significantly increase memory usage. + :param reset_metadata: If true, reset table and columns metadata. This is useful for loading + datasets like population which could pollute the metadata with irrelevant information. """ stem = self.path / Path(name) @@ -173,6 +181,10 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> t.metadata.dataset = self.metadata if safe_types: t = cast(tables.Table, to_safe_types(t)) + if reset_metadata: + t.metadata = TableMeta() + for col in t.columns: + t[col].metadata = VariableMeta() return t raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}") From 1c205f5a5c1eb18f9c62f01ac433631ba2e72155 Mon Sep 17 00:00:00 2001 From: Marigold Date: Tue, 17 Dec 2024 09:47:44 +0100 Subject: [PATCH 3/5] wip --- .../2024-12-16/malnutrition.meta.yml | 34 ----------- .../malnutrition/2024-12-16/malnutrition.py | 56 ------------------- 2 files changed, 90 deletions(-) delete mode 100644 etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml delete mode 100644 etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py diff --git a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml deleted file mode 100644 index 4688dee7907..00000000000 --- a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.meta.yml +++ /dev/null @@ -1,34 +0,0 @@ -dataset: - title: Number of stunted, wasted and underweight children - description: | - Data on the prevalence of malnourishment is taken from the World Bank's World Development Indicators. It is measured in three metrics: i) Prevalence of stunting, height for age (% of children under 5); ii) Prevalence of wasting, weight for height (% of children under 5); iii) Prevalence of underweight, weight for age (% of children under 5) -tables: - malnutrition: - variables: - number_of_stunted_children: - title: Number of stunted children - description_short: Prevalence of stunting is the percentage of children under age 5 whose height for age is more than two standard deviations below the median for the international reference population ages 0-59 months. For children up to two years old height is measured by recumbent length. For older children height is measured by stature while standing. The data are based on the WHO's new child growth standards released in 2006. - unit: stunted children - short_unit: "" - display: - numDecimalPlaces: 0 - # presentation: - # title_public: Number of stunted children - number_of_underweight_children: - title: Number of underweight children - description_short: Prevalence of underweight children is the percentage of children under age 5 whose weight for age is more than two standard deviations below the median for the international reference population ages 0-59 months. The data are based on the WHO's child growth standards released in 2006. - unit: underweight children - short_unit: "" - display: - numDecimalPlaces: 0 - # presentation: - # title_public: Number of underweight children - number_of_wasted_children: - title: Number of wasted children - description_short: Prevalence of wasting is the proportion of children under age 5 whose weight for height is more than two standard deviations below the median for the international reference population ages 0-59. 
- unit: wasted children - short_unit: "" - display: - numDecimalPlaces: 0 - # presentation: - # title_public: Number of wasted children diff --git a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py b/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py deleted file mode 100644 index f684b5723ed..00000000000 --- a/etl/steps/data/garden/malnutrition/2024-12-16/malnutrition.py +++ /dev/null @@ -1,56 +0,0 @@ -from owid.catalog import processing as pr - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - -COLUMNS = { - "sh_sta_stnt_me_zs": "number_of_stunted_children", - "sh_sta_maln_zs": "number_of_underweight_children", - "sh_sta_wast_zs": "number_of_wasted_children", -} - - -def run(dest_dir: str) -> None: - paths.log.info("start") - # - # Load inputs. - # - # Load meadow dataset. - ds_meadow = paths.load_dataset("wdi") - ds_population = paths.load_dataset("un_wpp") - - # Read table from meadow dataset. - tb = ds_meadow["wdi"].reset_index() - tb = tb[["country", "year"] + list(COLUMNS.keys())] - # Get the under-five population data. - tb_population = ds_population.read("population", reset_metadata=True) - tb_under_five = tb_population[ - (tb_population["age"] == "0-4") & (tb_population["sex"] == "all") & (tb_population["variant"] == "estimates") - ].drop(columns=["population_change", "population_density"]) - # Merge the two datasets. - tb = pr.merge(tb, tb_under_five, on=["country", "year"]) - - # Calculate the number of malnourished children. - for col in COLUMNS.keys(): - tb[COLUMNS[col]] = ((tb[col] / 100) * tb["population"]).round(0).astype("Int64") - - # Drop the columns that are no longer needed. - tb = tb.drop(columns=list(COLUMNS.keys()) + ["population", "sex", "age", "variant"]) - tb = tb.dropna(subset=[COLUMNS[col] for col in COLUMNS.keys()], how="all") - # Format - tb = tb.format(["country", "year"], short_name="malnutrition") - - # - # Save outputs. - # - # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata - ) - - # Save changes in the new garden dataset. - ds_garden.save() - - paths.log.info("end") From da6a30ba53b00f5c98070ffef4c853ba3d8ef944 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 17 Dec 2024 09:53:05 +0000 Subject: [PATCH 4/5] adding option to keep origin --- lib/catalog/owid/catalog/datasets.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index d393f19d4bd..e4567398edd 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -160,7 +160,7 @@ def read( name: str, reset_index: bool = True, safe_types: bool = True, - reset_metadata: bool = False, + reset_metadata: Optional[Union[bool, str]] = False, # Allow "keep_origins" as a valid option ) -> tables.Table: """Read dataset's table from disk. Alternative to ds[table_name], but with more options to optimize the reading. @@ -169,8 +169,10 @@ def read( large datasets with multi-indexes much faster. :param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow]. This can significantly increase memory usage. - :param reset_metadata: If true, reset table and columns metadata. 
This is useful for loading - datasets like population which could pollute the metadata with irrelevant information. + :param reset_metadata: + - If True, reset table and columns metadata. + - If "keep_origins", reset metadata but retain the 'origins' attribute for columns. + - If False, leave metadata unchanged. """ stem = self.path / Path(name) @@ -181,10 +183,15 @@ def read( t.metadata.dataset = self.metadata if safe_types: t = cast(tables.Table, to_safe_types(t)) - if reset_metadata: + if reset_metadata: # Handles True and "keep_origins" t.metadata = TableMeta() for col in t.columns: - t[col].metadata = VariableMeta() + if reset_metadata == "keep_origins": + origins = t[col].metadata.origins if hasattr(t[col].metadata, "origins") else None + t[col].metadata = VariableMeta() + t[col].metadata.origins = origins # Preserve 'origins' attribute + else: # True case: reset all metadata + t[col].metadata = VariableMeta() return t raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}") From 6887bed564ac78bb5b8ee4b65fe77373f23ce659 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Thu, 19 Dec 2024 10:54:49 +0000 Subject: [PATCH 5/5] Mojmirs suggestions --- lib/catalog/owid/catalog/datasets.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index e4567398edd..a1eba76cff1 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -160,7 +160,7 @@ def read( name: str, reset_index: bool = True, safe_types: bool = True, - reset_metadata: Optional[Union[bool, str]] = False, # Allow "keep_origins" as a valid option + reset_metadata: Literal["keep", "keep_origins", "reset"] = "keep", ) -> tables.Table: """Read dataset's table from disk. Alternative to ds[table_name], but with more options to optimize the reading. @@ -169,10 +169,10 @@ def read( large datasets with multi-indexes much faster. :param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow]. This can significantly increase memory usage. - :param reset_metadata: - - If True, reset table and columns metadata. - - If "keep_origins", reset metadata but retain the 'origins' attribute for columns. - - If False, leave metadata unchanged. + :param reset_metadata: Controls variable metadata reset behavior. + - "keep": Leave metadata unchanged (default). + - "keep_origins": Reset variable metadata but retain the 'origins' attribute. + - "reset": Reset all variable metadata. """ stem = self.path / Path(name) @@ -183,14 +183,14 @@ def read( t.metadata.dataset = self.metadata if safe_types: t = cast(tables.Table, to_safe_types(t)) - if reset_metadata: # Handles True and "keep_origins" + if reset_metadata in ["keep_origins", "reset"]: # Handles "keep_origins" and "reset" t.metadata = TableMeta() for col in t.columns: - if reset_metadata == "keep_origins": + if reset_metadata == "keep_origins": # Preserve 'origins' attribute origins = t[col].metadata.origins if hasattr(t[col].metadata, "origins") else None t[col].metadata = VariableMeta() t[col].metadata.origins = origins # Preserve 'origins' attribute - else: # True case: reset all metadata + if reset_metadata == "reset": # Reset all metadata t[col].metadata = VariableMeta() return t
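
Usage sketch (reviewer note, not part of the patches above): after [PATCH 5/5], reset_metadata takes the string values "keep" (default), "keep_origins" or "reset" instead of a boolean, so the reset_metadata=True call in the removed malnutrition step would become reset_metadata="reset" (or "keep_origins" to retain origins). The snippet below is a minimal sketch of how a garden step might call the new Dataset.read API; the dataset and table names ("un_wpp", "population") and the PathFinder/create_dataset helpers are taken from the example step added in [PATCH 2/5] and removed in [PATCH 3/5], and are assumed to be available in the step's context.

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for the current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    # Load the population dataset declared as a dependency of this step.
    ds_population = paths.load_dataset("un_wpp")

    # Drop table and variable metadata but keep each column's origins, so the
    # helper indicator does not pollute the metadata of derived variables.
    tb_population = ds_population.read("population", reset_metadata="keep_origins")

    # Alternatively, wipe the variable metadata completely:
    # tb_population = ds_population.read("population", reset_metadata="reset")

    # ... derive indicators from tb_population and format the table here ...

    # Save outputs as usual.
    ds_garden = create_dataset(dest_dir, tables=[tb_population])
    ds_garden.save()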