diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index 4c2452ab7a5..a1eba76cff1 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -21,7 +21,7 @@ from owid.repack import to_safe_types from . import tables, utils -from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta +from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta, VariableMeta from .processing_log import disable_processing_log from .properties import metadata_property @@ -155,7 +155,13 @@ def add( table_filename = join(self.path, table.metadata.checked_name + f".{format}") table.to(table_filename, repack=repack) - def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> tables.Table: + def read( + self, + name: str, + reset_index: bool = True, + safe_types: bool = True, + reset_metadata: Literal["keep", "keep_origins", "reset"] = "keep", + ) -> tables.Table: """Read dataset's table from disk. Alternative to ds[table_name], but with more options to optimize the reading. @@ -163,6 +169,10 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> large datasets with multi-indexes much faster. :param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow]. This can significantly increase memory usage. + :param reset_metadata: Controls variable metadata reset behavior. + - "keep": Leave metadata unchanged (default). + - "keep_origins": Reset variable metadata but retain the 'origins' attribute. + - "reset": Reset all variable metadata. """ stem = self.path / Path(name) @@ -173,6 +183,15 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> t.metadata.dataset = self.metadata if safe_types: t = cast(tables.Table, to_safe_types(t)) + if reset_metadata in ["keep_origins", "reset"]: # Handles "keep_origins" and "reset" + t.metadata = TableMeta() + for col in t.columns: + if reset_metadata == "keep_origins": # Preserve 'origins' attribute + origins = t[col].metadata.origins if hasattr(t[col].metadata, "origins") else None + t[col].metadata = VariableMeta() + t[col].metadata.origins = origins # Preserve 'origins' attribute + if reset_metadata == "reset": # Reset all metadata + t[col].metadata = VariableMeta() return t raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}")