Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add reset metadata origin option #3731

Merged
5 commits merged on Dec 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions lib/catalog/owid/catalog/datasets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,7 @@
from owid.repack import to_safe_types

from . import tables, utils
from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta
from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta, VariableMeta
from .processing_log import disable_processing_log
from .properties import metadata_property

Expand Down Expand Up @@ -155,14 +155,24 @@ def add(
table_filename = join(self.path, table.metadata.checked_name + f".{format}")
table.to(table_filename, repack=repack)

def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> tables.Table:
def read(
self,
name: str,
reset_index: bool = True,
safe_types: bool = True,
reset_metadata: Literal["keep", "keep_origins", "reset"] = "keep",
) -> tables.Table:
"""Read dataset's table from disk. Alternative to ds[table_name], but
with more options to optimize the reading.

:param reset_index: If true, don't set primary keys of the table. This can make loading
large datasets with multi-indexes much faster.
:param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical
columns to string[pyarrow]. This can significantly increase memory usage.
:param reset_metadata: Controls variable metadata reset behavior.
- "keep": Leave metadata unchanged (default).
- "keep_origins": Reset variable metadata but retain the 'origins' attribute.
- "reset": Reset all variable metadata.
"""
stem = self.path / Path(name)

Expand All @@ -173,6 +183,15 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) ->
t.metadata.dataset = self.metadata
if safe_types:
t = cast(tables.Table, to_safe_types(t))
if reset_metadata in ["keep_origins", "reset"]: # Handles "keep_origins" and "reset"
t.metadata = TableMeta()
for col in t.columns:
if reset_metadata == "keep_origins": # Preserve 'origins' attribute
origins = t[col].metadata.origins if hasattr(t[col].metadata, "origins") else None
t[col].metadata = VariableMeta()
t[col].metadata.origins = origins # Preserve 'origins' attribute
if reset_metadata == "reset": # Reset all metadata
t[col].metadata = VariableMeta()
return t

raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}")
Expand Down
Loading