From 862b76779db499a808d28cf8b09375bba2eadf2b Mon Sep 17 00:00:00 2001
From: Maxwell Levin <24307537+maxwelllevin@users.noreply.github.com>
Date: Tue, 26 Sep 2023 09:54:28 -0700
Subject: [PATCH] Add faceted dim csv

---
 src/ncconvert/cli.py   |  3 +-
 src/ncconvert/csv.py   | 25 ++++++++++-
 src/ncconvert/utils.py | 98 ++++++++++++++++++++++++++++++++++++++++++
 test/conftest.py       |  2 +-
 test/test_csv.py       | 28 ++++++++++++
 5 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/src/ncconvert/cli.py b/src/ncconvert/cli.py
index fc0c881..5ba847f 100644
--- a/src/ncconvert/cli.py
+++ b/src/ncconvert/cli.py
@@ -21,7 +21,7 @@
     )
     sys.exit(1)
 
-from .csv import to_csv, to_csv_collection
+from .csv import to_csv, to_csv_collection, to_faceted_dim_csv
 from .parquet import to_parquet, to_parquet_collection
 
 
@@ -39,6 +39,7 @@ def __call__(
 # Register to_* methods as options. For now this is a manual process
 AVAILABLE_METHODS: Dict[str, Converter] = {
     to_csv.__name__: to_csv,
+    to_faceted_dim_csv.__name__: to_faceted_dim_csv,
     to_csv_collection.__name__: to_csv_collection,
     to_parquet.__name__: to_parquet,
     to_parquet_collection.__name__: to_parquet_collection,
diff --git a/src/ncconvert/csv.py b/src/ncconvert/csv.py
index 78855f7..9a2b1f4 100644
--- a/src/ncconvert/csv.py
+++ b/src/ncconvert/csv.py
@@ -5,7 +5,12 @@
 
 import xarray as xr
 
-from .utils import _dump_metadata, _to_dataframe, _to_dataframe_collection
+from .utils import (
+    _dump_metadata,
+    _to_dataframe,
+    _to_dataframe_collection,
+    _to_faceted_dim_dataframe,
+)
 
 
 def to_csv(
@@ -89,3 +94,21 @@ def to_csv_collection(
     metadata_path = _dump_metadata(dataset, filepath) if metadata else None
 
     return tuple(filepaths), metadata_path
+
+
+def to_faceted_dim_csv(
+    dataset: xr.Dataset,
+    filepath: str | Path,
+    metadata: bool = True,
+    **kwargs: Any,
+) -> tuple[Path, Path | None]:
+    to_csv_kwargs = kwargs.get("to_csv_kwargs", {})
+
+    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+
+    filepath, df = _to_faceted_dim_dataframe(dataset, filepath, ".csv")
+    df.to_csv(filepath, **to_csv_kwargs)  # type: ignore
+
+    metadata_path = _dump_metadata(dataset, filepath) if metadata else None
+
+    return Path(filepath), metadata_path
diff --git a/src/ncconvert/utils.py b/src/ncconvert/utils.py
index 01d1da5..beb99bf 100644
--- a/src/ncconvert/utils.py
+++ b/src/ncconvert/utils.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
 import json
+import logging
 from collections import defaultdict
 from pathlib import Path
 
 import pandas as pd
 import xarray as xr
 
+logger = logging.getLogger(__name__)
+
 
 def _dump_metadata(dataset: xr.Dataset, filepath: str | Path) -> Path:
     metadata = dataset.to_dict(data=False, encoding=True)
@@ -56,3 +59,98 @@ def _to_dataframe_collection(
         outputs.append((dim_group_path, df))
 
     return tuple(outputs)
+
+
+def _to_faceted_dim_dataframe(
+    dataset: xr.Dataset, filepath: str | Path, extension: str
+) -> tuple[Path, pd.DataFrame]:
+    if not extension.startswith("."):
+        extension = "." + extension
+
+    # Get variable dimension groupings
+    dimension_groups: dict[tuple[str, ...], list[str]] = defaultdict(list)
+    for var_name, data_var in dataset.data_vars.items():
+        dims = tuple(str(d) for d in data_var.dims)
+        if len(dims) > 2:
+            logger.error(
+                (
+                    "Variable %s has more than 2 dimensions, which is not"
+                    " supported. Dims: %s"
+                ),
+                var_name,
+                dims,
+            )
+            continue
+        elif len(dims) == 2 and "time" not in dims:
+            logger.error(
+                (
+                    "2D variables are only supported when 'time' is one of"
+                    " their dimensions. Found variable %s with dimensions: %s."
+                ),
+                var_name,
+                dims,
+            )
+            continue
+        dimension_groups[dims].append(var_name)
+
+    ds = dataset[["time"]].copy()
+    for dims, var_list in dimension_groups.items():
+        # Simple case: variables dimensioned only by time are copied directly
+        if dims == ("time",):
+            ds.update(dataset[var_list])
+            continue
+
+        shape = dataset[var_list[0]].shape
+
+        # If scalar, expand to make time the first dimension
+        if not shape:
+            _tmp = dataset[var_list].expand_dims({"time": dataset["time"]})
+            ds.update(_tmp[var_list])
+            continue
+
+        _tmp = dataset[var_list]
+
+        # If 1D, expand to make time a dimension (2D)
+        if len(shape) == 1:
+            _tmp = _tmp.expand_dims({"time": dataset["time"]})
+
+        # For 2D, make time the first dimension and flatten the second
+        new_dims = ("time", [d for d in dims if d != "time"][0])
+        _tmp = _tmp.transpose(*new_dims)
+        _tmp = _flatten_dataset(_tmp, new_dims[1])
+        ds = ds.merge(_tmp)
+
+    df = ds.to_dataframe()
+
+    return Path(filepath).with_suffix(extension), df
+
+
+def _flatten_dataset(ds: xr.Dataset, second_dim: str) -> xr.Dataset:
+    """Transforms a 2D dataset into 1D by adding variables for each value of the
+    second dimension. The first dimension must be 'time'.
+
+    Args:
+        ds (xr.Dataset): The dataset to flatten. Must only contain two dimensions/coords
+            and only the variables to flatten.
+        second_dim (str): The name of the second dimension to flatten over.
+
+    Returns:
+        xr.Dataset: The flattened dataset. Preserves attributes.
+    """
+
+    output = ds[["time"]]
+
+    dim_values = ds[second_dim].values
+
+    dim_units = ds[second_dim].attrs.get("units")
+    if not dim_units or dim_units == "1":
+        dim_units = ""
+
+    dim_suffixes = [f"{dim_val}{dim_units}" for dim_val in dim_values]
+
+    for var_name, data in ds.data_vars.items():
+        for i, suffix in enumerate(dim_suffixes):
+            output[f"{var_name}_{suffix}"] = data[:, i]
+
+    output = output.drop_vars(second_dim)  # remove from coords
+    return output
diff --git a/test/conftest.py b/test/conftest.py
index c9fa265..305bfe1 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -19,7 +19,7 @@ def dataset() -> xr.Dataset:
         ),
         "height": (
             "height",
-            [0.0, 10.0, 20.0, 30.0],
+            [0, 10, 20, 30],
             {"units": "m", "long_name": "Height AGL"},
         ),
     },
diff --git a/test/test_csv.py b/test/test_csv.py
index 28223ed..08e04df 100644
--- a/test/test_csv.py
+++ b/test/test_csv.py
@@ -14,6 +14,7 @@ def test_vanilla_csv(dataset: xr.Dataset):
     output_path, metadata_path = to_csv(dataset, filepath)
 
     assert output_path == filepath
+    assert metadata_path is not None
     assert metadata_path == filepath.with_suffix(".json")
 
     df = pd.read_csv(output_path)
@@ -42,6 +43,7 @@ def test_csv_collection(dataset: xr.Dataset):
     assert filepath.with_suffix(".height.csv") in output_paths
     assert filepath.with_suffix(".time.csv") in output_paths
     assert filepath.with_suffix(".time.height.csv") in output_paths
+    assert metadata_path is not None
     assert metadata_path == filepath.with_suffix(".json")
 
     df = pd.read_csv(sorted(output_paths)[0])  # type: ignore
@@ -67,3 +69,29 @@ def test_csv_collection(dataset: xr.Dataset):
     for output_path in output_paths:
         os.remove(output_path)
     os.remove(metadata_path)
+
+
+def test_faceted_csv(dataset: xr.Dataset):
+    from ncconvert.csv import to_faceted_dim_csv
+
+    filepath = Path(".tmp/data/faceted.csv")
+
+    output_path, metadata_path = to_faceted_dim_csv(dataset, filepath)
+
+    assert output_path == filepath
+    assert metadata_path is not None
+    assert metadata_path == filepath.with_suffix(".json")
+
+    df = pd.read_csv(output_path)
+
+    # Columns: time, humidity, static, plus temperature and other at each height
+    assert len(df.index) == len(dataset.time)
+    assert len(df.columns) == 2 * len(dataset.height) + 3
+
+    meta = json.loads(metadata_path.read_text())
+
+    assert "datastream" in meta["attrs"]
+    assert "time" in meta["coords"]
+
+    os.remove(output_path)
+    os.remove(metadata_path)
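
Usage sketch (illustrative, not part of the patch): the call below matches the
to_faceted_dim_csv signature added in src/ncconvert/csv.py; the input filename
is hypothetical, and the variable and height names follow the dataset fixture
in test/conftest.py.

    import xarray as xr

    from ncconvert.csv import to_faceted_dim_csv

    ds = xr.open_dataset("input.nc")  # hypothetical NetCDF input
    csv_path, metadata_path = to_faceted_dim_csv(ds, ".tmp/data/faceted.csv")

    # With height values [0, 10, 20, 30] and units "m", each 2D variable is
    # flattened to one column per height value, e.g. temperature_0m,
    # temperature_10m, temperature_20m, temperature_30m (likewise for "other"),
    # while 1D variables such as "humidity" stay single columns indexed by
    # time. metadata_path points to ".tmp/data/faceted.json" because the
    # metadata argument defaults to True.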