Add faceted dim csv
maxwelllevin committed Sep 26, 2023
1 parent 4d5b811 commit 862b767
Showing 5 changed files with 152 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/ncconvert/cli.py
@@ -21,7 +21,7 @@
)
sys.exit(1)

from .csv import to_csv, to_csv_collection
from .csv import to_csv, to_csv_collection, to_faceted_dim_csv
from .parquet import to_parquet, to_parquet_collection


@@ -39,6 +39,7 @@ def __call__(
# Register to_* methods as options. For now this is a manual process
AVAILABLE_METHODS: Dict[str, Converter] = {
to_csv.__name__: to_csv,
to_faceted_dim_csv.__name__: to_faceted_dim_csv,
to_csv_collection.__name__: to_csv_collection,
to_parquet.__name__: to_parquet,
to_parquet_collection.__name__: to_parquet_collection,
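For context, a hypothetical lookup against the registry above (the CLI's actual argument handling is outside this diff):

from ncconvert.cli import AVAILABLE_METHODS

convert = AVAILABLE_METHODS["to_faceted_dim_csv"]
# convert resolves to ncconvert.csv.to_faceted_dim_csv and is callable like
# any registered Converter: convert(dataset, filepath)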
25 changes: 24 additions & 1 deletion src/ncconvert/csv.py
@@ -5,7 +5,12 @@

import xarray as xr

from .utils import _dump_metadata, _to_dataframe, _to_dataframe_collection
from .utils import (
_dump_metadata,
_to_dataframe,
_to_dataframe_collection,
_to_faceted_dim_dataframe,
)


def to_csv(
@@ -89,3 +94,21 @@ def to_csv_collection(
metadata_path = _dump_metadata(dataset, filepath) if metadata else None

return tuple(filepaths), metadata_path


def to_faceted_dim_csv(
dataset: xr.Dataset,
filepath: str | Path,
metadata: bool = True,
**kwargs: Any,
) -> tuple[Path, Path | None]:
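"""Write a single "faceted" CSV file, where each 2D variable is split into one
column per value of its non-time dimension (e.g. temperature_0m, temperature_10m).

Scalar and 1D variables are kept along the time index. Variables with more than
two dimensions, or 2D variables without a 'time' dimension, are skipped with an
error logged. Returns the CSV path and the metadata JSON path (None if metadata
is False).
"""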
to_csv_kwargs = kwargs.get("to_csv_kwargs", {})

Path(filepath).parent.mkdir(parents=True, exist_ok=True)

filepath, df = _to_faceted_dim_dataframe(dataset, filepath, ".csv")
df.to_csv(filepath, **to_csv_kwargs) # type: ignore

metadata_path = _dump_metadata(dataset, filepath) if metadata else None

return Path(filepath), metadata_path
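For reference, a minimal usage sketch of the new converter (illustrative only; the dataset, values, and output path below are made up and not part of this commit):

import numpy as np
import pandas as pd
import xarray as xr

from ncconvert.csv import to_faceted_dim_csv

ds = xr.Dataset(
    coords={
        "time": pd.date_range("2023-09-26", periods=3, freq="h"),
        "height": ("height", [0, 10], {"units": "m"}),
    },
    data_vars={
        "temperature": (("time", "height"), np.random.rand(3, 2)),
        "humidity": ("time", np.random.rand(3)),
    },
)

csv_path, metadata_path = to_faceted_dim_csv(ds, "output/example.csv")
# csv_path -> output/example.csv, with humidity plus one column per height
# value: temperature_0m and temperature_10m
# metadata_path -> output/example.json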
97 changes: 97 additions & 0 deletions src/ncconvert/utils.py
@@ -1,12 +1,15 @@
from __future__ import annotations

import json
import logging
from collections import defaultdict
from pathlib import Path

import pandas as pd
import xarray as xr

logger = logging.getLogger(__name__)


def _dump_metadata(dataset: xr.Dataset, filepath: str | Path) -> Path:
metadata = dataset.to_dict(data=False, encoding=True)
@@ -56,3 +59,97 @@ def _to_dataframe_collection(
outputs.append((dim_group_path, df))

return tuple(outputs)


def _to_faceted_dim_dataframe(
dataset: xr.Dataset, filepath: str | Path, extension: str
) -> tuple[Path, pd.DataFrame]:
if not extension.startswith("."):
extension = "." + extension

# Get variable dimension groupings
dimension_groups: dict[tuple[str, ...], list[str]] = defaultdict(list)
for var_name, data_var in dataset.data_vars.items():
dims = tuple(str(d) for d in data_var.dims)
if len(dims) > 2:
logger.error(
(
"Variable %s has more than 2 dimensions and will not be supported."
" Dims: %s"
),
var_name,
dims,
)
continue
elif len(dims) == 2 and "time" not in dims:
logger.error(
(
"2D variables are only supported when 'time' is one of its"
" dimensions. Found variable %s with dimensions: %s."
),
var_name,
dims,
)
continue

dimension_groups[dims].append(var_name)

ds = dataset[["time"]].copy()
for dims, var_list in dimension_groups.items():
# Simple case: 1D variables along time can be copied over directly
if dims == ("time",):
ds.update(dataset[var_list])
continue

shape = dataset[var_list[0]].shape

# If scalar, expand to make time the first dimension
if not shape:
_tmp = dataset[var_list].expand_dims({"time": dataset["time"]})
ds.update(_tmp[var_list])
continue

_tmp = dataset[var_list]

# If 1D, expand to make time a dimension (2D)
if len(shape) == 1:
_tmp = _tmp.expand_dims({"time": dataset["time"]})

# For 2D, make time the first dimension and flatten the second
new_dims = ("time", [d for d in dims if d != "time"][0])
_tmp = _tmp.transpose(*new_dims)
_tmp = _flatten_dataset(_tmp, new_dims[1])
ds = ds.merge(_tmp)

df = ds.to_dataframe()

return Path(filepath).with_suffix(extension), df


def _flatten_dataset(ds: xr.Dataset, second_dim: str) -> xr.Dataset:
"""Transforms a 2D dataset into 1D by adding variables for each value of the second
dimension. The first dimension must be 'time'.
Args:
ds (xr.Dataset): The dataset to flatten. Must only contain two dimensions/coords
and only the variables to flatten.
Returns:
xr.Dataset: The flattened dataset. Preserves attributes.
"""

output = ds[["time"]]

dim_values = ds[second_dim].values

dim_units = ds[second_dim].attrs.get("units")
if not dim_units or dim_units == "1":
dim_units = ""

dim_suffixes = [f"{dim_val}{dim_units}" for dim_val in dim_values]

for var_name, data in ds.data_vars.items():
for i, suffix in enumerate(dim_suffixes):
output[f"{var_name}_{suffix}"] = data[:, i]

output = output.drop_vars(second_dim) # remove from coords
return output
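To see what the flattening helper does in isolation, a small sketch with made-up values (_flatten_dataset is a private helper, so this is for illustration rather than public use):

import numpy as np
import xarray as xr

from ncconvert.utils import _flatten_dataset

ds = xr.Dataset(
    coords={
        "time": ("time", [0, 1, 2]),
        "height": ("height", [0, 10], {"units": "m"}),
    },
    data_vars={"temperature": (("time", "height"), np.arange(6.0).reshape(3, 2))},
)

flat = _flatten_dataset(ds, "height")
# flat contains 1D variables temperature_0m and temperature_10m along time;
# the height coordinate is dropped from the result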
2 changes: 1 addition & 1 deletion test/conftest.py
@@ -19,7 +19,7 @@ def dataset() -> xr.Dataset:
),
"height": (
"height",
[0.0, 10.0, 20.0, 30.0],
[0, 10, 20, 30],
{"units": "m", "long_name": "Height AGL"},
),
},
28 changes: 28 additions & 0 deletions test/test_csv.py
@@ -14,6 +14,7 @@ def test_vanilla_csv(dataset: xr.Dataset):
output_path, metadata_path = to_csv(dataset, filepath)

assert output_path == filepath
assert metadata_path is not None
assert metadata_path == filepath.with_suffix(".json")

df = pd.read_csv(output_path)
@@ -42,6 +43,7 @@ def test_csv_collection(dataset: xr.Dataset):
assert filepath.with_suffix(".height.csv") in output_paths
assert filepath.with_suffix(".time.csv") in output_paths
assert filepath.with_suffix(".time.height.csv") in output_paths
assert metadata_path is not None
assert metadata_path == filepath.with_suffix(".json")

df = pd.read_csv(sorted(output_paths)[0]) # type: ignore
@@ -67,3 +69,29 @@ def test_csv_collection(dataset: xr.Dataset):
for output_path in output_paths:
os.remove(output_path)
os.remove(metadata_path)

def test_faceted_csv(dataset: xr.Dataset):
from ncconvert.csv import to_faceted_dim_csv

filepath = Path(".tmp/data/faceted.csv")

output_path, metadata_path = to_faceted_dim_csv(dataset, filepath)

assert output_path == filepath
assert metadata_path is not None
assert metadata_path == filepath.with_suffix(".json")

df = pd.read_csv(output_path)

# columns: time, humidity, static, plus [temperature, other] at each height
assert len(df.index) == len(dataset.time)
assert len(df.columns) == 2 * len(dataset.height) + 3

meta = json.loads(metadata_path.read_text())

assert "datastream" in meta["attrs"]
assert "time" in meta["coords"]

os.remove(output_path)
os.remove(metadata_path)
