Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cf-coding #7654

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
d19d57a
implement coders, adapt tests
kmuehlbauer Apr 1, 2023
2e750e4
preserve boolean-dtype within encoding, adapt test
kmuehlbauer Mar 31, 2023
0cd301b
determine cf packed data dtype, adapt tests
kmuehlbauer Mar 31, 2023
7cfc6ac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
bf4371d
fix typing, fix if-clauses, add link to issue
kmuehlbauer Apr 1, 2023
fe006df
add comment ass per review
kmuehlbauer Apr 1, 2023
66628fc
choose float64 if add_offset is not up to cf expectations
kmuehlbauer Apr 1, 2023
b6310e4
use scale_factor/add_offset in tests as specified by cf conventions
kmuehlbauer Apr 1, 2023
06ec16a
add test which checks add_offset being not conforming to cf standards
kmuehlbauer Apr 1, 2023
19ef234
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
e877aa7
convert to float32 to keep #1840 in sync
kmuehlbauer Apr 1, 2023
8cc7319
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
6a73653
add more comments, add more typing
kmuehlbauer Apr 2, 2023
d6887cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2023
c8c3f14
add additional test, make _choose_float_dtype more explicit
kmuehlbauer Apr 2, 2023
2f5709a
Apply suggestions from code review
kmuehlbauer Apr 2, 2023
f215d01
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2023
c5cd53d
Apply suggestions from code review
kmuehlbauer Apr 2, 2023
986ffc6
Test against None, as 0 (False) is a valid input for add_offset. Chec…
kmuehlbauer Apr 3, 2023
51f7515
apply astype in the right order to prevent loss of precision, more fi…
kmuehlbauer Apr 4, 2023
8c074f0
add mask/scale roundtrip test for variable cf encoding/decoding
kmuehlbauer Apr 4, 2023
031cac5
separate CFMaskCoder test for multiple missing_values/__FillValues co…
kmuehlbauer Apr 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
312 changes: 263 additions & 49 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,71 @@ def __repr__(self) -> str:
)


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
"""Decode arrays on the fly from non-native to native endianness

This is useful for decoding arrays from netCDF3 files (which are all
big endian) into native endianness, so they can be used with Cython
functions, such as those found in bottleneck and pandas.

>>> x = np.arange(5, dtype=">i2")

>>> x.dtype
dtype('>i2')

>>> NativeEndiannessArray(x).dtype
dtype('int16')

>>> indexer = indexing.BasicIndexer((slice(None),))
>>> NativeEndiannessArray(x)[indexer].dtype
dtype('int16')
"""

__slots__ = ("array",)

def __init__(self, array) -> None:
self.array = indexing.as_indexable(array)

@property
def dtype(self) -> np.dtype:
return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))

def __getitem__(self, key) -> np.ndarray:
return np.asarray(self.array[key], dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
"""Decode arrays on the fly from integer to boolean datatype

This is useful for decoding boolean arrays from integer typed netCDF
variables.

>>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

>>> x.dtype
dtype('int8')

>>> BoolTypeArray(x).dtype
dtype('bool')

>>> indexer = indexing.BasicIndexer((slice(None),))
>>> BoolTypeArray(x)[indexer].dtype
dtype('bool')
"""

__slots__ = ("array",)

def __init__(self, array) -> None:
self.array = indexing.as_indexable(array)

@property
def dtype(self) -> np.dtype:
return np.dtype("bool")

def __getitem__(self, key) -> np.ndarray:
return np.asarray(self.array[key], dtype=self.dtype)


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
Expand Down Expand Up @@ -155,34 +220,39 @@ class CFMaskCoder(VariableCoder):
def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

# get dtype from encoding if available, otherwise use data.dtype
dtype = np.dtype(encoding.get("dtype", data.dtype))
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")

if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

if fv is not None:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
data = duck_array_ops.fillna(data, fill_value)
if fv is not None or mv is not None:
if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

return Variable(dims, data, attrs, encoding, fastpath=True)
if fv is not None:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
# Use _FillValue if available to align missing_value to prevent issues
# when decoding
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
data = duck_array_ops.fillna(data, fill_value)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_decoding(variable)
Expand All @@ -207,7 +277,13 @@ def decode(self, variable: Variable, name: T_Name = None):
stacklevel=3,
)

dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

if encoded_fill_values:
transform = partial(
Expand All @@ -232,20 +308,50 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
return data


def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
def _choose_float_dtype(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mankoff do you have time to take a look here please?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @dcherian . I'm not sure what to look for. I will link to my open-but-stale PR that tried to start addressing this/similar issues: #6812

Perhaps also relevant are my last few comments on #2304 (see #2304 (comment) ). The problem for me is that (1) the CF standards are ambiguously defined and (2) xarray needs to address the many use-cases where the CF standards are not followed (usually this means different data types).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Does this PR fix your original issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @mankoff, I'll have a look at your extensive notes over there and try to come up with aomething.

dtype: np.dtype, mapping: MutableMapping
) -> type[np.floating[Any]]:
# check scale/offset first to derive wanted float dtype
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
scale_factor = mapping.get("scale_factor")
add_offset = mapping.get("add_offset")
if scale_factor is not None or add_offset is not None:
# get the type from scale_factor/add_offset to determine
# the needed floating point type
if scale_factor is not None:
scale_type = type(scale_factor)
if add_offset is not None:
offset_type = type(add_offset)
# CF conforming, both scale_factor and add-offset are given and
# of same floating point type (float32/64)
if (
add_offset is not None
and scale_factor is not None
and offset_type == scale_type
and scale_type in [np.float32, np.float64]
):
# in case of int32 -> we need upcast to float64
# due to precision issues
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
return np.float64
return np.dtype(scale_type).type
# Not CF conforming and add_offset given:
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if add_offset is not None:
return np.float64
# return float32 in other cases where only scale_factor is given
return np.float32
# If no scale_factor or add_offset is given, use some general rules.
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64
Expand All @@ -260,29 +366,40 @@ class CFScaleOffsetCoder(VariableCoder):

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
add_offset = pop_to(encoding, attrs, "add_offset", name=name)
if scale_factor or add_offset:
# if we have a _FillValue/masked_value we do not want to cast now
# but leave that to CFMaskCoder
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(data.dtype, attrs)
# but still we need a copy prevent changing original data
data = data.astype(dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
if "scale_factor" in encoding:
data /= pop_to(encoding, attrs, "scale_factor", name=name)

return Variable(dims, data, attrs, encoding, fastpath=True)
if add_offset:
data -= add_offset
if scale_factor:
data /= scale_factor
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
_attrs = variable.attrs
if "scale_factor" in _attrs or "add_offset" in _attrs:
dims, data, attrs, encoding = unpack_for_decoding(variable)

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
dims, data, attrs, encoding = unpack_for_decoding(variable)
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
if scale_factor or add_offset:
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
# if we have a _FillValue/masked_value we already have the wanted
# floating point dtype here (via CFMaskCoder), so no check is necessary
# only check in other cases
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(dtype, encoding)

transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
Expand Down Expand Up @@ -349,3 +466,100 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)
# make NaN the fill value for float types
if (
"_FillValue" not in attrs
and "_FillValue" not in encoding
and np.issubdtype(variable.dtype, np.floating)
):
attrs["_FillValue"] = variable.dtype.type(np.nan)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
raise NotImplementedError()


class BooleanCoder(VariableCoder):
"""Code boolean values."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if (
(variable.dtype == bool)
and ("dtype" not in variable.encoding)
and ("dtype" not in variable.attrs)
):
dims, data, attrs, encoding = unpack_for_encoding(variable)
attrs["dtype"] = "bool"
data = duck_array_ops.astype(data, dtype="i1", copy=True)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.attrs.get("dtype", False) == "bool":
dims, data, attrs, encoding = unpack_for_decoding(variable)
# overwrite (!) encoding accordingly and remove from attrs
encoding["dtype"] = attrs.pop("dtype")
data = BoolTypeArray(data)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class EndianCoder(VariableCoder):
"""Decode Endianness to native."""

def encode(self):
raise NotImplementedError()

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_decoding(variable)
if not data.dtype.isnative:
data = NativeEndiannessArray(data)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class NonStringCoder(VariableCoder):
"""Encode NonString variables if dtypes differ."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
"S1",
str,
):
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
dims, data, attrs, encoding = unpack_for_encoding(variable)
dtype = np.dtype(encoding.pop("dtype"))
if dtype != variable.dtype:
if np.issubdtype(dtype, np.integer):
if (
np.issubdtype(variable.dtype, np.floating)
and "_FillValue" not in variable.attrs
and "missing_value" not in variable.attrs
):
warnings.warn(
f"saving variable {name} with floating "
"point data as an integer dtype without "
"any _FillValue to use for NaNs",
SerializationWarning,
stacklevel=10,
)
data = np.around(data)
data = data.astype(dtype=dtype)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self):
raise NotImplementedError()
Loading