From d19d57a4663adb6d3e5b28784f10fe7313b75747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 10:18:08 +0200 Subject: [PATCH 01/22] implement coders, adapt tests --- xarray/coding/variables.py | 162 +++++++++++++++++++++++++++++++ xarray/conventions.py | 137 ++------------------------ xarray/tests/test_conventions.py | 6 +- 3 files changed, 175 insertions(+), 130 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4107b3aa883..1cd0803730f 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -78,6 +78,72 @@ def __repr__(self) -> str: ) +class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from non-native to native endianness + + This is useful for decoding arrays from netCDF3 files (which are all + big endian) into native endianness, so they can be used with Cython + functions, such as those found in bottleneck and pandas. + + >>> x = np.arange(5, dtype=">i2") + + >>> x.dtype + dtype('>i2') + + >>> NativeEndiannessArray(x).dtype + dtype('int16') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> NativeEndiannessArray(x)[indexer].dtype + dtype('int16') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + +class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from integer to boolean datatype + + This is useful for decoding boolean arrays from integer typed netCDF + variables. + + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") + + >>> x.dtype + dtype('int8') + + >>> BoolTypeArray(x).dtype + dtype('bool') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> BoolTypeArray(x)[indexer].dtype + dtype('bool') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype("bool") + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + + def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. 
Parameters @@ -349,3 +415,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + + +class DefaultFillvalueCoder(VariableCoder): + """Encode default _FillValue if needed.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_encoding(variable) + # make NaN the fill value for float types + if ( + "_FillValue" not in attrs + and "_FillValue" not in encoding + and np.issubdtype(variable.dtype, np.floating) + ): + attrs["_FillValue"] = variable.dtype.type(np.nan) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + raise NotImplementedError() + + +class BooleanCoder(VariableCoder): + """Code boolean values.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if ( + (variable.dtype == bool) + and ("dtype" not in variable.encoding) + and ("dtype" not in variable.attrs) + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + attrs["dtype"] = "bool" + data = duck_array_ops.astype(data, dtype="i1", copy=True) + + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.attrs.get("dtype", False) == "bool": + dims, data, attrs, encoding = unpack_for_decoding(variable) + del attrs["dtype"] + data = BoolTypeArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class EndianCoder(VariableCoder): + """Decode Endianness to native.""" + + def encode(self): + raise NotImplementedError() + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_decoding(variable) + if not data.dtype.isnative: + data = NativeEndiannessArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class NonStringCoder(VariableCoder): + """Encode NonString variables if dtypes differ.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if "dtype" in variable.encoding and variable.encoding["dtype"] not in ( + "S1", + str, + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + dtype = np.dtype(encoding.pop("dtype")) + if dtype != variable.dtype: + if np.issubdtype(dtype, np.integer): + if ( + np.issubdtype(variable.dtype, np.floating) + and "_FillValue" not in variable.attrs + and "missing_value" not in variable.attrs + ): + warnings.warn( + f"saving variable {name} with floating " + "point data as an integer dtype without " + "any _FillValue to use for NaNs", + SerializationWarning, + stacklevel=10, + ) + data = np.around(data) + data = data.astype(dtype=dtype) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self): + raise NotImplementedError() + diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..f7cdc0bc7d3 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -48,123 +48,10 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from non-native to native endianness - - This is useful for decoding arrays from netCDF3 files (which are all - big endian) into native endianness, so they can be used with Cython - 
functions, such as those found in bottleneck and pandas. - - >>> x = np.arange(5, dtype=">i2") - - >>> x.dtype - dtype('>i2') - - >>> NativeEndiannessArray(x).dtype - dtype('int16') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> NativeEndiannessArray(x)[indexer].dtype - dtype('int16') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - -class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from integer to boolean datatype - - This is useful for decoding boolean arrays from integer typed netCDF - variables. - - >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") - - >>> x.dtype - dtype('int8') - - >>> BoolTypeArray(x).dtype - dtype('bool') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> BoolTypeArray(x)[indexer].dtype - dtype('bool') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype("bool") - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def _var_as_tuple(var: Variable) -> T_VarTuple: return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable: - if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str): - dims, data, attrs, encoding = _var_as_tuple(var) - dtype = np.dtype(encoding.pop("dtype")) - if dtype != var.dtype: - if np.issubdtype(dtype, np.integer): - if ( - np.issubdtype(var.dtype, np.floating) - and "_FillValue" not in var.attrs - and "missing_value" not in var.attrs - ): - warnings.warn( - f"saving variable {name} with floating " - "point data as an integer dtype without " - "any _FillValue to use for NaNs", - SerializationWarning, - stacklevel=10, - ) - data = np.around(data) - data = data.astype(dtype=dtype) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - -def maybe_default_fill_value(var: Variable) -> Variable: - # make NaN the fill value for float types: - if ( - "_FillValue" not in var.attrs - and "_FillValue" not in var.encoding - and np.issubdtype(var.dtype, np.floating) - ): - var.attrs["_FillValue"] = var.dtype.type(np.nan) - return var - - -def maybe_encode_bools(var: Variable) -> Variable: - if ( - (var.dtype == bool) - and ("dtype" not in var.encoding) - and ("dtype" not in var.attrs) - ): - dims, data, attrs, encoding = _var_as_tuple(var) - attrs["dtype"] = "bool" - data = duck_array_ops.astype(data, dtype="i1", copy=True) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def _infer_dtype(array, name: T_Name = None) -> np.dtype: """Given an object array with no missing values, infer its dtype from its first element @@ -292,13 +179,13 @@ def encode_cf_variable( variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), variables.UnsignedIntegerCoder(), + variables.NonStringCoder(), + variables.DefaultFillvalueCoder(), + variables.BooleanCoder(), ]: var = coder.encode(var, name=name) - # TODO(shoyer): convert all of these to use coders, too: - var = maybe_encode_nonstring_dtype(var, name=name) - var = maybe_default_fill_value(var) - var = maybe_encode_bools(var) + # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: var = 
ensure_dtype_not_object(var, name=name) for attr_name in CF_RELATED_DATA: @@ -389,19 +276,15 @@ def decode_cf_variable( if decode_times: var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) - dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - # TODO(shoyer): convert everything below to use coders + if decode_endianness and not var.dtype.isnative: + var = variables.EndianCoder().decode(var) + original_dtype = var.dtype - if decode_endianness and not data.dtype.isnative: - # do this last, so it's only done if we didn't already unmask/scale - data = NativeEndiannessArray(data) - original_dtype = data.dtype + var = variables.BooleanCoder().decode(var) - encoding.setdefault("dtype", original_dtype) + dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - if "dtype" in attributes and attributes["dtype"] == "bool": - del attributes["dtype"] - data = BoolTypeArray(data) + encoding.setdefault("dtype", original_dtype) if not is_duck_dask_array(data): data = indexing.LazilyIndexedArray(data) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..6d219f09e0e 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -32,7 +32,7 @@ class TestBoolTypeArray: def test_booltype_array(self) -> None: x = np.array([1, 0, 1, 1, 0], dtype="i1") - bx = conventions.BoolTypeArray(x) + bx = coding.variables.BoolTypeArray(x) assert bx.dtype == bool assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool)) @@ -41,7 +41,7 @@ class TestNativeEndiannessArray: def test(self) -> None: x = np.arange(5, dtype=">i8") expected = np.arange(5, dtype="int64") - a = conventions.NativeEndiannessArray(x) + a = coding.variables.NativeEndiannessArray(x) assert a.dtype == expected.dtype assert a.dtype == expected[:].dtype assert_array_equal(a, expected) @@ -247,7 +247,7 @@ def test_decode_coordinates(self) -> None: def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) - actual = conventions.maybe_encode_nonstring_dtype(original) + actual = coding.variables.NonStringCoder().encode(original) assert_identical(expected, actual) def test_decode_cf_with_multiple_missing_values(self) -> None: From 2e750e4b2a942d07ead45cdfdf461d2d0ca957ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 31 Mar 2023 09:00:51 +0200 Subject: [PATCH 02/22] preserve boolean-dtype within encoding, adapt test --- xarray/coding/variables.py | 3 ++- xarray/tests/test_backends.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 1cd0803730f..80d1740e88f 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -456,7 +456,8 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self, variable: Variable, name: T_Name = None) -> Variable: if variable.attrs.get("dtype", False) == "bool": dims, data, attrs, encoding = unpack_for_decoding(variable) - del attrs["dtype"] + # overwrite encoding accordingly and remove from attrs + encoding["dtype"] = attrs.pop("dtype") data = BoolTypeArray(data) return Variable(dims, data, attrs, encoding, fastpath=True) else: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 14b510d4c97..ccd974f37c1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -630,6 +630,9 @@ def 
test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + with self.roundtrip(actual) as actual2: + assert_identical(original, actual2) + assert actual2["x"].dtype == "bool" def test_orthogonal_indexing(self) -> None: in_memory = create_test_data() From 0cd301b9879dec1fa4a269616c53026e1a9db5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 31 Mar 2023 10:34:30 +0200 Subject: [PATCH 03/22] determine cf packed data dtype, adapt tests --- xarray/coding/variables.py | 30 +++++++++++----- xarray/tests/test_backends.py | 67 ++++++++++++++++++++--------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 80d1740e88f..834bb107987 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -298,9 +298,24 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp return data -def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]: - """Return a float dtype that can losslessly represent `dtype` values.""" - # Keep float32 as-is. Upcast half-precision to single-precision, +def _choose_float_dtype( + dtype: np.dtype, encoding: MutableMapping +) -> type[np.floating[Any]]: + # check scale/offset first to derive dtype with + if "scale_factor" in encoding or "add_offset" in encoding: + scale_factor = encoding.get("scale_factor", False) + add_offset = encoding.get("add_offset", False) + # minimal floating point size -> 4 byte + maxsize = 4 + if scale_factor and np.issubdtype(type(scale_factor), np.floating): + maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) + if add_offset and np.issubdtype(type(add_offset), np.floating): + maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize) + if maxsize == 4: + return np.float32 + else: + return np.float64 + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): return np.float32 @@ -308,10 +323,7 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. - # Sensitivity analysis can be tricky, so we just use a float64 - # if there's any offset at all - better unoptimised than wrong! - if not has_offset: - return np.float32 + return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. 
complex numbers are not supported in NetCDF) return np.float64 @@ -328,7 +340,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dtype = _choose_float_dtype(data.dtype, encoding) data = data.astype(dtype=dtype, copy=True) if "add_offset" in encoding: data -= pop_to(encoding, attrs, "add_offset", name=name) @@ -344,7 +356,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dtype = _choose_float_dtype(data.dtype, encoding) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ccd974f37c1..4c8e16c33cd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data() -> Dataset: - x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) +def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), "dtype": "i2", } return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data() -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)} +def create_encoded_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: + attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data() -> Dataset: +def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
attributes = { "_FillValue": -1, "_Unsigned": "true", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data() -> Dataset: +def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": True, "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data() -> Dataset: +def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32) + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data() -> Dataset: +def create_encoded_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -127, "_Unsigned": "false", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([-110, 1, 127, -127], dtype="i1") @@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: with self.roundtrip(original) as actual: assert_identical(expected, actual) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( "decoded_fn, encoded_fn", [ @@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: - decoded = decoded_fn() - encoded = encoded_fn() + def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + if dtype == np.float32 and isinstance( + self, (TestZarrDirectoryStore, TestZarrDictStore) + ): + pytest.skip( + "zarr attributes (eg. 
`scale_factor` are unconditionally promoted to `float64`" + ) + decoded = decoded_fn(dtype) + encoded = encoded_fn(dtype) with self.roundtrip(decoded) as actual: for k in decoded.variables: + print(k, decoded.variables[k].dtype) assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) @@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: # make sure roundtrip encoding didn't change the # original dataset. - assert_allclose(encoded, encoded_fn(), decode_bytes=False) + assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False) with self.roundtrip(encoded) as actual: for k in decoded.variables: @@ -1533,7 +1541,8 @@ def test_encoding_chunksizes_unlimited(self) -> None: with self.roundtrip(ds) as actual: assert_equal(ds, actual) - def test_mask_and_scale(self) -> None: + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_mask_and_scale(self, dtype) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) @@ -1541,21 +1550,23 @@ def test_mask_and_scale(self) -> None: v = nc.variables["x"] v.set_auto_maskandscale(False) v.add_offset = 10 - v.scale_factor = 0.1 + v.scale_factor = dtype(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) # first make sure netCDF4 reads the masked and scaled data # correctly with nc4.Dataset(tmp_file, mode="r") as nc: expected = np.ma.array( - [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False] + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, ) actual = nc.variables["x"][:] assert_array_equal(expected, actual) # now check xarray with open_dataset(tmp_file) as ds: - expected = create_masked_and_scaled_data() + expected = create_masked_and_scaled_data(dtype) assert_identical(expected, ds) def test_0dimensional_variable(self) -> None: From 7cfc6aca3a0b761970765671f49e16cb05cffc25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 08:36:05 +0000 Subject: [PATCH 04/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/coding/variables.py | 3 +-- xarray/conventions.py | 2 +- xarray/tests/test_backends.py | 28 +++++++++++++++++++++------- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 834bb107987..6b773b9343a 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -143,7 +143,6 @@ def __getitem__(self, key): return np.asarray(self.array[key], dtype=self.dtype) - def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. 
Parameters @@ -444,6 +443,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + def decode(self, variable: Variable, name: T_Name = None) -> Variable: raise NotImplementedError() @@ -523,4 +523,3 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self): raise NotImplementedError() - diff --git a/xarray/conventions.py b/xarray/conventions.py index f7cdc0bc7d3..1506efc31e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -10,7 +10,7 @@ from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to -from xarray.core import duck_array_ops, indexing +from xarray.core import indexing from xarray.core.common import ( _contains_datetime_like_objects, contains_cftime_datetimes, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4c8e16c33cd..345e6cc9d5e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -149,14 +149,18 @@ def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Da return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_masked_and_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", @@ -168,7 +172,9 @@ def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { @@ -182,7 +188,9 @@ def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.f return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_bad_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, @@ -194,7 +202,9 @@ def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
attributes = { @@ -208,7 +218,9 @@ def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_signed_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", @@ -220,7 +232,9 @@ def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_signed_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { From bf4371d51f0758faac491aabbbf264eee5eb9901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:23:38 +0200 Subject: [PATCH 05/22] fix typing, fix if-clauses, add link to issue --- xarray/coding/variables.py | 38 +++++++++++++++++------------------ xarray/tests/test_backends.py | 17 ++++++++-------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 6b773b9343a..8c77e94afe4 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -298,12 +298,13 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp def _choose_float_dtype( - dtype: np.dtype, encoding: MutableMapping + dtype: np.dtype, mapping: MutableMapping ) -> type[np.floating[Any]]: - # check scale/offset first to derive dtype with - if "scale_factor" in encoding or "add_offset" in encoding: - scale_factor = encoding.get("scale_factor", False) - add_offset = encoding.get("add_offset", False) + # check scale/offset first to derive dtype + # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 + scale_factor = mapping.get("scale_factor", False) + add_offset = mapping.get("add_offset", False) + if scale_factor or add_offset: # minimal floating point size -> 4 byte maxsize = 4 if scale_factor and np.issubdtype(type(scale_factor), np.floating): @@ -337,29 +338,28 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) - - if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, encoding) + scale_factor = pop_to(encoding, attrs, "scale_factor", name=name) + add_offset = pop_to(encoding, attrs, "add_offset", name=name) + if scale_factor or add_offset: + dtype = _choose_float_dtype(data.dtype, attrs) data = data.astype(dtype=dtype, copy=True) - if "add_offset" in encoding: - data -= pop_to(encoding, attrs, "add_offset", name=name) - if "scale_factor" in encoding: - data /= pop_to(encoding, attrs, "scale_factor", name=name) + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None) -> Variable: - _attrs = variable.attrs - if "scale_factor" in _attrs or "add_offset" in _attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - - scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) - add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, 
encoding) + dims, data, attrs, encoding = unpack_for_decoding(variable) + scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) + add_offset = pop_to(attrs, encoding, "add_offset", name=name) + if scale_factor or add_offset: if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: add_offset = np.asarray(add_offset).item() + dtype = _choose_float_dtype(data.dtype, encoding) transform = partial( _scale_offset_decoding, scale_factor=scale_factor, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 345e6cc9d5e..7e9e0ffe8d9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -138,7 +138,7 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset: x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, @@ -150,7 +150,7 @@ def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Da def create_encoded_masked_and_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( @@ -159,7 +159,7 @@ def create_encoded_masked_and_scaled_data( def create_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": 255, @@ -173,7 +173,7 @@ def create_unsigned_masked_scaled_data( def create_encoded_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. @@ -189,7 +189,7 @@ def create_encoded_unsigned_masked_scaled_data( def create_bad_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": 255, @@ -203,7 +203,7 @@ def create_bad_unsigned_masked_scaled_data( def create_bad_encoded_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. @@ -219,7 +219,7 @@ def create_bad_encoded_unsigned_masked_scaled_data( def create_signed_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": -127, @@ -233,7 +233,7 @@ def create_signed_masked_scaled_data( def create_encoded_signed_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
@@ -903,7 +903,6 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: with self.roundtrip(decoded) as actual: for k in decoded.variables: - print(k, decoded.variables[k].dtype) assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) From fe006df525aec7817a279e648c85b93266d9047e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:31:17 +0200 Subject: [PATCH 06/22] add comment ass per review --- xarray/tests/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7e9e0ffe8d9..08945337851 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -644,6 +644,8 @@ def test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + # this checks for preserving dtype during second roundtrip + # see https://github.com/pydata/xarray/issues/7652#issuecomment-1476956975 with self.roundtrip(actual) as actual2: assert_identical(original, actual2) assert actual2["x"].dtype == "bool" From 66628fcdc7056a33eb6c740fdc57372a27d073df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:57:35 +0200 Subject: [PATCH 07/22] choose float64 if add_offset is not up to cf expectations --- xarray/coding/variables.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8c77e94afe4..078ba09358b 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -305,13 +305,22 @@ def _choose_float_dtype( scale_factor = mapping.get("scale_factor", False) add_offset = mapping.get("add_offset", False) if scale_factor or add_offset: - # minimal floating point size -> 4 byte + # get the maximum itemsize from scale_factor/add_offset to determine + # the needed floating point type + # start with minimal floating point size -> 4 byte maxsize = 4 if scale_factor and np.issubdtype(type(scale_factor), np.floating): maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) - if add_offset and np.issubdtype(type(add_offset), np.floating): - maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize) - if maxsize == 4: + add_offset_type = type(add_offset) + if add_offset and np.issubdtype(add_offset_type, np.floating): + maxsize = max(maxsize, np.dtype(add_offset_type).itemsize) + # if add_offset is malformed (eg. no float32 or no float64 as + # cf conventions expects): + # A scale factor is entirely safe (vanishing into the mantissa), + # but a large integer offset could lead to loss of precision. + # Sensitivity analysis can be tricky, so we just use a float64 + # if there's any offset at all - better unoptimised than wrong! + if maxsize == 4 and np.issubdtype(add_offset_type, np.floating): return np.float32 else: return np.float64 @@ -321,8 +330,6 @@ def _choose_float_dtype( return np.float32 # float32 can exactly represent all integers up to 24 bits if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): - # A scale factor is entirely safe (vanishing into the mantissa), - # but a large integer offset could lead to loss of precision. return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. 
complex numbers are not supported in NetCDF) From b6310e48e34d73aa0aa3b133283c4ab76a2bc7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 20:07:53 +0200 Subject: [PATCH 08/22] use scale_factor/add_offset in tests as specified by cf conventions --- xarray/tests/test_backends.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 08945337851..cd3e833e48b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -142,7 +142,7 @@ def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Datase x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), "dtype": "i2", } @@ -152,7 +152,7 @@ def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Datase def create_encoded_masked_and_scaled_data( dtype: type[np.number] = np.float32, ) -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} + attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) @@ -165,7 +165,7 @@ def create_unsigned_masked_scaled_data( "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) @@ -180,7 +180,7 @@ def create_encoded_unsigned_masked_scaled_data( attributes = { "_FillValue": -1, "_Unsigned": "true", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned @@ -195,7 +195,7 @@ def create_bad_unsigned_masked_scaled_data( "_FillValue": 255, "_Unsigned": True, "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(0), "scale_factor": dtype(0.1), } x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) @@ -210,7 +210,7 @@ def create_bad_encoded_unsigned_masked_scaled_data( attributes = { "_FillValue": -1, "_Unsigned": True, - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned @@ -225,7 +225,7 @@ def create_signed_masked_scaled_data( "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) @@ -240,7 +240,7 @@ def create_encoded_signed_masked_scaled_data( attributes = { "_FillValue": -127, "_Unsigned": "false", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned @@ -1564,7 +1564,7 @@ def test_mask_and_scale(self, dtype) -> None: nc.createVariable("x", "int16", ("t",), fill_value=-1) v = nc.variables["x"] v.set_auto_maskandscale(False) - v.add_offset = 10 + v.add_offset = dtype(10) v.scale_factor = dtype(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) From 06ec16a03dfdc80436b049c5ec239bcdefd4d680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 20:37:54 +0200 Subject: [PATCH 09/22] add test which checks add_offset being not conforming to cf standards --- xarray/tests/test_backends.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/xarray/tests/test_backends.py 
b/xarray/tests/test_backends.py index cd3e833e48b..939b96d3bce 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1584,6 +1584,46 @@ def test_mask_and_scale(self, dtype) -> None: expected = create_masked_and_scaled_data(dtype) assert_identical(expected, ds) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("offset_cf_conforming", [True, False]) + def test_mask_and_scale_non_cf_conforming(self, dtype, offset_cf_conforming) -> None: + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + nc.createDimension("t", 5) + nc.createVariable("x", "int16", ("t",), fill_value=-1) + v = nc.variables["x"] + v.set_auto_maskandscale(False) + if offset_cf_conforming: + v.add_offset = dtype(10) + else: + v.add_offset = 10 + v.scale_factor = dtype(0.1) + v[:] = np.array([-1, -1, 0, 1, 2]) + + # first make sure netCDF4 reads the masked and scaled data + # correctly + with nc4.Dataset(tmp_file, mode="r") as nc: + expected = np.ma.array( + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, + ) + actual = nc.variables["x"][:] + assert_array_equal(expected, actual) + + # now check xarray + with open_dataset(tmp_file) as ds: + if not offset_cf_conforming: + # if not conforming, this get's promoted to float64 in any case + # if add_offset is available + expected = create_masked_and_scaled_data(np.float64) + # here we have slight differences possibly + # due to using float32 first and casting to float64 later + assert_allclose(expected, ds) + else: + expected = create_masked_and_scaled_data(dtype) + assert_identical(expected, ds) + def test_0dimensional_variable(self) -> None: # This fix verifies our work-around to this netCDF4-python bug: # https://github.com/Unidata/netcdf4-python/pull/220 From 19ef2340fd278b3d1aa2793d7c4d659bd9baf2bd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 18:38:42 +0000 Subject: [PATCH 10/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 939b96d3bce..9dd4a984fd7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1586,7 +1586,9 @@ def test_mask_and_scale(self, dtype) -> None: @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("offset_cf_conforming", [True, False]) - def test_mask_and_scale_non_cf_conforming(self, dtype, offset_cf_conforming) -> None: + def test_mask_and_scale_non_cf_conforming( + self, dtype, offset_cf_conforming + ) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) From e877aa7e10f3949c6542a127f7a9e9ef26bd8a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 23:10:08 +0200 Subject: [PATCH 11/22] convert to float32 to keep #1840 in sync --- xarray/coding/variables.py | 18 ++++++++++-------- xarray/tests/test_coding.py | 7 ++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 078ba09358b..c9093245a74 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -302,8 +302,8 @@ def _choose_float_dtype( ) -> type[np.floating[Any]]: # check scale/offset first to derive dtype # see 
https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 - scale_factor = mapping.get("scale_factor", False) - add_offset = mapping.get("add_offset", False) + scale_factor = mapping.get("scale_factor") + add_offset = mapping.get("add_offset") if scale_factor or add_offset: # get the maximum itemsize from scale_factor/add_offset to determine # the needed floating point type @@ -320,7 +320,7 @@ def _choose_float_dtype( # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if maxsize == 4 and np.issubdtype(add_offset_type, np.floating): + if maxsize == 4 or not np.issubdtype(add_offset_type, np.floating): return np.float32 else: return np.float64 @@ -350,12 +350,14 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if scale_factor or add_offset: dtype = _choose_float_dtype(data.dtype, attrs) data = data.astype(dtype=dtype, copy=True) - if add_offset: - data -= add_offset - if scale_factor: - data /= scale_factor + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor - return Variable(dims, data, attrs, encoding, fastpath=True) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable def decode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_decoding(variable) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..a245f91fa82 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -95,10 +95,11 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(dtype) -> None: +@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + ("x",), np.arange(10, dtype=packed_dtype), encoding=dict(scale_factor=unpacked_dtype(10)) ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) From 8cc7319ea24090676af2dbc9035ea117fa00e08c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 21:11:50 +0000 Subject: [PATCH 12/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_coding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index a245f91fa82..544f3c47224 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -99,7 +99,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: original = xr.Variable( - ("x",), np.arange(10, dtype=packed_dtype), encoding=dict(scale_factor=unpacked_dtype(10)) + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(scale_factor=unpacked_dtype(10)), ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) From 6a73653b08e30eea29873801bdd27cd68ef12885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 10:57:13 +0200 
Subject: [PATCH 13/22] add more comments, add more typing --- xarray/coding/variables.py | 5 +++-- xarray/tests/test_backends.py | 6 +++--- xarray/tests/test_coding.py | 5 ++++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c9093245a74..8a6141b3638 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -300,7 +300,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp def _choose_float_dtype( dtype: np.dtype, mapping: MutableMapping ) -> type[np.floating[Any]]: - # check scale/offset first to derive dtype + # check scale/offset first to derive wanted float dtype # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 scale_factor = mapping.get("scale_factor") add_offset = mapping.get("add_offset") @@ -324,6 +324,7 @@ def _choose_float_dtype( return np.float32 else: return np.float64 + # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): @@ -477,7 +478,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self, variable: Variable, name: T_Name = None) -> Variable: if variable.attrs.get("dtype", False) == "bool": dims, data, attrs, encoding = unpack_for_decoding(variable) - # overwrite encoding accordingly and remove from attrs + # overwrite (!) encoding accordingly and remove from attrs encoding["dtype"] = attrs.pop("dtype") data = BoolTypeArray(data) return Variable(dims, data, attrs, encoding, fastpath=True) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9dd4a984fd7..2b01d1b6333 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -17,7 +17,7 @@ from contextlib import ExitStack from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Final, cast +from typing import TYPE_CHECKING, Any, Callable, Final, cast import numpy as np import pandas as pd @@ -893,7 +893,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + def test_roundtrip_mask_and_scale(self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]) -> None: if dtype == np.float32 and isinstance( self, (TestZarrDirectoryStore, TestZarrDictStore) ): @@ -1557,7 +1557,7 @@ def test_encoding_chunksizes_unlimited(self) -> None: assert_equal(ds, actual) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_mask_and_scale(self, dtype) -> None: + def test_mask_and_scale(self, dtype: type[np.number]) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 544f3c47224..8e47dacebbd 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -97,7 +97,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: +def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: type[np.number]) -> None: + # if 
scale_facor but no add_offset is given transform to float32 in any case + # this minimizes memory usage, see #1840, #1842 original = xr.Variable( ("x",), np.arange(10, dtype=packed_dtype), @@ -115,6 +117,7 @@ def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 + # att: scale_factor and add_offset are not conforming to cf specs here encoding = dict(scale_factor=scale_factor, add_offset=add_offset) original = xr.Variable(("x",), np.arange(10.0), encoding=encoding) coder = variables.CFScaleOffsetCoder() From d6887cd86e25333d3d69f4ae7e2c93b709b68879 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 2 Apr 2023 08:58:35 +0000 Subject: [PATCH 14/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 4 +++- xarray/tests/test_coding.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2b01d1b6333..2a3e6c1b121 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -893,7 +893,9 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]) -> None: + def test_roundtrip_mask_and_scale( + self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number] + ) -> None: if dtype == np.float32 and isinstance( self, (TestZarrDirectoryStore, TestZarrDictStore) ): diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 8e47dacebbd..7f5d6dbe8d0 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -97,7 +97,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: type[np.number]) -> None: +def test_scaling_converts_to_float32( + packed_dtype: str, unpacked_dtype: type[np.number] +) -> None: # if scale_facor but no add_offset is given transform to float32 in any case # this minimizes memory usage, see #1840, #1842 original = xr.Variable( From c8c3f14ec9be3dae7574bb3a7ffd8eb2a77e2591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 12:31:11 +0200 Subject: [PATCH 15/22] add additional test, make _choose_float_dtype more explicit --- xarray/coding/variables.py | 32 ++++++++++++++++++-------------- xarray/tests/test_coding.py | 22 +++++++++++++++++++++- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8a6141b3638..a78c861ce38 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -305,25 +305,30 @@ def _choose_float_dtype( scale_factor = mapping.get("scale_factor") add_offset = mapping.get("add_offset") if scale_factor or add_offset: - # get the maximum itemsize from scale_factor/add_offset to determine + # get the type from scale_factor/add_offset to determine # the needed floating point type - # start with minimal floating point size -> 4 byte - maxsize = 4 - if scale_factor and np.issubdtype(type(scale_factor), 
np.floating): - maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) - add_offset_type = type(add_offset) - if add_offset and np.issubdtype(add_offset_type, np.floating): - maxsize = max(maxsize, np.dtype(add_offset_type).itemsize) - # if add_offset is malformed (eg. no float32 or no float64 as - # cf conventions expects): + if scale_factor: + scale_type = type(scale_factor) + if add_offset: + offset_type = type(add_offset) + # CF conforming, both scale_factor and add-offset are given and + # of same floating point type + if ( + add_offset + and scale_factor + and offset_type == scale_type + and np.issubdtype(scale_type, np.floating) + ): + return np.dtype(scale_type) + # Not CF conforming and add_offset given: # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if maxsize == 4 or not np.issubdtype(add_offset_type, np.floating): - return np.float32 - else: + if add_offset: return np.float64 + # return float32 in other cases where only scale_factor is given + return np.float32 # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" @@ -355,7 +360,6 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: data -= add_offset if scale_factor: data /= scale_factor - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 7f5d6dbe8d0..17320892108 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -100,7 +100,7 @@ def test_coder_roundtrip() -> None: def test_scaling_converts_to_float32( packed_dtype: str, unpacked_dtype: type[np.number] ) -> None: - # if scale_facor but no add_offset is given transform to float32 in any case + # if scale_factor but no add_offset is given transform to float32 in any case # this minimizes memory usage, see #1840, #1842 original = xr.Variable( ("x",), @@ -115,6 +115,26 @@ def test_scaling_converts_to_float32( assert roundtripped.dtype == np.float32 +@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float64( + packed_dtype: str, unpacked_dtype: type[np.number] +) -> None: + # if add_offset is given, but no scale_factor transform to float64 in any case + # to prevent precision issues + original = xr.Variable( + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(add_offset=unpacked_dtype(10)), + ) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float64 + roundtripped = coder.decode(encoded) + assert_identical(original, roundtripped) + assert roundtripped.dtype == np.float64 + + @pytest.mark.parametrize("scale_factor", (10, [10])) @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: From 2f5709a3eb5f6089b924e69b666b29ed55339165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 14:08:47 +0200 Subject: [PATCH 16/22] Apply suggestions from code review typing by @Illviljan Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/coding/variables.py | 7 ++----- 
From 2f5709a3eb5f6089b924e69b666b29ed55339165 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Sun, 2 Apr 2023 14:08:47 +0200
Subject: [PATCH 16/22] Apply suggestions from code review

typing by @Illviljan

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 xarray/coding/variables.py    | 7 ++-----
 xarray/tests/test_backends.py | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index a78c861ce38..17c5a212b86 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -319,7 +319,7 @@ def _choose_float_dtype(
             and offset_type == scale_type
             and np.issubdtype(scale_type, np.floating)
         ):
-            return np.dtype(scale_type)
+            return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
         # but a large integer offset could lead to loss of precision.
@@ -509,10 +509,7 @@ class NonStringCoder(VariableCoder):
     """Encode NonString variables if dtypes differ."""
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
-            "S1",
-            str,
-        ):
+        if "dtype" in variable.encoding and variable.encoding["dtype"] not in ("S1", str):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
             dtype = np.dtype(encoding.pop("dtype"))
             if dtype != variable.dtype:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 2a3e6c1b121..214c1616b79 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -894,7 +894,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         ],
     )
     def test_roundtrip_mask_and_scale(
-        self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]
+        self, decoded_fn: Callable[[type[np.number]], Dataset], encoded_fn: Callable[[type[np.number]], Dataset], dtype: type[np.number]
     ) -> None:
         if dtype == np.float32 and isinstance(
             self, (TestZarrDirectoryStore, TestZarrDictStore)
@@ -1589,7 +1589,7 @@ def test_mask_and_scale(self, dtype: type[np.number]) -> None:
     @pytest.mark.parametrize("dtype", [np.float32, np.float64])
     @pytest.mark.parametrize("offset_cf_conforming", [True, False])
     def test_mask_and_scale_non_cf_conforming(
-        self, dtype, offset_cf_conforming
+        self, dtype: type[np.number], offset_cf_conforming: bool
     ) -> None:
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode="w") as nc:

From f215d01de62a2e120ab520c9b9a2708768d28208 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 2 Apr 2023 12:09:23 +0000
Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 xarray/coding/variables.py    | 5 ++++-
 xarray/tests/test_backends.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 17c5a212b86..02fdcea0189 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -509,7 +509,10 @@ class NonStringCoder(VariableCoder):
     """Encode NonString variables if dtypes differ."""
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "dtype" in variable.encoding and variable.encoding["dtype"] not in ("S1", str):
+        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
+            "S1",
+            str,
+        ):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
             dtype = np.dtype(encoding.pop("dtype"))
             if dtype != variable.dtype:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 214c1616b79..82c2de60753 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -894,7 +894,10 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         ],
     )
     def test_roundtrip_mask_and_scale(
-        self, decoded_fn: Callable[[type[np.number]], Dataset], encoded_fn: Callable[[type[np.number]], Dataset], dtype: type[np.number]
+        self,
+        decoded_fn: Callable[[type[np.number]], Dataset],
+        encoded_fn: Callable[[type[np.number]], Dataset],
+        dtype: type[np.number],
     ) -> None:
         if dtype == np.float32 and isinstance(
             self, (TestZarrDirectoryStore, TestZarrDictStore)

From c5cd53d7f1500bb9fcdb97e38ca3f9ad5988e7e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Sun, 2 Apr 2023 17:41:09 +0200
Subject: [PATCH 18/22] Apply suggestions from code review

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 xarray/coding/variables.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 02fdcea0189..f20a7f7f890 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -100,14 +100,14 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array",)
 
-    def __init__(self, array):
+    def __init__(self, array) -> None:
         self.array = indexing.as_indexable(array)
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))
 
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> np.ndarray:
         return np.asarray(self.array[key], dtype=self.dtype)
 
 
@@ -132,14 +132,14 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array",)
 
-    def __init__(self, array):
+    def __init__(self, array) -> None:
         self.array = indexing.as_indexable(array)
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return np.dtype("bool")
 
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> np.ndarray:
         return np.asarray(self.array[key], dtype=self.dtype)

From 986ffc6ef6fd4c12b5137fdc3d3a91a1fbca452a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Mon, 3 Apr 2023 10:50:16 +0200
Subject: [PATCH 19/22] Test against None, as 0 (False) is a valid input for
 add_offset. Check for float32/64 only.

---
 xarray/coding/variables.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index f20a7f7f890..2676b42e04b 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -304,20 +304,20 @@ def _choose_float_dtype(
     # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
     scale_factor = mapping.get("scale_factor")
     add_offset = mapping.get("add_offset")
-    if scale_factor or add_offset:
+    if scale_factor is not None or add_offset is not None:
         # get the type from scale_factor/add_offset to determine
         # the needed floating point type
-        if scale_factor:
+        if scale_factor is not None:
             scale_type = type(scale_factor)
-        if add_offset:
+        if add_offset is not None:
             offset_type = type(add_offset)
         # CF conforming, both scale_factor and add-offset are given and
-        # of same floating point type
+        # of same floating point type (float32/64)
         if (
-            add_offset
-            and scale_factor
+            add_offset is not None
+            and scale_factor is not None
             and offset_type == scale_type
-            and np.issubdtype(scale_type, np.floating)
+            and scale_type in [np.float32, np.float64]
         ):
             return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
         # but a large integer offset could lead to loss of precision.
         # Sensitivity analysis can be tricky, so we just use a float64
         # if there's any offset at all - better unoptimised than wrong!
-        if add_offset:
+        if add_offset is not None:
             return np.float64
         # return float32 in other cases where only scale_factor is given
         return np.float32
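The distinction matters because 0 is a valid offset but a falsy value, so the previous truthiness tests silently ignored it (a short illustration; the mapping below is arbitrary):

    import numpy as np

    # a CF conforming pair whose offset happens to be zero
    mapping = {"scale_factor": np.float64(0.1), "add_offset": np.float64(0.0)}

    assert mapping["add_offset"] is not None  # detected by the new checks
    assert not mapping["add_offset"]  # 0.0 is falsy, missed by `if add_offset:`
    # with truthiness tests the CF conforming branch was skipped and float32
    # chosen; with `is not None` the pair is detected and float64 is returned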
From 51f751535cb467cae1147d2583020ba624af6fbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:24:12 +0200
Subject: [PATCH 20/22] apply astype in the right order to prevent loss of
 precision, more fillvalue fixes

---
 xarray/coding/variables.py | 77 +++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 2676b42e04b..7c9581fef74 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -220,34 +220,38 @@ class CFMaskCoder(VariableCoder):
 
     def encode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
+        # get dtype from encoding if available, otherwise use data.dtype
         dtype = np.dtype(encoding.get("dtype", data.dtype))
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
 
-        if (
-            fv is not None
-            and mv is not None
-            and not duck_array_ops.allclose_or_equiv(fv, mv)
-        ):
-            raise ValueError(
-                f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
-            )
-
-        if fv is not None:
-            # Ensure _FillValue is cast to same dtype as data's
-            encoding["_FillValue"] = dtype.type(fv)
-            fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
-            if not pd.isnull(fill_value):
-                data = duck_array_ops.fillna(data, fill_value)
-
-        if mv is not None:
-            # Ensure missing_value is cast to same dtype as data's
-            encoding["missing_value"] = dtype.type(mv)
-            fill_value = pop_to(encoding, attrs, "missing_value", name=name)
-            if not pd.isnull(fill_value) and fv is None:
-                data = duck_array_ops.fillna(data, fill_value)
+        if fv is not None or mv is not None:
+            if (
+                fv is not None
+                and mv is not None
+                and not duck_array_ops.allclose_or_equiv(fv, mv)
+            ):
+                raise ValueError(
+                    f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
+                )
 
-        return Variable(dims, data, attrs, encoding, fastpath=True)
+            if fv is not None:
+                # Ensure _FillValue is cast to same dtype as data's
+                encoding["_FillValue"] = dtype.type(fv)
+                fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
+                if not pd.isnull(fill_value):
+                    data = duck_array_ops.fillna(data, fill_value)
+
+            if mv is not None:
+                # Only use mv if _FillValue isn't available
+                # Ensure missing_value is cast to same dtype as data's
+                encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
+                fill_value = pop_to(encoding, attrs, "missing_value", name=name)
+                if not pd.isnull(fill_value) and fv is None:
+                    data = duck_array_ops.fillna(data, fill_value)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
 
     def decode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)
@@ -272,7 +276,13 @@ def decode(self, variable: Variable, name: T_Name = None):
                     stacklevel=3,
                 )
 
-            dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+            if "scale_factor" not in attrs and "add_offset" not in attrs:
+                dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+            else:
+                dtype, decoded_fill_value = (
+                    _choose_float_dtype(data.dtype, attrs),
+                    np.nan,
+                )
 
             if encoded_fill_values:
                 transform = partial(
@@ -319,6 +329,10 @@ def _choose_float_dtype(
             and offset_type == scale_type
             and scale_type in [np.float32, np.float64]
         ):
+            # in case of int32 -> we need upcast to float64
+            # due to precision issues
+            if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
+                return np.float64
             return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
@@ -354,7 +368,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
         add_offset = pop_to(encoding, attrs, "add_offset", name=name)
         if scale_factor or add_offset:
-            dtype = _choose_float_dtype(data.dtype, attrs)
+            # if we have a _FillValue/missing_value we do not want to cast now
+            # but leave that to CFMaskCoder
+            dtype = data.dtype
+            if "_FillValue" not in encoding and "missing_value" not in encoding:
+                dtype = _choose_float_dtype(data.dtype, attrs)
+            # but we still need a copy to prevent changing the original data
            data = data.astype(dtype=dtype, copy=True)
             if add_offset:
                 data -= add_offset
@@ -373,7 +392,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:
                 add_offset = np.asarray(add_offset).item()
-            dtype = _choose_float_dtype(data.dtype, encoding)
+            # if we have a _FillValue/missing_value we already have the wanted
+            # floating point dtype here (via CFMaskCoder), so no check is necessary
+            # only check in other cases
+            dtype = data.dtype
+            if "_FillValue" not in encoding and "missing_value" not in encoding:
+                dtype = _choose_float_dtype(dtype, encoding)
+
             transform = partial(
                 _scale_offset_decoding,
                 scale_factor=scale_factor,
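The net effect of this patch: when a fill value is present, CFScaleOffsetCoder keeps the data floating point and leaves the cast to the packed dtype to the later coders, so NaNs are filled before any integer conversion. A rough sketch of the resulting encode order (illustrative only; encode_cf_variable composes the coders in this sequence internally):

    import numpy as np
    import xarray as xr
    from xarray.coding import variables

    var = xr.Variable(
        ("x",),
        np.array([10.0, np.nan, 12.0]),
        encoding=dict(dtype="i2", scale_factor=np.float64(2), _FillValue=np.int16(-1)),
    )
    # scale/offset first: data stays float64 because a _FillValue is present
    scaled = variables.CFScaleOffsetCoder().encode(var)
    # the mask coder then fills NaN with the _FillValue; the cast to int16
    # happens last, via the "dtype" entry left in the encoding
    masked = variables.CFMaskCoder().encode(scaled)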
From 8c074f0bf4170a0fdf4b324a0c20e16fccbe391f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:24:30 +0200
Subject: [PATCH 21/22] add mask/scale roundtrip test for variable cf
 encoding/decoding

---
 xarray/tests/test_coding.py | 70 ++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 17320892108..95c3288d7b7 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -95,17 +95,71 @@ def test_coder_roundtrip() -> None:
     assert_identical(original, roundtripped)
 
 
-@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32])
+@pytest.mark.parametrize("ptype", "u1 u2 u4 i1 i2 i4".split())
+@pytest.mark.parametrize("utype", "f4 f8".split())
+def test_mask_scale_roundtrip(utype: str, ptype: str) -> None:
+    # this tests cf conforming packing/unpacking via
+    # encode_cf_variable/decode_cf_variable
+    # f4->i4/u4 packing is skipped as non-conforming
+    if utype[1] == "4" and ptype[1] == "4":
+        pytest.skip("Can't pack float32 into int32/uint32")
+    # fillvalues according to netCDF4
+    filldict = {
+        "i1": -127,
+        "u1": 255,
+        "i2": -32767,
+        "u2": 65535,
+        "i4": -2147483647,
+        "u4": 4294967295,
+    }
+    fillvalue = filldict[ptype]
+    unpacked_dtype = np.dtype(utype).type
+    packed_dtype = np.dtype(ptype).type
+    info = np.iinfo(packed_dtype)
+
+    # create original "encoded" Variable
+    packed_data = np.array(
+        [info.min, fillvalue, info.max - 1, info.max], dtype=packed_dtype
+    )
+    attrs = dict(
+        scale_factor=unpacked_dtype(1),
+        add_offset=unpacked_dtype(0),
+        _FillValue=packed_dtype(fillvalue),
+    )
+    original = xr.Variable(("x",), packed_data, attrs=attrs)
+
+    # create wanted "decoded" Variable
+    unpacked_data = np.array(
+        [info.min, fillvalue, info.max - 1, info.max], dtype=unpacked_dtype
+    )
+    encoding = dict(
+        scale_factor=unpacked_dtype(1),
+        add_offset=unpacked_dtype(0),
+        _FillValue=packed_dtype(fillvalue),
+    )
+    wanted = xr.Variable(("x",), unpacked_data, encoding=encoding)
+    wanted = wanted.where(wanted != fillvalue)
+
+    # decode original and compare with wanted
+    decoded = decode_cf_variable("x", original)
+    assert wanted.dtype == decoded.dtype
+    xr.testing.assert_identical(wanted, decoded)
+
+    # encode again and compare with original
+    encoded = encode_cf_variable(decoded)
+    assert original.dtype == encoded.dtype
+    xr.testing.assert_identical(original, encoded)
+
+
+@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split())
 @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split())
-def test_scaling_converts_to_float32(
-    packed_dtype: str, unpacked_dtype: type[np.number]
-) -> None:
+def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: str) -> None:
     # if scale_factor but no add_offset is given transform to float32 in any case
     # this minimizes memory usage, see #1840, #1842
     original = xr.Variable(
         ("x",),
         np.arange(10, dtype=packed_dtype),
-        encoding=dict(scale_factor=unpacked_dtype(10)),
+        encoding=dict(scale_factor=np.dtype(unpacked_dtype).type(10)),
     )
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
@@ -115,7 +169,7 @@ def test_scaling_converts_to_float32(
     assert roundtripped.dtype == np.float32
 
 
-@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32])
+@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split())
 @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split())
 def test_scaling_converts_to_float64(
     packed_dtype: str, unpacked_dtype: type[np.number]
@@ -125,7 +179,7 @@ def test_scaling_converts_to_float64(
     original = xr.Variable(
         ("x",),
         np.arange(10, dtype=packed_dtype),
-        encoding=dict(add_offset=unpacked_dtype(10)),
+        encoding=dict(add_offset=np.dtype(unpacked_dtype).type(10)),
     )
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
@@ -139,7 +193,7 @@ def test_scaling_converts_to_float64(
 @pytest.mark.parametrize("add_offset", (0.1, [0.1]))
 def test_scaling_offset_as_list(scale_factor, add_offset) -> None:
     # test for #4631
-    # att: scale_factor and add_offset are not conforming to cf specs here
+    # attention: scale_factor and add_offset are not conforming to cf specs here
     encoding = dict(scale_factor=scale_factor, add_offset=add_offset)
     original = xr.Variable(("x",), np.arange(10.0), encoding=encoding)
     coder = variables.CFScaleOffsetCoder()

From 031cac5f63827f359824e87bf65a1009f22b7ac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:43:25 +0200
Subject: [PATCH 22/22] separate CFMaskCoder test for multiple
 missing_values/_FillValues conflict, use _FillValue for missing_value if
 available

---
 xarray/coding/variables.py  |  3 ++-
 xarray/tests/test_coding.py | 10 ++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 7c9581fef74..a5a41718648 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -243,7 +243,8 @@ def encode(self, variable: Variable, name: T_Name = None):
                     data = duck_array_ops.fillna(data, fill_value)
 
             if mv is not None:
-                # Only use mv if _FillValue isn't available
+                # Use _FillValue if available to align missing_value to prevent issues
+                # when decoding
                 # Ensure missing_value is cast to same dtype as data's
                 encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
                 fill_value = pop_to(encoding, attrs, "missing_value", name=name)
diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 95c3288d7b7..406921a8556 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -51,9 +51,15 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None
     assert encoded.dtype == encoded.attrs["missing_value"].dtype
     assert encoded.dtype == encoded.attrs["_FillValue"].dtype
 
+
+def test_CFMaskCoder_multiple_missing_values_conflict() -> None:
+    data = np.array([0.0, -1.0, 1.0])
+    attrs = dict(_FillValue=np.float64(1e20), missing_value=np.float64(1e21))
+    original = xr.Variable(("x",), data, attrs=attrs)
     with pytest.warns(variables.SerializationWarning):
-        roundtripped = decode_cf_variable("foo", encoded)
-        assert_identical(roundtripped, original)
+        decoded = decode_cf_variable("foo", original)
+    with pytest.raises(ValueError):
+        encode_cf_variable(decoded)
 
 
 def test_CFMaskCoder_missing_value() -> None: