From d19d57a4663adb6d3e5b28784f10fe7313b75747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 10:18:08 +0200 Subject: [PATCH 01/22] implement coders, adapt tests --- xarray/coding/variables.py | 162 +++++++++++++++++++++++++++++++ xarray/conventions.py | 137 ++------------------------ xarray/tests/test_conventions.py | 6 +- 3 files changed, 175 insertions(+), 130 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4107b3aa883..1cd0803730f 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -78,6 +78,72 @@ def __repr__(self) -> str: ) +class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from non-native to native endianness + + This is useful for decoding arrays from netCDF3 files (which are all + big endian) into native endianness, so they can be used with Cython + functions, such as those found in bottleneck and pandas. + + >>> x = np.arange(5, dtype=">i2") + + >>> x.dtype + dtype('>i2') + + >>> NativeEndiannessArray(x).dtype + dtype('int16') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> NativeEndiannessArray(x)[indexer].dtype + dtype('int16') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + +class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from integer to boolean datatype + + This is useful for decoding boolean arrays from integer typed netCDF + variables. + + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") + + >>> x.dtype + dtype('int8') + + >>> BoolTypeArray(x).dtype + dtype('bool') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> BoolTypeArray(x)[indexer].dtype + dtype('bool') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype("bool") + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + + def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. 
Parameters @@ -349,3 +415,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + + +class DefaultFillvalueCoder(VariableCoder): + """Encode default _FillValue if needed.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_encoding(variable) + # make NaN the fill value for float types + if ( + "_FillValue" not in attrs + and "_FillValue" not in encoding + and np.issubdtype(variable.dtype, np.floating) + ): + attrs["_FillValue"] = variable.dtype.type(np.nan) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + raise NotImplementedError() + + +class BooleanCoder(VariableCoder): + """Code boolean values.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if ( + (variable.dtype == bool) + and ("dtype" not in variable.encoding) + and ("dtype" not in variable.attrs) + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + attrs["dtype"] = "bool" + data = duck_array_ops.astype(data, dtype="i1", copy=True) + + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.attrs.get("dtype", False) == "bool": + dims, data, attrs, encoding = unpack_for_decoding(variable) + del attrs["dtype"] + data = BoolTypeArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class EndianCoder(VariableCoder): + """Decode Endianness to native.""" + + def encode(self): + raise NotImplementedError() + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_decoding(variable) + if not data.dtype.isnative: + data = NativeEndiannessArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class NonStringCoder(VariableCoder): + """Encode NonString variables if dtypes differ.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if "dtype" in variable.encoding and variable.encoding["dtype"] not in ( + "S1", + str, + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + dtype = np.dtype(encoding.pop("dtype")) + if dtype != variable.dtype: + if np.issubdtype(dtype, np.integer): + if ( + np.issubdtype(variable.dtype, np.floating) + and "_FillValue" not in variable.attrs + and "missing_value" not in variable.attrs + ): + warnings.warn( + f"saving variable {name} with floating " + "point data as an integer dtype without " + "any _FillValue to use for NaNs", + SerializationWarning, + stacklevel=10, + ) + data = np.around(data) + data = data.astype(dtype=dtype) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self): + raise NotImplementedError() + diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..f7cdc0bc7d3 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -48,123 +48,10 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from non-native to native endianness - - This is useful for decoding arrays from netCDF3 files (which are all - big endian) into native endianness, so they can be used with Cython - 
functions, such as those found in bottleneck and pandas. - - >>> x = np.arange(5, dtype=">i2") - - >>> x.dtype - dtype('>i2') - - >>> NativeEndiannessArray(x).dtype - dtype('int16') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> NativeEndiannessArray(x)[indexer].dtype - dtype('int16') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - -class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from integer to boolean datatype - - This is useful for decoding boolean arrays from integer typed netCDF - variables. - - >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") - - >>> x.dtype - dtype('int8') - - >>> BoolTypeArray(x).dtype - dtype('bool') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> BoolTypeArray(x)[indexer].dtype - dtype('bool') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype("bool") - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def _var_as_tuple(var: Variable) -> T_VarTuple: return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable: - if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str): - dims, data, attrs, encoding = _var_as_tuple(var) - dtype = np.dtype(encoding.pop("dtype")) - if dtype != var.dtype: - if np.issubdtype(dtype, np.integer): - if ( - np.issubdtype(var.dtype, np.floating) - and "_FillValue" not in var.attrs - and "missing_value" not in var.attrs - ): - warnings.warn( - f"saving variable {name} with floating " - "point data as an integer dtype without " - "any _FillValue to use for NaNs", - SerializationWarning, - stacklevel=10, - ) - data = np.around(data) - data = data.astype(dtype=dtype) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - -def maybe_default_fill_value(var: Variable) -> Variable: - # make NaN the fill value for float types: - if ( - "_FillValue" not in var.attrs - and "_FillValue" not in var.encoding - and np.issubdtype(var.dtype, np.floating) - ): - var.attrs["_FillValue"] = var.dtype.type(np.nan) - return var - - -def maybe_encode_bools(var: Variable) -> Variable: - if ( - (var.dtype == bool) - and ("dtype" not in var.encoding) - and ("dtype" not in var.attrs) - ): - dims, data, attrs, encoding = _var_as_tuple(var) - attrs["dtype"] = "bool" - data = duck_array_ops.astype(data, dtype="i1", copy=True) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def _infer_dtype(array, name: T_Name = None) -> np.dtype: """Given an object array with no missing values, infer its dtype from its first element @@ -292,13 +179,13 @@ def encode_cf_variable( variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), variables.UnsignedIntegerCoder(), + variables.NonStringCoder(), + variables.DefaultFillvalueCoder(), + variables.BooleanCoder(), ]: var = coder.encode(var, name=name) - # TODO(shoyer): convert all of these to use coders, too: - var = maybe_encode_nonstring_dtype(var, name=name) - var = maybe_default_fill_value(var) - var = maybe_encode_bools(var) + # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: var = 
ensure_dtype_not_object(var, name=name) for attr_name in CF_RELATED_DATA: @@ -389,19 +276,15 @@ def decode_cf_variable( if decode_times: var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) - dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - # TODO(shoyer): convert everything below to use coders + if decode_endianness and not var.dtype.isnative: + var = variables.EndianCoder().decode(var) + original_dtype = var.dtype - if decode_endianness and not data.dtype.isnative: - # do this last, so it's only done if we didn't already unmask/scale - data = NativeEndiannessArray(data) - original_dtype = data.dtype + var = variables.BooleanCoder().decode(var) - encoding.setdefault("dtype", original_dtype) + dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - if "dtype" in attributes and attributes["dtype"] == "bool": - del attributes["dtype"] - data = BoolTypeArray(data) + encoding.setdefault("dtype", original_dtype) if not is_duck_dask_array(data): data = indexing.LazilyIndexedArray(data) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..6d219f09e0e 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -32,7 +32,7 @@ class TestBoolTypeArray: def test_booltype_array(self) -> None: x = np.array([1, 0, 1, 1, 0], dtype="i1") - bx = conventions.BoolTypeArray(x) + bx = coding.variables.BoolTypeArray(x) assert bx.dtype == bool assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool)) @@ -41,7 +41,7 @@ class TestNativeEndiannessArray: def test(self) -> None: x = np.arange(5, dtype=">i8") expected = np.arange(5, dtype="int64") - a = conventions.NativeEndiannessArray(x) + a = coding.variables.NativeEndiannessArray(x) assert a.dtype == expected.dtype assert a.dtype == expected[:].dtype assert_array_equal(a, expected) @@ -247,7 +247,7 @@ def test_decode_coordinates(self) -> None: def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) - actual = conventions.maybe_encode_nonstring_dtype(original) + actual = coding.variables.NonStringCoder().encode(original) assert_identical(expected, actual) def test_decode_cf_with_multiple_missing_values(self) -> None: From 2e750e4b2a942d07ead45cdfdf461d2d0ca957ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 31 Mar 2023 09:00:51 +0200 Subject: [PATCH 02/22] preserve boolean-dtype within encoding, adapt test --- xarray/coding/variables.py | 3 ++- xarray/tests/test_backends.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 1cd0803730f..80d1740e88f 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -456,7 +456,8 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self, variable: Variable, name: T_Name = None) -> Variable: if variable.attrs.get("dtype", False) == "bool": dims, data, attrs, encoding = unpack_for_decoding(variable) - del attrs["dtype"] + # overwrite encoding accordingly and remove from attrs + encoding["dtype"] = attrs.pop("dtype") data = BoolTypeArray(data) return Variable(dims, data, attrs, encoding, fastpath=True) else: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 14b510d4c97..ccd974f37c1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -630,6 +630,9 @@ def 
test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + with self.roundtrip(actual) as actual2: + assert_identical(original, actual2) + assert actual2["x"].dtype == "bool" def test_orthogonal_indexing(self) -> None: in_memory = create_test_data() From 0cd301b9879dec1fa4a269616c53026e1a9db5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 31 Mar 2023 10:34:30 +0200 Subject: [PATCH 03/22] determine cf packed data dtype, adapt tests --- xarray/coding/variables.py | 30 +++++++++++----- xarray/tests/test_backends.py | 67 ++++++++++++++++++++--------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 80d1740e88f..834bb107987 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -298,9 +298,24 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp return data -def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]: - """Return a float dtype that can losslessly represent `dtype` values.""" - # Keep float32 as-is. Upcast half-precision to single-precision, +def _choose_float_dtype( + dtype: np.dtype, encoding: MutableMapping +) -> type[np.floating[Any]]: + # check scale/offset first to derive dtype with + if "scale_factor" in encoding or "add_offset" in encoding: + scale_factor = encoding.get("scale_factor", False) + add_offset = encoding.get("add_offset", False) + # minimal floating point size -> 4 byte + maxsize = 4 + if scale_factor and np.issubdtype(type(scale_factor), np.floating): + maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) + if add_offset and np.issubdtype(type(add_offset), np.floating): + maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize) + if maxsize == 4: + return np.float32 + else: + return np.float64 + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): return np.float32 @@ -308,10 +323,7 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. - # Sensitivity analysis can be tricky, so we just use a float64 - # if there's any offset at all - better unoptimised than wrong! - if not has_offset: - return np.float32 + return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. 
complex numbers are not supported in NetCDF) return np.float64 @@ -328,7 +340,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dtype = _choose_float_dtype(data.dtype, encoding) data = data.astype(dtype=dtype, copy=True) if "add_offset" in encoding: data -= pop_to(encoding, attrs, "add_offset", name=name) @@ -344,7 +356,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dtype = _choose_float_dtype(data.dtype, encoding) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ccd974f37c1..4c8e16c33cd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data() -> Dataset: - x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) +def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), "dtype": "i2", } return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data() -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)} +def create_encoded_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: + attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data() -> Dataset: +def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
attributes = { "_FillValue": -1, "_Unsigned": "true", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data() -> Dataset: +def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -1, "_Unsigned": True, "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype="i1") return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data() -> Dataset: +def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } - x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32) + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data() -> Dataset: +def create_encoded_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { "_FillValue": -127, "_Unsigned": "false", "add_offset": 10, - "scale_factor": np.float32(0.1), + "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([-110, 1, 127, -127], dtype="i1") @@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: with self.roundtrip(original) as actual: assert_identical(expected, actual) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( "decoded_fn, encoded_fn", [ @@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: - decoded = decoded_fn() - encoded = encoded_fn() + def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + if dtype == np.float32 and isinstance( + self, (TestZarrDirectoryStore, TestZarrDictStore) + ): + pytest.skip( + "zarr attributes (eg. 
`scale_factor` are unconditionally promoted to `float64`" + ) + decoded = decoded_fn(dtype) + encoded = encoded_fn(dtype) with self.roundtrip(decoded) as actual: for k in decoded.variables: + print(k, decoded.variables[k].dtype) assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) @@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: # make sure roundtrip encoding didn't change the # original dataset. - assert_allclose(encoded, encoded_fn(), decode_bytes=False) + assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False) with self.roundtrip(encoded) as actual: for k in decoded.variables: @@ -1533,7 +1541,8 @@ def test_encoding_chunksizes_unlimited(self) -> None: with self.roundtrip(ds) as actual: assert_equal(ds, actual) - def test_mask_and_scale(self) -> None: + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_mask_and_scale(self, dtype) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) @@ -1541,21 +1550,23 @@ def test_mask_and_scale(self) -> None: v = nc.variables["x"] v.set_auto_maskandscale(False) v.add_offset = 10 - v.scale_factor = 0.1 + v.scale_factor = dtype(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) # first make sure netCDF4 reads the masked and scaled data # correctly with nc4.Dataset(tmp_file, mode="r") as nc: expected = np.ma.array( - [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False] + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, ) actual = nc.variables["x"][:] assert_array_equal(expected, actual) # now check xarray with open_dataset(tmp_file) as ds: - expected = create_masked_and_scaled_data() + expected = create_masked_and_scaled_data(dtype) assert_identical(expected, ds) def test_0dimensional_variable(self) -> None: From 7cfc6aca3a0b761970765671f49e16cb05cffc25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 08:36:05 +0000 Subject: [PATCH 04/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/coding/variables.py | 3 +-- xarray/conventions.py | 2 +- xarray/tests/test_backends.py | 28 +++++++++++++++++++++------- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 834bb107987..6b773b9343a 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -143,7 +143,6 @@ def __getitem__(self, key): return np.asarray(self.array[key], dtype=self.dtype) - def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. 
Parameters @@ -444,6 +443,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + def decode(self, variable: Variable, name: T_Name = None) -> Variable: raise NotImplementedError() @@ -523,4 +523,3 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self): raise NotImplementedError() - diff --git a/xarray/conventions.py b/xarray/conventions.py index f7cdc0bc7d3..1506efc31e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -10,7 +10,7 @@ from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to -from xarray.core import duck_array_ops, indexing +from xarray.core import indexing from xarray.core.common import ( _contains_datetime_like_objects, contains_cftime_datetimes, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4c8e16c33cd..345e6cc9d5e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -149,14 +149,18 @@ def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Da return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_masked_and_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", @@ -168,7 +172,9 @@ def create_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { @@ -182,7 +188,9 @@ def create_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.f return Dataset({"x": ("t", sb, attributes)}) -def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_bad_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": True, @@ -194,7 +202,9 @@ def create_bad_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float return Dataset({"x": ("t", x, {}, encoding)}) -def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_bad_encoded_unsigned_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
attributes = { @@ -208,7 +218,9 @@ def create_bad_encoded_unsigned_masked_scaled_data(dtype: np.typing.DTypeLike = return Dataset({"x": ("t", sb, attributes)}) -def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_signed_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: encoding = { "_FillValue": -127, "_Unsigned": "false", @@ -220,7 +232,9 @@ def create_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_signed_masked_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_encoded_signed_masked_scaled_data( + dtype: np.typing.DTypeLike = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. attributes = { From bf4371d51f0758faac491aabbbf264eee5eb9901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:23:38 +0200 Subject: [PATCH 05/22] fix typing, fix if-clauses, add link to issue --- xarray/coding/variables.py | 38 +++++++++++++++++------------------ xarray/tests/test_backends.py | 17 ++++++++-------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 6b773b9343a..8c77e94afe4 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -298,12 +298,13 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp def _choose_float_dtype( - dtype: np.dtype, encoding: MutableMapping + dtype: np.dtype, mapping: MutableMapping ) -> type[np.floating[Any]]: - # check scale/offset first to derive dtype with - if "scale_factor" in encoding or "add_offset" in encoding: - scale_factor = encoding.get("scale_factor", False) - add_offset = encoding.get("add_offset", False) + # check scale/offset first to derive dtype + # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 + scale_factor = mapping.get("scale_factor", False) + add_offset = mapping.get("add_offset", False) + if scale_factor or add_offset: # minimal floating point size -> 4 byte maxsize = 4 if scale_factor and np.issubdtype(type(scale_factor), np.floating): @@ -337,29 +338,28 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) - - if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, encoding) + scale_factor = pop_to(encoding, attrs, "scale_factor", name=name) + add_offset = pop_to(encoding, attrs, "add_offset", name=name) + if scale_factor or add_offset: + dtype = _choose_float_dtype(data.dtype, attrs) data = data.astype(dtype=dtype, copy=True) - if "add_offset" in encoding: - data -= pop_to(encoding, attrs, "add_offset", name=name) - if "scale_factor" in encoding: - data /= pop_to(encoding, attrs, "scale_factor", name=name) + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None) -> Variable: - _attrs = variable.attrs - if "scale_factor" in _attrs or "add_offset" in _attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - - scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) - add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, 
encoding) + dims, data, attrs, encoding = unpack_for_decoding(variable) + scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) + add_offset = pop_to(attrs, encoding, "add_offset", name=name) + if scale_factor or add_offset: if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: add_offset = np.asarray(add_offset).item() + dtype = _choose_float_dtype(data.dtype, encoding) transform = partial( _scale_offset_decoding, scale_factor=scale_factor, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 345e6cc9d5e..7e9e0ffe8d9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -138,7 +138,7 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Dataset: +def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset: x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, @@ -150,7 +150,7 @@ def create_masked_and_scaled_data(dtype: np.typing.DTypeLike = np.float32) -> Da def create_encoded_masked_and_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} return Dataset( @@ -159,7 +159,7 @@ def create_encoded_masked_and_scaled_data( def create_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": 255, @@ -173,7 +173,7 @@ def create_unsigned_masked_scaled_data( def create_encoded_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. @@ -189,7 +189,7 @@ def create_encoded_unsigned_masked_scaled_data( def create_bad_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": 255, @@ -203,7 +203,7 @@ def create_bad_unsigned_masked_scaled_data( def create_bad_encoded_unsigned_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. @@ -219,7 +219,7 @@ def create_bad_encoded_unsigned_masked_scaled_data( def create_signed_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: encoding = { "_FillValue": -127, @@ -233,7 +233,7 @@ def create_signed_masked_scaled_data( def create_encoded_signed_masked_scaled_data( - dtype: np.typing.DTypeLike = np.float32, + dtype: type[np.number] = np.float32, ) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
@@ -903,7 +903,6 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: with self.roundtrip(decoded) as actual: for k in decoded.variables: - print(k, decoded.variables[k].dtype) assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) From fe006df525aec7817a279e648c85b93266d9047e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:31:17 +0200 Subject: [PATCH 06/22] add comment ass per review --- xarray/tests/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7e9e0ffe8d9..08945337851 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -644,6 +644,8 @@ def test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + # this checks for preserving dtype during second roundtrip + # see https://github.com/pydata/xarray/issues/7652#issuecomment-1476956975 with self.roundtrip(actual) as actual2: assert_identical(original, actual2) assert actual2["x"].dtype == "bool" From 66628fcdc7056a33eb6c740fdc57372a27d073df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 18:57:35 +0200 Subject: [PATCH 07/22] choose float64 if add_offset is not up to cf expectations --- xarray/coding/variables.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8c77e94afe4..078ba09358b 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -305,13 +305,22 @@ def _choose_float_dtype( scale_factor = mapping.get("scale_factor", False) add_offset = mapping.get("add_offset", False) if scale_factor or add_offset: - # minimal floating point size -> 4 byte + # get the maximum itemsize from scale_factor/add_offset to determine + # the needed floating point type + # start with minimal floating point size -> 4 byte maxsize = 4 if scale_factor and np.issubdtype(type(scale_factor), np.floating): maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) - if add_offset and np.issubdtype(type(add_offset), np.floating): - maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize) - if maxsize == 4: + add_offset_type = type(add_offset) + if add_offset and np.issubdtype(add_offset_type, np.floating): + maxsize = max(maxsize, np.dtype(add_offset_type).itemsize) + # if add_offset is malformed (eg. no float32 or no float64 as + # cf conventions expects): + # A scale factor is entirely safe (vanishing into the mantissa), + # but a large integer offset could lead to loss of precision. + # Sensitivity analysis can be tricky, so we just use a float64 + # if there's any offset at all - better unoptimised than wrong! + if maxsize == 4 and np.issubdtype(add_offset_type, np.floating): return np.float32 else: return np.float64 @@ -321,8 +330,6 @@ def _choose_float_dtype( return np.float32 # float32 can exactly represent all integers up to 24 bits if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): - # A scale factor is entirely safe (vanishing into the mantissa), - # but a large integer offset could lead to loss of precision. return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. 
complex numbers are not supported in NetCDF) From b6310e48e34d73aa0aa3b133283c4ab76a2bc7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 20:07:53 +0200 Subject: [PATCH 08/22] use scale_factor/add_offset in tests as specified by cf conventions --- xarray/tests/test_backends.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 08945337851..cd3e833e48b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -142,7 +142,7 @@ def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Datase x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), "dtype": "i2", } @@ -152,7 +152,7 @@ def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Datase def create_encoded_masked_and_scaled_data( dtype: type[np.number] = np.float32, ) -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)} + attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) @@ -165,7 +165,7 @@ def create_unsigned_masked_scaled_data( "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) @@ -180,7 +180,7 @@ def create_encoded_unsigned_masked_scaled_data( attributes = { "_FillValue": -1, "_Unsigned": "true", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned @@ -195,7 +195,7 @@ def create_bad_unsigned_masked_scaled_data( "_FillValue": 255, "_Unsigned": True, "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(0), "scale_factor": dtype(0.1), } x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) @@ -210,7 +210,7 @@ def create_bad_encoded_unsigned_masked_scaled_data( attributes = { "_FillValue": -1, "_Unsigned": True, - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned @@ -225,7 +225,7 @@ def create_signed_masked_scaled_data( "_FillValue": -127, "_Unsigned": "false", "dtype": "i1", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) @@ -240,7 +240,7 @@ def create_encoded_signed_masked_scaled_data( attributes = { "_FillValue": -127, "_Unsigned": "false", - "add_offset": 10, + "add_offset": dtype(10), "scale_factor": dtype(0.1), } # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned @@ -1564,7 +1564,7 @@ def test_mask_and_scale(self, dtype) -> None: nc.createVariable("x", "int16", ("t",), fill_value=-1) v = nc.variables["x"] v.set_auto_maskandscale(False) - v.add_offset = 10 + v.add_offset = dtype(10) v.scale_factor = dtype(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) From 06ec16a03dfdc80436b049c5ec239bcdefd4d680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 20:37:54 +0200 Subject: [PATCH 09/22] add test which checks add_offset being not conforming to cf standards --- xarray/tests/test_backends.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/xarray/tests/test_backends.py 
b/xarray/tests/test_backends.py index cd3e833e48b..939b96d3bce 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1584,6 +1584,46 @@ def test_mask_and_scale(self, dtype) -> None: expected = create_masked_and_scaled_data(dtype) assert_identical(expected, ds) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("offset_cf_conforming", [True, False]) + def test_mask_and_scale_non_cf_conforming(self, dtype, offset_cf_conforming) -> None: + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + nc.createDimension("t", 5) + nc.createVariable("x", "int16", ("t",), fill_value=-1) + v = nc.variables["x"] + v.set_auto_maskandscale(False) + if offset_cf_conforming: + v.add_offset = dtype(10) + else: + v.add_offset = 10 + v.scale_factor = dtype(0.1) + v[:] = np.array([-1, -1, 0, 1, 2]) + + # first make sure netCDF4 reads the masked and scaled data + # correctly + with nc4.Dataset(tmp_file, mode="r") as nc: + expected = np.ma.array( + [-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False], + dtype=dtype, + ) + actual = nc.variables["x"][:] + assert_array_equal(expected, actual) + + # now check xarray + with open_dataset(tmp_file) as ds: + if not offset_cf_conforming: + # if not conforming, this get's promoted to float64 in any case + # if add_offset is available + expected = create_masked_and_scaled_data(np.float64) + # here we have slight differences possibly + # due to using float32 first and casting to float64 later + assert_allclose(expected, ds) + else: + expected = create_masked_and_scaled_data(dtype) + assert_identical(expected, ds) + def test_0dimensional_variable(self) -> None: # This fix verifies our work-around to this netCDF4-python bug: # https://github.com/Unidata/netcdf4-python/pull/220 From 19ef2340fd278b3d1aa2793d7c4d659bd9baf2bd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 18:38:42 +0000 Subject: [PATCH 10/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 939b96d3bce..9dd4a984fd7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1586,7 +1586,9 @@ def test_mask_and_scale(self, dtype) -> None: @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("offset_cf_conforming", [True, False]) - def test_mask_and_scale_non_cf_conforming(self, dtype, offset_cf_conforming) -> None: + def test_mask_and_scale_non_cf_conforming( + self, dtype, offset_cf_conforming + ) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) From e877aa7e10f3949c6542a127f7a9e9ef26bd8a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 23:10:08 +0200 Subject: [PATCH 11/22] convert to float32 to keep #1840 in sync --- xarray/coding/variables.py | 18 ++++++++++-------- xarray/tests/test_coding.py | 7 ++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 078ba09358b..c9093245a74 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -302,8 +302,8 @@ def _choose_float_dtype( ) -> type[np.floating[Any]]: # check scale/offset first to derive dtype # see 
https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 - scale_factor = mapping.get("scale_factor", False) - add_offset = mapping.get("add_offset", False) + scale_factor = mapping.get("scale_factor") + add_offset = mapping.get("add_offset") if scale_factor or add_offset: # get the maximum itemsize from scale_factor/add_offset to determine # the needed floating point type @@ -320,7 +320,7 @@ def _choose_float_dtype( # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if maxsize == 4 and np.issubdtype(add_offset_type, np.floating): + if maxsize == 4 or not np.issubdtype(add_offset_type, np.floating): return np.float32 else: return np.float64 @@ -350,12 +350,14 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if scale_factor or add_offset: dtype = _choose_float_dtype(data.dtype, attrs) data = data.astype(dtype=dtype, copy=True) - if add_offset: - data -= add_offset - if scale_factor: - data /= scale_factor + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor - return Variable(dims, data, attrs, encoding, fastpath=True) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable def decode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_decoding(variable) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..a245f91fa82 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -95,10 +95,11 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(dtype) -> None: +@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + ("x",), np.arange(10, dtype=packed_dtype), encoding=dict(scale_factor=unpacked_dtype(10)) ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) From 8cc7319ea24090676af2dbc9035ea117fa00e08c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Apr 2023 21:11:50 +0000 Subject: [PATCH 12/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_coding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index a245f91fa82..544f3c47224 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -99,7 +99,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: original = xr.Variable( - ("x",), np.arange(10, dtype=packed_dtype), encoding=dict(scale_factor=unpacked_dtype(10)) + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(scale_factor=unpacked_dtype(10)), ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) From 6a73653b08e30eea29873801bdd27cd68ef12885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 10:57:13 +0200 
Subject: [PATCH 13/22] add more comments, add more typing --- xarray/coding/variables.py | 5 +++-- xarray/tests/test_backends.py | 6 +++--- xarray/tests/test_coding.py | 5 ++++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c9093245a74..8a6141b3638 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -300,7 +300,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp def _choose_float_dtype( dtype: np.dtype, mapping: MutableMapping ) -> type[np.floating[Any]]: - # check scale/offset first to derive dtype + # check scale/offset first to derive wanted float dtype # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 scale_factor = mapping.get("scale_factor") add_offset = mapping.get("add_offset") @@ -324,6 +324,7 @@ def _choose_float_dtype( return np.float32 else: return np.float64 + # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): @@ -477,7 +478,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self, variable: Variable, name: T_Name = None) -> Variable: if variable.attrs.get("dtype", False) == "bool": dims, data, attrs, encoding = unpack_for_decoding(variable) - # overwrite encoding accordingly and remove from attrs + # overwrite (!) encoding accordingly and remove from attrs encoding["dtype"] = attrs.pop("dtype") data = BoolTypeArray(data) return Variable(dims, data, attrs, encoding, fastpath=True) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9dd4a984fd7..2b01d1b6333 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -17,7 +17,7 @@ from contextlib import ExitStack from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Final, cast +from typing import TYPE_CHECKING, Any, Callable, Final, cast import numpy as np import pandas as pd @@ -893,7 +893,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: + def test_roundtrip_mask_and_scale(self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]) -> None: if dtype == np.float32 and isinstance( self, (TestZarrDirectoryStore, TestZarrDictStore) ): @@ -1557,7 +1557,7 @@ def test_encoding_chunksizes_unlimited(self) -> None: assert_equal(ds, actual) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_mask_and_scale(self, dtype) -> None: + def test_mask_and_scale(self, dtype: type[np.number]) -> None: with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: nc.createDimension("t", 5) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 544f3c47224..8e47dacebbd 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -97,7 +97,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: +def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: type[np.number]) -> None: + # if 
scale_facor but no add_offset is given transform to float32 in any case + # this minimizes memory usage, see #1840, #1842 original = xr.Variable( ("x",), np.arange(10, dtype=packed_dtype), @@ -115,6 +117,7 @@ def test_scaling_converts_to_float32(packed_dtype, unpacked_dtype) -> None: @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 + # att: scale_factor and add_offset are not conforming to cf specs here encoding = dict(scale_factor=scale_factor, add_offset=add_offset) original = xr.Variable(("x",), np.arange(10.0), encoding=encoding) coder = variables.CFScaleOffsetCoder() From d6887cd86e25333d3d69f4ae7e2c93b709b68879 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 2 Apr 2023 08:58:35 +0000 Subject: [PATCH 14/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 4 +++- xarray/tests/test_coding.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2b01d1b6333..2a3e6c1b121 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -893,7 +893,9 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) - def test_roundtrip_mask_and_scale(self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]) -> None: + def test_roundtrip_mask_and_scale( + self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number] + ) -> None: if dtype == np.float32 and isinstance( self, (TestZarrDirectoryStore, TestZarrDictStore) ): diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 8e47dacebbd..7f5d6dbe8d0 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -97,7 +97,9 @@ def test_coder_roundtrip() -> None: @pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) -def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: type[np.number]) -> None: +def test_scaling_converts_to_float32( + packed_dtype: str, unpacked_dtype: type[np.number] +) -> None: # if scale_facor but no add_offset is given transform to float32 in any case # this minimizes memory usage, see #1840, #1842 original = xr.Variable( From c8c3f14ec9be3dae7574bb3a7ffd8eb2a77e2591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 12:31:11 +0200 Subject: [PATCH 15/22] add additional test, make _choose_float_dtype more explicit --- xarray/coding/variables.py | 32 ++++++++++++++++++-------------- xarray/tests/test_coding.py | 22 +++++++++++++++++++++- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8a6141b3638..a78c861ce38 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -305,25 +305,30 @@ def _choose_float_dtype( scale_factor = mapping.get("scale_factor") add_offset = mapping.get("add_offset") if scale_factor or add_offset: - # get the maximum itemsize from scale_factor/add_offset to determine + # get the type from scale_factor/add_offset to determine # the needed floating point type - # start with minimal floating point size -> 4 byte - maxsize = 4 - if scale_factor and np.issubdtype(type(scale_factor), 
np.floating): - maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize) - add_offset_type = type(add_offset) - if add_offset and np.issubdtype(add_offset_type, np.floating): - maxsize = max(maxsize, np.dtype(add_offset_type).itemsize) - # if add_offset is malformed (eg. no float32 or no float64 as - # cf conventions expects): + if scale_factor: + scale_type = type(scale_factor) + if add_offset: + offset_type = type(add_offset) + # CF conforming, both scale_factor and add-offset are given and + # of same floating point type + if ( + add_offset + and scale_factor + and offset_type == scale_type + and np.issubdtype(scale_type, np.floating) + ): + return np.dtype(scale_type) + # Not CF conforming and add_offset given: # A scale factor is entirely safe (vanishing into the mantissa), # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if maxsize == 4 or not np.issubdtype(add_offset_type, np.floating): - return np.float32 - else: + if add_offset: return np.float64 + # return float32 in other cases where only scale_factor is given + return np.float32 # If no scale_factor or add_offset is given, use some general rules. # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" @@ -355,7 +360,6 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: data -= add_offset if scale_factor: data /= scale_factor - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 7f5d6dbe8d0..17320892108 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -100,7 +100,7 @@ def test_coder_roundtrip() -> None: def test_scaling_converts_to_float32( packed_dtype: str, unpacked_dtype: type[np.number] ) -> None: - # if scale_facor but no add_offset is given transform to float32 in any case + # if scale_factor but no add_offset is given transform to float32 in any case # this minimizes memory usage, see #1840, #1842 original = xr.Variable( ("x",), @@ -115,6 +115,26 @@ def test_scaling_converts_to_float32( assert roundtripped.dtype == np.float32 +@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32]) +@pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split()) +def test_scaling_converts_to_float64( + packed_dtype: str, unpacked_dtype: type[np.number] +) -> None: + # if add_offset is given, but no scale_factor transform to float64 in any case + # to prevent precision issues + original = xr.Variable( + ("x",), + np.arange(10, dtype=packed_dtype), + encoding=dict(add_offset=unpacked_dtype(10)), + ) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float64 + roundtripped = coder.decode(encoded) + assert_identical(original, roundtripped) + assert roundtripped.dtype == np.float64 + + @pytest.mark.parametrize("scale_factor", (10, [10])) @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: From 2f5709a3eb5f6089b924e69b666b29ed55339165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 14:08:47 +0200 Subject: [PATCH 16/22] Apply suggestions from code review typing by @Illviljan Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/coding/variables.py | 7 ++----- 
From 2f5709a3eb5f6089b924e69b666b29ed55339165 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Sun, 2 Apr 2023 14:08:47 +0200
Subject: [PATCH 16/22] Apply suggestions from code review

typing by @Illviljan

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 xarray/coding/variables.py    | 7 ++-----
 xarray/tests/test_backends.py | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index a78c861ce38..17c5a212b86 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -319,7 +319,7 @@ def _choose_float_dtype(
             and offset_type == scale_type
             and np.issubdtype(scale_type, np.floating)
         ):
-            return np.dtype(scale_type)
+            return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
         # but a large integer offset could lead to loss of precision.
@@ -509,10 +509,7 @@ class NonStringCoder(VariableCoder):
     """Encode NonString variables if dtypes differ."""
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
-            "S1",
-            str,
-        ):
+        if "dtype" in variable.encoding and variable.encoding["dtype"] not in ("S1", str):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
             dtype = np.dtype(encoding.pop("dtype"))
             if dtype != variable.dtype:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 2a3e6c1b121..214c1616b79 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -894,7 +894,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         ],
     )
     def test_roundtrip_mask_and_scale(
-        self, decoded_fn: Callable, encoded_fn: Callable, dtype: type[np.number]
+        self, decoded_fn: Callable[[type[np.number]], Dataset], encoded_fn: Callable[[type[np.number]], Dataset], dtype: type[np.number]
     ) -> None:
         if dtype == np.float32 and isinstance(
             self, (TestZarrDirectoryStore, TestZarrDictStore)
@@ -1589,7 +1589,7 @@ def test_mask_and_scale(self, dtype: type[np.number]) -> None:
     @pytest.mark.parametrize("dtype", [np.float32, np.float64])
     @pytest.mark.parametrize("offset_cf_conforming", [True, False])
     def test_mask_and_scale_non_cf_conforming(
-        self, dtype, offset_cf_conforming
+        self, dtype: type[np.number], offset_cf_conforming: bool
     ) -> None:
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode="w") as nc:

From f215d01de62a2e120ab520c9b9a2708768d28208 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 2 Apr 2023 12:09:23 +0000
Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 xarray/coding/variables.py    | 5 ++++-
 xarray/tests/test_backends.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 17c5a212b86..02fdcea0189 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -509,7 +509,10 @@ class NonStringCoder(VariableCoder):
     """Encode NonString variables if dtypes differ."""
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "dtype" in variable.encoding and variable.encoding["dtype"] not in ("S1", str):
+        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
+            "S1",
+            str,
+        ):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
             dtype = np.dtype(encoding.pop("dtype"))
             if dtype != variable.dtype:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 214c1616b79..82c2de60753 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -894,7 +894,10 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         ],
     )
     def test_roundtrip_mask_and_scale(
-        self, decoded_fn: Callable[[type[np.number]], Dataset], encoded_fn: Callable[[type[np.number]], Dataset], dtype: type[np.number]
+        self,
+        decoded_fn: Callable[[type[np.number]], Dataset],
+        encoded_fn: Callable[[type[np.number]], Dataset],
+        dtype: type[np.number],
     ) -> None:
         if dtype == np.float32 and isinstance(
             self, (TestZarrDirectoryStore, TestZarrDictStore)

From c5cd53d7f1500bb9fcdb97e38ca3f9ad5988e7e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Sun, 2 Apr 2023 17:41:09 +0200
Subject: [PATCH 18/22] Apply suggestions from code review

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 xarray/coding/variables.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 02fdcea0189..f20a7f7f890 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -100,14 +100,14 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array",)
 
-    def __init__(self, array):
+    def __init__(self, array) -> None:
         self.array = indexing.as_indexable(array)
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))
 
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> np.ndarray:
         return np.asarray(self.array[key], dtype=self.dtype)
 
 
@@ -132,14 +132,14 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array",)
 
-    def __init__(self, array):
+    def __init__(self, array) -> None:
         self.array = indexing.as_indexable(array)
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return np.dtype("bool")
 
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> np.ndarray:
         return np.asarray(self.array[key], dtype=self.dtype)

From 986ffc6ef6fd4c12b5137fdc3d3a91a1fbca452a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Mon, 3 Apr 2023 10:50:16 +0200
Subject: [PATCH 19/22] Test against None, as 0 (False) is a valid input for
 add_offset. Check for float32/64 only.

---
 xarray/coding/variables.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index f20a7f7f890..2676b42e04b 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -304,20 +304,20 @@ def _choose_float_dtype(
     # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
     scale_factor = mapping.get("scale_factor")
     add_offset = mapping.get("add_offset")
-    if scale_factor or add_offset:
+    if scale_factor is not None or add_offset is not None:
         # get the type from scale_factor/add_offset to determine
         # the needed floating point type
-        if scale_factor:
+        if scale_factor is not None:
             scale_type = type(scale_factor)
-        if add_offset:
+        if add_offset is not None:
             offset_type = type(add_offset)
         # CF conforming, both scale_factor and add-offset are given and
-        # of same floating point type
+        # of same floating point type (float32/64)
         if (
-            add_offset
-            and scale_factor
+            add_offset is not None
+            and scale_factor is not None
             and offset_type == scale_type
-            and np.issubdtype(scale_type, np.floating)
+            and scale_type in [np.float32, np.float64]
         ):
             return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
         # but a large integer offset could lead to loss of precision.
         # Sensitivity analysis can be tricky, so we just use a float64
         # if there's any offset at all - better unoptimised than wrong!
-        if add_offset:
+        if add_offset is not None:
             return np.float64
         # return float32 in other cases where only scale_factor is given
         return np.float32
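The distinction matters because 0 is a valid offset but a falsy value, so the previous truthiness tests silently ignored it (a short illustration; the mapping below is arbitrary):

    import numpy as np

    # a CF conforming pair whose offset happens to be zero
    mapping = {"scale_factor": np.float64(0.1), "add_offset": np.float64(0.0)}

    assert mapping["add_offset"] is not None  # detected by the new checks
    assert not mapping["add_offset"]  # 0.0 is falsy, missed by `if add_offset:`
    # with truthiness tests the CF conforming branch was skipped and float32
    # chosen; with `is not None` the pair is detected and float64 is returned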
From 51f751535cb467cae1147d2583020ba624af6fbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:24:12 +0200
Subject: [PATCH 20/22] apply astype in the right order to prevent loss of
 precision, more fillvalue fixes

---
 xarray/coding/variables.py | 77 +++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 2676b42e04b..7c9581fef74 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -220,34 +220,38 @@ class CFMaskCoder(VariableCoder):
 
     def encode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
+        # get dtype from encoding if available, otherwise use data.dtype
         dtype = np.dtype(encoding.get("dtype", data.dtype))
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
 
-        if (
-            fv is not None
-            and mv is not None
-            and not duck_array_ops.allclose_or_equiv(fv, mv)
-        ):
-            raise ValueError(
-                f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
-            )
-
-        if fv is not None:
-            # Ensure _FillValue is cast to same dtype as data's
-            encoding["_FillValue"] = dtype.type(fv)
-            fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
-            if not pd.isnull(fill_value):
-                data = duck_array_ops.fillna(data, fill_value)
-
-        if mv is not None:
-            # Ensure missing_value is cast to same dtype as data's
-            encoding["missing_value"] = dtype.type(mv)
-            fill_value = pop_to(encoding, attrs, "missing_value", name=name)
-            if not pd.isnull(fill_value) and fv is None:
-                data = duck_array_ops.fillna(data, fill_value)
+        if fv is not None or mv is not None:
+            if (
+                fv is not None
+                and mv is not None
+                and not duck_array_ops.allclose_or_equiv(fv, mv)
+            ):
+                raise ValueError(
+                    f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
+                )
 
-        return Variable(dims, data, attrs, encoding, fastpath=True)
+            if fv is not None:
+                # Ensure _FillValue is cast to same dtype as data's
+                encoding["_FillValue"] = dtype.type(fv)
+                fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
+                if not pd.isnull(fill_value):
+                    data = duck_array_ops.fillna(data, fill_value)
+
+            if mv is not None:
+                # Only use mv if _FillValue isn't available
+                # Ensure missing_value is cast to same dtype as data's
+                encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
+                fill_value = pop_to(encoding, attrs, "missing_value", name=name)
+                if not pd.isnull(fill_value) and fv is None:
+                    data = duck_array_ops.fillna(data, fill_value)
+            return Variable(dims, data, attrs, encoding, fastpath=True)
+        else:
+            return variable
 
     def decode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)
@@ -272,7 +276,13 @@ def decode(self, variable: Variable, name: T_Name = None):
                     stacklevel=3,
                 )
 
-            dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+            if "scale_factor" not in attrs and "add_offset" not in attrs:
+                dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+            else:
+                dtype, decoded_fill_value = (
+                    _choose_float_dtype(data.dtype, attrs),
+                    np.nan,
+                )
 
             if encoded_fill_values:
                 transform = partial(
@@ -319,6 +329,10 @@ def _choose_float_dtype(
             and offset_type == scale_type
             and scale_type in [np.float32, np.float64]
         ):
+            # in case of int32 -> we need upcast to float64
+            # due to precision issues
+            if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
+                return np.float64
             return np.dtype(scale_type).type
         # Not CF conforming and add_offset given:
         # A scale factor is entirely safe (vanishing into the mantissa),
@@ -354,7 +368,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
         add_offset = pop_to(encoding, attrs, "add_offset", name=name)
         if scale_factor or add_offset:
-            dtype = _choose_float_dtype(data.dtype, attrs)
+            # if we have a _FillValue/missing_value we do not want to cast now
+            # but leave that to CFMaskCoder
+            dtype = data.dtype
+            if "_FillValue" not in encoding and "missing_value" not in encoding:
+                dtype = _choose_float_dtype(data.dtype, attrs)
+            # but we still need a copy to prevent changing the original data
            data = data.astype(dtype=dtype, copy=True)
             if add_offset:
                 data -= add_offset
@@ -373,7 +392,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:
                 add_offset = np.asarray(add_offset).item()
-            dtype = _choose_float_dtype(data.dtype, encoding)
+            # if we have a _FillValue/missing_value we already have the wanted
+            # floating point dtype here (via CFMaskCoder), so no check is necessary
+            # only check in other cases
+            dtype = data.dtype
+            if "_FillValue" not in encoding and "missing_value" not in encoding:
+                dtype = _choose_float_dtype(dtype, encoding)
+
             transform = partial(
                 _scale_offset_decoding,
                 scale_factor=scale_factor,
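The net effect of this patch: when a fill value is present, CFScaleOffsetCoder keeps the data floating point and leaves the cast to the packed dtype to the later coders, so NaNs are filled before any integer conversion. A rough sketch of the resulting encode order (illustrative only; encode_cf_variable composes the coders in this sequence internally):

    import numpy as np
    import xarray as xr
    from xarray.coding import variables

    var = xr.Variable(
        ("x",),
        np.array([10.0, np.nan, 12.0]),
        encoding=dict(dtype="i2", scale_factor=np.float64(2), _FillValue=np.int16(-1)),
    )
    # scale/offset first: data stays float64 because a _FillValue is present
    scaled = variables.CFScaleOffsetCoder().encode(var)
    # the mask coder then fills NaN with the _FillValue; the cast to int16
    # happens last, via the "dtype" entry left in the encoding
    masked = variables.CFMaskCoder().encode(scaled)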
From 8c074f0bf4170a0fdf4b324a0c20e16fccbe391f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:24:30 +0200
Subject: [PATCH 21/22] add mask/scale roundtrip test for variable cf
 encoding/decoding

---
 xarray/tests/test_coding.py | 70 ++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 17320892108..95c3288d7b7 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -95,17 +95,71 @@ def test_coder_roundtrip() -> None:
     assert_identical(original, roundtripped)
 
 
-@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32])
+@pytest.mark.parametrize("ptype", "u1 u2 u4 i1 i2 i4".split())
+@pytest.mark.parametrize("utype", "f4 f8".split())
+def test_mask_scale_roundtrip(utype: str, ptype: str) -> None:
+    # this tests cf conforming packing/unpacking via
+    # encode_cf_variable/decode_cf_variable
+    # f4->i4/u4 packing is skipped as non-conforming
+    if utype[1] == "4" and ptype[1] == "4":
+        pytest.skip("Can't pack float32 into int32/uint32")
+    # fillvalues according to netCDF4
+    filldict = {
+        "i1": -127,
+        "u1": 255,
+        "i2": -32767,
+        "u2": 65535,
+        "i4": -2147483647,
+        "u4": 4294967295,
+    }
+    fillvalue = filldict[ptype]
+    unpacked_dtype = np.dtype(utype).type
+    packed_dtype = np.dtype(ptype).type
+    info = np.iinfo(packed_dtype)
+
+    # create original "encoded" Variable
+    packed_data = np.array(
+        [info.min, fillvalue, info.max - 1, info.max], dtype=packed_dtype
+    )
+    attrs = dict(
+        scale_factor=unpacked_dtype(1),
+        add_offset=unpacked_dtype(0),
+        _FillValue=packed_dtype(fillvalue),
+    )
+    original = xr.Variable(("x",), packed_data, attrs=attrs)
+
+    # create wanted "decoded" Variable
+    unpacked_data = np.array(
+        [info.min, fillvalue, info.max - 1, info.max], dtype=unpacked_dtype
+    )
+    encoding = dict(
+        scale_factor=unpacked_dtype(1),
+        add_offset=unpacked_dtype(0),
+        _FillValue=packed_dtype(fillvalue),
+    )
+    wanted = xr.Variable(("x",), unpacked_data, encoding=encoding)
+    wanted = wanted.where(wanted != fillvalue)
+
+    # decode original and compare with wanted
+    decoded = decode_cf_variable("x", original)
+    assert wanted.dtype == decoded.dtype
+    xr.testing.assert_identical(wanted, decoded)
+
+    # encode again and compare with original
+    encoded = encode_cf_variable(decoded)
+    assert original.dtype == encoded.dtype
+    xr.testing.assert_identical(original, encoded)
+
+
+@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split())
 @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split())
-def test_scaling_converts_to_float32(
-    packed_dtype: str, unpacked_dtype: type[np.number]
-) -> None:
+def test_scaling_converts_to_float32(packed_dtype: str, unpacked_dtype: str) -> None:
     # if scale_factor but no add_offset is given transform to float32 in any case
     # this minimizes memory usage, see #1840, #1842
     original = xr.Variable(
         ("x",),
         np.arange(10, dtype=packed_dtype),
-        encoding=dict(scale_factor=unpacked_dtype(10)),
+        encoding=dict(scale_factor=np.dtype(unpacked_dtype).type(10)),
     )
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
@@ -115,7 +169,7 @@ def test_scaling_converts_to_float32(
     assert roundtripped.dtype == np.float32
 
 
-@pytest.mark.parametrize("unpacked_dtype", [np.float32, np.float64, np.int32])
+@pytest.mark.parametrize("unpacked_dtype", "f4 f8 i4".split())
 @pytest.mark.parametrize("packed_dtype", "u1 u2 i1 i2 f2 f4".split())
 def test_scaling_converts_to_float64(
     packed_dtype: str, unpacked_dtype: type[np.number]
@@ -125,7 +179,7 @@ def test_scaling_converts_to_float64(
     original = xr.Variable(
         ("x",),
         np.arange(10, dtype=packed_dtype),
-        encoding=dict(add_offset=unpacked_dtype(10)),
+        encoding=dict(add_offset=np.dtype(unpacked_dtype).type(10)),
     )
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
@@ -139,7 +193,7 @@ def test_scaling_converts_to_float64(
 @pytest.mark.parametrize("add_offset", (0.1, [0.1]))
 def test_scaling_offset_as_list(scale_factor, add_offset) -> None:
     # test for #4631
-    # att: scale_factor and add_offset are not conforming to cf specs here
+    # attention: scale_factor and add_offset are not conforming to cf specs here
     encoding = dict(scale_factor=scale_factor, add_offset=add_offset)
     original = xr.Variable(("x",), np.arange(10.0), encoding=encoding)
     coder = variables.CFScaleOffsetCoder()

From 031cac5f63827f359824e87bf65a1009f22b7ac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Tue, 4 Apr 2023 15:43:25 +0200
Subject: [PATCH 22/22] separate CFMaskCoder test for multiple
 missing_values/_FillValues conflict, use _FillValue for missing_value if
 available

---
 xarray/coding/variables.py  |  3 ++-
 xarray/tests/test_coding.py | 10 ++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 7c9581fef74..a5a41718648 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -243,7 +243,8 @@ def encode(self, variable: Variable, name: T_Name = None):
                     data = duck_array_ops.fillna(data, fill_value)
 
             if mv is not None:
-                # Only use mv if _FillValue isn't available
+                # Use _FillValue if available to align missing_value to prevent issues
+                # when decoding
                 # Ensure missing_value is cast to same dtype as data's
                 encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
                 fill_value = pop_to(encoding, attrs, "missing_value", name=name)
diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 95c3288d7b7..406921a8556 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -51,9 +51,15 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None
     assert encoded.dtype == encoded.attrs["missing_value"].dtype
     assert encoded.dtype == encoded.attrs["_FillValue"].dtype
 
+
+def test_CFMaskCoder_multiple_missing_values_conflict() -> None:
+    data = np.array([0.0, -1.0, 1.0])
+    attrs = dict(_FillValue=np.float64(1e20), missing_value=np.float64(1e21))
+    original = xr.Variable(("x",), data, attrs=attrs)
     with pytest.warns(variables.SerializationWarning):
-        roundtripped = decode_cf_variable("foo", encoded)
-        assert_identical(roundtripped, original)
+        decoded = decode_cf_variable("foo", original)
+    with pytest.raises(ValueError):
+        encode_cf_variable(decoded)
 
 
 def test_CFMaskCoder_missing_value() -> None: