Automate interpretation of _Unsigned attribute #1453

Merged: 16 commits, Jul 28, 2017
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -24,6 +24,10 @@ Enhancements
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
raster files are opened with :py:func:`~xarray.open_rasterio`.
By `Greg Brener <https://github.com/gbrener>`_
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a
signed integer data type should be interpreted as unsigned bytes
(:issue:`1444`).
By `Eric Bruning <https://github.com/deeplycloudy>`_.

Bug fixes
~~~~~~~~~
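For context, here is a minimal usage sketch of the ``_Unsigned`` enhancement described in the whats-new entry above. The file name and variable are hypothetical; the variable is assumed to be stored as a signed byte with ``_Unsigned = 'true'``, ``_FillValue``, ``scale_factor`` and ``add_offset`` attributes.

import xarray as xr

# 'example.nc' is a hypothetical file whose variable 'x' is stored as int8
# with _Unsigned='true'. With CF decoding enabled (the default), the signed
# bytes are reinterpreted as uint8 before masking and scaling are applied.
ds = xr.open_dataset('example.nc')

# Opening with decoding disabled exposes the raw signed storage values and
# leaves the _Unsigned attribute in place.
raw = xr.open_dataset('example.nc', decode_cf=False)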
3 changes: 3 additions & 0 deletions xarray/backends/pynio_.py
@@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False):
import Nio
opener = functools.partial(Nio.open_file, filename, mode=mode)
self.ds = opener()
# xarray provides its own support for FillValue,
# so turn off PyNIO's support for the same.
self.ds.set_option('MaskedArrayMode', 'MaskedNever')
self._autoclose = autoclose
self._isopen = True
self._opener = opener
61 changes: 57 additions & 4 deletions xarray/conventions.py
@@ -534,6 +534,34 @@ def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)


class UnsignedIntTypeArray(utils.NDArrayMixin):
"""Decode arrays on the fly from signed integer to unsigned
integer. Typically used when _Unsigned is set as a netCDF
attribute on a signed integer variable.

>>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')

>>> sb.dtype
dtype('int8')

>>> UnsignedIntTypeArray(sb).dtype
dtype('uint8')

>>> UnsignedIntTypeArray(sb)[:]
array([ 0, 1, 127, 128, 255], dtype=uint8)
"""
def __init__(self, array):
self.array = array
self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize)

@property
def dtype(self):
return self.unsigned_dtype

def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)


def string_to_char(arr):
"""Like netCDF4.stringtochar, but faster and more flexible.
"""
@@ -637,6 +665,14 @@ def maybe_encode_dtype(var, name=None):
'any _FillValue to use for NaNs' % name,
RuntimeWarning, stacklevel=3)
data = duck_array_ops.around(data)[...]
if encoding.get('_Unsigned', False):
signed_dtype = 'i%s' % dtype.itemsize
if '_FillValue' in var.attrs:
old_fill = np.asarray(attrs['_FillValue'])
new_fill = old_fill.astype(signed_dtype)
attrs['_FillValue'] = new_fill
data = data.astype(signed_dtype)
pop_to(encoding, attrs, '_Unsigned')
if dtype == 'S1' and data.dtype != 'S1':
data = string_to_char(np.asarray(data, 'S'))
dims = dims + ('string%s' % data.shape[-1],)
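A rough, numpy-only sketch of what the ``_Unsigned`` branch added to ``maybe_encode_dtype`` above does when writing: both the data and any ``_FillValue`` are cast back to the signed on-disk dtype (the values here are illustrative, matching the test data added later in this PR).

import numpy as np

data = np.array([0, 1, 127, 128, 255], dtype='u1')  # in-memory unsigned values
fill_value = np.asarray(255)                         # unsigned fill value

signed_dtype = 'i%s' % data.dtype.itemsize           # 'i1'
fill_value.astype(signed_dtype)                      # -1, the on-disk fill value
data.astype(signed_dtype)                            # array([0, 1, 127, -128, -1], dtype=int8)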
@@ -761,7 +797,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
(using _FillValue). If the _Unsigned attribute is present,
treat integer arrays as unsigned.
decode_times : bool
Decode cf times ('hours since 2000-01-01') to np.datetime64.
decode_endianness : bool
@@ -786,6 +823,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
dimensions = dimensions[:-1]
data = CharToStringArray(data)

pop_to(attributes, encoding, '_Unsigned')
is_unsigned = encoding.get('_Unsigned', False)
if is_unsigned and mask_and_scale:
if data.dtype.kind == 'i':
data = UnsignedIntTypeArray(data)
else:
warnings.warn("variable has _Unsigned attribute but is not "
"of integer type. Ignoring attribute.",
RuntimeWarning, stacklevel=3)

if mask_and_scale:
if 'missing_value' in attributes:
# missing_value is deprecated, but we still want to support it as
@@ -800,20 +847,26 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
"and decoding explicitly using "
"xarray.conventions.decode_cf(ds)")
attributes['_FillValue'] = attributes.pop('missing_value')

fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
if fill_value.size > 1:
warnings.warn("variable has multiple fill values {0}, decoding "
"all values to NaN.".format(str(fill_value)),
RuntimeWarning, stacklevel=3)
scale_factor = pop_to(attributes, encoding, 'scale_factor')
add_offset = pop_to(attributes, encoding, 'add_offset')
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
scale_factor is not None or add_offset is not None):
has_fill = (fill_value is not None and
not np.any(pd.isnull(fill_value)))
if (has_fill or scale_factor is not None or add_offset is not None):
if fill_value.dtype.kind in ['U', 'S']:
dtype = object
else:
dtype = float
# According to the CF spec, the fill value is of the same
# type as its variable, i.e. its storage format on disk.
# This handles the case where the fill_value also needs to be
# converted to its unsigned value.
if has_fill:
fill_value = np.asarray(fill_value, dtype=data.dtype)
data = MaskedAndScaledArray(data, fill_value, scale_factor,
add_offset, dtype)

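A rough sketch of how the decoding pieces above fit together, using the module-internal ``UnsignedIntTypeArray`` and ``MaskedAndScaledArray`` helpers directly (illustrative only, not public API; the values match the test data added below).

import numpy as np
from xarray import conventions

sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')  # signed on-disk bytes
ub = conventions.UnsignedIntTypeArray(sb)           # lazily reinterpreted as uint8

# The signed fill value (-1) is cast to the unsigned dtype, giving 255,
# before masking and scaling.
fill_value = np.array(-1, dtype='i1').astype(ub.dtype)
decoded = conventions.MaskedAndScaledArray(ub, fill_value,
                                           np.float32(0.1), 10)
decoded[:]  # approximately array([10. , 10.1, 22.7, 22.8, nan])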
62 changes: 59 additions & 3 deletions xarray/tests/test_backends.py
@@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data():
return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)})


def create_unsigned_masked_scaled_data():
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
'add_offset': 10, 'scale_factor': np.float32(0.1)}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
return Dataset({'x': ('t', x, {}, encoding)})


def create_encoded_unsigned_masked_scaled_data():
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {'_FillValue': -1, '_Unsigned': 'true',
'add_offset': 10, 'scale_factor': np.float32(0.1)}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
return Dataset({'x': ('t', sb, attributes)})


def create_boolean_data():
attributes = {'units': '-'}
return Dataset({'x': ('t', [True, False, False, True], attributes)})
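The relationship between the encoded and decoded unsigned helpers above can be checked by hand with ``xarray.decode_cf`` (a sketch, not part of the test suite; agreement is up to float32 rounding of the scale factor).

import numpy as np
import xarray as xr

encoded = create_encoded_unsigned_masked_scaled_data()
decoded = create_unsigned_masked_scaled_data()

# Decoding the signed, packed representation should recover the unpacked
# float values, with NaN where the fill value (-1 signed / 255 unsigned)
# occurred.
roundtripped = xr.decode_cf(encoded)
np.testing.assert_allclose(roundtripped['x'].values, decoded['x'].values)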
@@ -360,24 +377,63 @@ def test_roundtrip_strings_with_fill_value(self):
with self.roundtrip(original) as actual:
self.assertDatasetIdentical(expected, actual)

def test_unsigned_roundtrip_mask_and_scale(self):
decoded = create_unsigned_masked_scaled_data()
encoded = create_encoded_unsigned_masked_scaled_data()
with self.roundtrip(decoded) as actual:
for k in decoded.variables:
self.assertEqual(decoded.variables[k].dtype,
actual.variables[k].dtype)
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded,
open_kwargs=dict(decode_cf=False)) as actual:
for k in encoded.variables:
self.assertEqual(encoded.variables[k].dtype,
actual.variables[k].dtype)
self.assertDatasetAllClose(encoded, actual)
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
for k in encoded.variables:
self.assertEqual(encoded.variables[k].dtype,
actual.variables[k].dtype)
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(
encoded, create_encoded_unsigned_masked_scaled_data())
with self.roundtrip(encoded) as actual:
for k in decoded.variables:
self.assertEqual(decoded.variables[k].dtype,
actual.variables[k].dtype)
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
for k in encoded.variables:
self.assertEqual(encoded.variables[k].dtype,
actual.variables[k].dtype)
self.assertDatasetAllClose(encoded, actual)

def test_roundtrip_mask_and_scale(self):
decoded = create_masked_and_scaled_data()
encoded = create_encoded_masked_and_scaled_data()
with self.roundtrip(decoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(decoded,
open_kwargs=dict(decode_cf=False)) as actual:
# TODO: this assumes that all roundtrips will first
# encode. Is that something we want to test for?
self.assertDatasetAllClose(encoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(encoded,
create_encoded_masked_and_scaled_data())
with self.roundtrip(encoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)

def test_coordinates_encoding(self):
9 changes: 9 additions & 0 deletions xarray/tests/test_conventions.py
@@ -108,6 +108,15 @@ def test_string_to_char(self):
self.assertArrayEqual(actual, expected)


class TestUnsignedIntTypeArray(TestCase):
def test_unsignedinttype_array(self):
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
ub = conventions.UnsignedIntTypeArray(sb)
self.assertEqual(ub.dtype, np.dtype('u1'))
self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255],
dtype=np.dtype('u1')))


class TestBoolTypeArray(TestCase):
def test_booltype_array(self):
x = np.array([1, 0, 1, 1, 0], dtype='i1')
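For reference, the conversion exercised by ``TestUnsignedIntTypeArray`` above matches numpy's own reinterpretation of the underlying bits (a quick check, not part of the test suite):

import numpy as np

sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
# For same-width integers, both a bit-level view and an astype() cast wrap
# negative values modulo 256, matching UnsignedIntTypeArray's behaviour.
sb.view('u1')    # array([  0,   1, 127, 128, 255], dtype=uint8)
sb.astype('u1')  # array([  0,   1, 127, 128, 255], dtype=uint8)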