Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automate interpretation of _Unsigned attribute #1453

Merged
merged 16 commits into from
Jul 28, 2017
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ Enhancements
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
raster files are opened with :py:func:`~xarray.open_rasterio`.
By `Greg Brener <https://github.com/gbrener>`_
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a
a signed integer data type should be interpreted as unsigned bytes
(:issue:`1444`).
By `Eric Bruning <https://github.com/deeplycloudy>`_.

Bug fixes
~~~~~~~~~
Expand Down
3 changes: 3 additions & 0 deletions xarray/backends/pynio_.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False):
import Nio
opener = functools.partial(Nio.open_file, filename, mode=mode)
self.ds = opener()
# xarray provides its own support for FillValue,
# so turn off PyNIO's support for the same.
self.ds.set_option('MaskedArrayMode', 'MaskedNever')
self._autoclose = autoclose
self._isopen = True
self._opener = opener
Expand Down
59 changes: 55 additions & 4 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,34 @@ def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)


class UnsignedIntTypeArray(utils.NDArrayMixin):
"""Decode arrays on the fly from signed integer to unsigned
integer. Typically used when _Unsigned is set at as a netCDF
attribute on a signed integer variable.

>>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')

>>> sb.dtype
dtype('int8')

>>> UnsignedIntTypeArray(sb).dtype
dtype('uint8')

>>> UnsignedIntTypeArray(sb)[:]
array([ 0, 1, 127, 128, 255], dtype=uint8)
"""
def __init__(self, array):
self.array = array
self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize)

@property
def dtype(self):
return self.unsigned_dtype

def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)


def string_to_char(arr):
"""Like netCDF4.stringtochar, but faster and more flexible.
"""
Expand Down Expand Up @@ -637,6 +665,13 @@ def maybe_encode_dtype(var, name=None):
'any _FillValue to use for NaNs' % name,
RuntimeWarning, stacklevel=3)
data = duck_array_ops.around(data)[...]
if encoding.get('_Unsigned', False):
unsigned_dtype = 'i%s' % dtype.itemsize
old_fill = np.asarray(attrs['_FillValue'])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block should be guarded in a check to verify that _FillValue is actually defined as an attribute.

new_fill = old_fill.astype(unsigned_dtype)
attrs['_FillValue'] = new_fill
data = data.astype(unsigned_dtype)
pop_to(encoding, attrs, '_Unsigned')
if dtype == 'S1' and data.dtype != 'S1':
data = string_to_char(np.asarray(data, 'S'))
dims = dims + ('string%s' % data.shape[-1],)
Expand Down Expand Up @@ -761,7 +796,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
(using _FillValue). If the _Unsigned attribute is present
treat integer arrays as unsigned.
decode_times : bool
Decode cf times ('hours since 2000-01-01') to np.datetime64.
decode_endianness : bool
Expand All @@ -786,6 +822,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
dimensions = dimensions[:-1]
data = CharToStringArray(data)

pop_to(attributes, encoding, '_Unsigned')
is_unsigned = encoding.get('_Unsigned', False)
if (is_unsigned) and (mask_and_scale is True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need the extra parentheses here, and use implicit boolean checks instead of is True. So this should be: if is_unsigned and mask_and_scale

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

if data.dtype.kind == 'i':
data = UnsignedIntTypeArray(data)
else:
warnings.warn("variable has _Unsigned attribute but is not "
"of integer type. Ignoring attribute.",
RuntimeWarning, stacklevel=3)

if mask_and_scale:
if 'missing_value' in attributes:
# missing_value is deprecated, but we still want to support it as
Expand All @@ -800,20 +846,25 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
"and decoding explicitly using "
"xarray.conventions.decode_cf(ds)")
attributes['_FillValue'] = attributes.pop('missing_value')

fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
if fill_value.size > 1:
warnings.warn("variable has multiple fill values {0}, decoding "
"all values to NaN.".format(str(fill_value)),
RuntimeWarning, stacklevel=3)
scale_factor = pop_to(attributes, encoding, 'scale_factor')
add_offset = pop_to(attributes, encoding, 'add_offset')
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
scale_factor is not None or add_offset is not None):
has_fill = (fill_value is not None and
not np.any(pd.isnull(fill_value)))
if (has_fill or scale_factor is not None or add_offset is not None):
if fill_value.dtype.kind in ['U', 'S']:
dtype = object
else:
dtype = float
if is_unsigned and has_fill:
# Need to convert the fill_value to unsigned, too
# According to the CF spec, the fill value is of the same
# type as its variable, i.e. its storage format on disk
fill_value = np.asarray(fill_value, dtype=data.unsigned_dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we simply cast fill_value to the dtype of data unilaterally here? e.g., fill_value = np.asarray(fill_value, dtype=data.dtype) without the if is_unsigned and has_fill check?

That seems a little more robust to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I had to leave the has_fill check because fill_value will be None in cases where there is no fill_value attribute.

data = MaskedAndScaledArray(data, fill_value, scale_factor,
add_offset, dtype)

Expand Down
49 changes: 46 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data():
return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)})


def create_unsigned_masked_scaled_data():
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
'add_offset': 10, 'scale_factor': np.float32(0.1)}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
return Dataset({'x': ('t', x, {}, encoding)})


def create_encoded_unsigned_masked_scaled_data():
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {'_FillValue': -1, '_Unsigned': 'true',
'add_offset': 10, 'scale_factor': np.float32(0.1)}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
return Dataset({'x': ('t', sb, attributes)})


def create_boolean_data():
attributes = {'units': '-'}
return Dataset({'x': ('t', [True, False, False, True], attributes)})
Expand Down Expand Up @@ -360,24 +377,50 @@ def test_roundtrip_strings_with_fill_value(self):
with self.roundtrip(original) as actual:
self.assertDatasetIdentical(expected, actual)

def test_unsigned_roundtrip_mask_and_scale(self):
decoded = create_unsigned_masked_scaled_data()
encoded = create_encoded_unsigned_masked_scaled_data()
with self.roundtrip(decoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded,
open_kwargs=dict(decode_cf=False)) as actual:
# TODO: this assumes that all roundtrips will first
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove this redundant TODO note

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

# encode. Is that something we want to test for?
self.assertDatasetAllClose(encoded, actual)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure assertDatasetAllClose checks dtypes. It would be good to check explicitly here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(
encoded, create_encoded_unsigned_masked_scaled_data())
with self.roundtrip(encoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)

def test_roundtrip_mask_and_scale(self):
decoded = create_masked_and_scaled_data()
encoded = create_encoded_masked_and_scaled_data()
with self.roundtrip(decoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(decoded,
open_kwargs=dict(decode_cf=False)) as actual:
# TODO: this assumes that all roundtrips will first
# encode. Is that something we want to test for?
self.assertDatasetAllClose(encoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(encoded,
create_encoded_masked_and_scaled_data())
with self.roundtrip(encoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)

def test_coordinates_encoding(self):
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ def test_string_to_char(self):
self.assertArrayEqual(actual, expected)


class TestUnsignedIntTypeArray(TestCase):
def test_unsignedinttype_array(self):
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
ub = conventions.UnsignedIntTypeArray(sb)
self.assertEqual(ub.dtype, np.dtype('u1'))
self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255],
dtype=np.dtype('u1')))


class TestBoolTypeArray(TestCase):
def test_booltype_array(self):
x = np.array([1, 0, 1, 1, 0], dtype='i1')
Expand Down