-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Automate interpretation of _Unsigned attribute #1453
Changes from 15 commits
4bfeee7
32a67f3
ddbc99d
11a39d0
e248ae5
079a7e5
e06f657
b56cb75
8b7df53
466928c
31b9aea
4a7e481
d6a90c7
bef02b4
748a106
9bec545
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -534,6 +534,34 @@ def __getitem__(self, key): | |
return np.asarray(self.array[key], dtype=self.dtype) | ||
|
||
|
||
class UnsignedIntTypeArray(utils.NDArrayMixin): | ||
"""Decode arrays on the fly from signed integer to unsigned | ||
integer. Typically used when _Unsigned is set at as a netCDF | ||
attribute on a signed integer variable. | ||
|
||
>>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') | ||
|
||
>>> sb.dtype | ||
dtype('int8') | ||
|
||
>>> UnsignedIntTypeArray(sb).dtype | ||
dtype('uint8') | ||
|
||
>>> UnsignedIntTypeArray(sb)[:] | ||
array([ 0, 1, 127, 128, 255], dtype=uint8) | ||
""" | ||
def __init__(self, array): | ||
self.array = array | ||
self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize) | ||
|
||
@property | ||
def dtype(self): | ||
return self.unsigned_dtype | ||
|
||
def __getitem__(self, key): | ||
return np.asarray(self.array[key], dtype=self.dtype) | ||
|
||
|
||
def string_to_char(arr): | ||
"""Like netCDF4.stringtochar, but faster and more flexible. | ||
""" | ||
|
@@ -637,6 +665,13 @@ def maybe_encode_dtype(var, name=None): | |
'any _FillValue to use for NaNs' % name, | ||
RuntimeWarning, stacklevel=3) | ||
data = duck_array_ops.around(data)[...] | ||
if encoding.get('_Unsigned', False): | ||
unsigned_dtype = 'i%s' % dtype.itemsize | ||
old_fill = np.asarray(attrs['_FillValue']) | ||
new_fill = old_fill.astype(unsigned_dtype) | ||
attrs['_FillValue'] = new_fill | ||
data = data.astype(unsigned_dtype) | ||
pop_to(encoding, attrs, '_Unsigned') | ||
if dtype == 'S1' and data.dtype != 'S1': | ||
data = string_to_char(np.asarray(data, 'S')) | ||
dims = dims + ('string%s' % data.shape[-1],) | ||
|
@@ -761,7 +796,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, | |
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello' | ||
mask_and_scale: bool | ||
Lazily scale (using scale_factor and add_offset) and mask | ||
(using _FillValue). | ||
(using _FillValue). If the _Unsigned attribute is present | ||
treat integer arrays as unsigned. | ||
decode_times : bool | ||
Decode cf times ('hours since 2000-01-01') to np.datetime64. | ||
decode_endianness : bool | ||
|
@@ -786,6 +822,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, | |
dimensions = dimensions[:-1] | ||
data = CharToStringArray(data) | ||
|
||
pop_to(attributes, encoding, '_Unsigned') | ||
is_unsigned = encoding.get('_Unsigned', False) | ||
if (is_unsigned) and (mask_and_scale is True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need the extra parentheses here, and use implicit boolean checks instead of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
if data.dtype.kind == 'i': | ||
data = UnsignedIntTypeArray(data) | ||
else: | ||
warnings.warn("variable has _Unsigned attribute but is not " | ||
"of integer type. Ignoring attribute.", | ||
RuntimeWarning, stacklevel=3) | ||
|
||
if mask_and_scale: | ||
if 'missing_value' in attributes: | ||
# missing_value is deprecated, but we still want to support it as | ||
|
@@ -800,20 +846,25 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, | |
"and decoding explicitly using " | ||
"xarray.conventions.decode_cf(ds)") | ||
attributes['_FillValue'] = attributes.pop('missing_value') | ||
|
||
fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) | ||
if fill_value.size > 1: | ||
warnings.warn("variable has multiple fill values {0}, decoding " | ||
"all values to NaN.".format(str(fill_value)), | ||
RuntimeWarning, stacklevel=3) | ||
scale_factor = pop_to(attributes, encoding, 'scale_factor') | ||
add_offset = pop_to(attributes, encoding, 'add_offset') | ||
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or | ||
scale_factor is not None or add_offset is not None): | ||
has_fill = (fill_value is not None and | ||
not np.any(pd.isnull(fill_value))) | ||
if (has_fill or scale_factor is not None or add_offset is not None): | ||
if fill_value.dtype.kind in ['U', 'S']: | ||
dtype = object | ||
else: | ||
dtype = float | ||
if is_unsigned and has_fill: | ||
# Need to convert the fill_value to unsigned, too | ||
# According to the CF spec, the fill value is of the same | ||
# type as its variable, i.e. its storage format on disk | ||
fill_value = np.asarray(fill_value, dtype=data.unsigned_dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we simply cast That seems a little more robust to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. I had to leave the |
||
data = MaskedAndScaledArray(data, fill_value, scale_factor, | ||
add_offset, dtype) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data(): | |
return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) | ||
|
||
|
||
def create_unsigned_masked_scaled_data(): | ||
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1', | ||
'add_offset': 10, 'scale_factor': np.float32(0.1)} | ||
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan]) | ||
return Dataset({'x': ('t', x, {}, encoding)}) | ||
|
||
|
||
def create_encoded_unsigned_masked_scaled_data(): | ||
# These are values as written to the file: the _FillValue will | ||
# be represented in the signed form. | ||
attributes = {'_FillValue': -1, '_Unsigned': 'true', | ||
'add_offset': 10, 'scale_factor': np.float32(0.1)} | ||
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned | ||
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') | ||
return Dataset({'x': ('t', sb, attributes)}) | ||
|
||
|
||
def create_boolean_data(): | ||
attributes = {'units': '-'} | ||
return Dataset({'x': ('t', [True, False, False, True], attributes)}) | ||
|
@@ -360,24 +377,50 @@ def test_roundtrip_strings_with_fill_value(self): | |
with self.roundtrip(original) as actual: | ||
self.assertDatasetIdentical(expected, actual) | ||
|
||
def test_unsigned_roundtrip_mask_and_scale(self): | ||
decoded = create_unsigned_masked_scaled_data() | ||
encoded = create_encoded_unsigned_masked_scaled_data() | ||
with self.roundtrip(decoded) as actual: | ||
self.assertDatasetAllClose(decoded, actual) | ||
with self.roundtrip(decoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
# TODO: this assumes that all roundtrips will first | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please remove this redundant TODO note There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
# encode. Is that something we want to test for? | ||
self.assertDatasetAllClose(encoded, actual) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
with self.roundtrip(encoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
self.assertDatasetAllClose(encoded, actual) | ||
# make sure roundtrip encoding didn't change the | ||
# original dataset. | ||
self.assertDatasetIdentical( | ||
encoded, create_encoded_unsigned_masked_scaled_data()) | ||
with self.roundtrip(encoded) as actual: | ||
self.assertDatasetAllClose(decoded, actual) | ||
with self.roundtrip(encoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
self.assertDatasetAllClose(encoded, actual) | ||
|
||
def test_roundtrip_mask_and_scale(self): | ||
decoded = create_masked_and_scaled_data() | ||
encoded = create_encoded_masked_and_scaled_data() | ||
with self.roundtrip(decoded) as actual: | ||
self.assertDatasetAllClose(decoded, actual) | ||
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: | ||
with self.roundtrip(decoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
# TODO: this assumes that all roundtrips will first | ||
# encode. Is that something we want to test for? | ||
self.assertDatasetAllClose(encoded, actual) | ||
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: | ||
with self.roundtrip(encoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
self.assertDatasetAllClose(encoded, actual) | ||
# make sure roundtrip encoding didn't change the | ||
# original dataset. | ||
self.assertDatasetIdentical(encoded, | ||
create_encoded_masked_and_scaled_data()) | ||
with self.roundtrip(encoded) as actual: | ||
self.assertDatasetAllClose(decoded, actual) | ||
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: | ||
with self.roundtrip(encoded, | ||
open_kwargs=dict(decode_cf=False)) as actual: | ||
self.assertDatasetAllClose(encoded, actual) | ||
|
||
def test_coordinates_encoding(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This block should be guarded in a check to verify that
_FillValue
is actually defined as an attribute.