Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Py3k consistent unicode handling #1782

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/iris/_cube_coord_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def rename(self, name):
self.long_name = None
except ValueError:
self.standard_name = None
self.long_name = unicode(name)
self.long_name = six.text_type(name)

# Always clear var_name when renaming.
self.var_name = None
Expand Down
2 changes: 1 addition & 1 deletion lib/iris/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,7 +1361,7 @@ def name_in_independents():
# string like).
dim_by_name[name] = dim = len(self._shape)
self._nd_names.append(name)
if metadata[name].points_dtype.kind == 'S':
if metadata[name].points_dtype.kind in 'SU':
self._aux_templates.append(
_Template(dim, points, bounds, kwargs))
else:
Expand Down
2 changes: 1 addition & 1 deletion lib/iris/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1930,7 +1930,7 @@ def _compute_shared_coords(self):

# Create new shared bounded coordinates.
for coord in self._shared_coords:
if coord.points.dtype.kind == 'S':
if coord.points.dtype.kind in 'SU':
if coord.bounds is None:
new_points = []
new_bounds = None
Expand Down
2 changes: 1 addition & 1 deletion lib/iris/coord_categorisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def add_categorised_coord(cube, name, from_coord, category_function,
result = category_function(from_coord, from_coord.points.ravel()[0])
if isinstance(result, six.string_types):
str_vectorised_fn = np.vectorize(category_function, otypes=[object])
vectorised_fn = lambda *args: str_vectorised_fn(*args).astype('|S64')
vectorised_fn = lambda *args: str_vectorised_fn(*args).astype('|U64')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the Python 2 behaviour.

else:
vectorised_fn = np.vectorize(category_function)
new_coord = iris.coords.AuxCoord(vectorised_fn(from_coord,
Expand Down
4 changes: 2 additions & 2 deletions lib/iris/coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,10 +963,10 @@ def collapsed(self, dims_to_collapse=None):
for index in np.ndindex(shape):
index_slice = (slice(None),) + tuple(index)
bounds.append(serialize(self.bounds[index_slice]))
dtype = np.dtype('S{}'.format(max(map(len, bounds))))
dtype = np.dtype('U{}'.format(max(map(len, bounds))))
bounds = np.array(bounds, dtype=dtype).reshape((1,) + shape)
points = serialize(self.points)
dtype = np.dtype('S{}'.format(len(points)))
dtype = np.dtype('U{}'.format(len(points)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These change the Python 2 behaviour. i.e. If I collapse a cube and get the value of a textual, collapsed coordinate it will now be an instance of numpy.unicode_ instead of an instance of numpy.string_. User code doing isinstance(value, str) will fail.

# Create the new collapsed coordinate.
coord = self.copy(points=np.array(points, dtype=dtype),
bounds=bounds)
Expand Down
17 changes: 12 additions & 5 deletions lib/iris/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,7 +1863,7 @@ def vector_summary(vector_coords, cube_header, max_line_offset):
if self.attributes:
attribute_lines = []
for name, value in sorted(six.iteritems(self.attributes)):
value = iris.util.clip_string(unicode(value))
value = iris.util.clip_string(six.text_type(value))
line = u'{pad:{width}}{name}: {value}'.format(pad=' ',
width=indent,
name=name,
Expand Down Expand Up @@ -1893,7 +1893,11 @@ def assert_valid(self):
warnings.warn('Cube.assert_valid() has been deprecated.')

def __str__(self):
return self.summary().encode(errors='replace')
# six has a decorator for this bit, but it doesn't do errors='replace'.
if six.PY3:
return self.summary()
else:
return self.summary().encode(errors='replace')

def __unicode__(self):
return self.summary()
Expand Down Expand Up @@ -2302,7 +2306,8 @@ def _as_list_of_coords(self, names_or_coords):
Convert a name, coord, or list of names/coords to a list of coords.
"""
# If not iterable, convert to list of a single item
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is a little misleading with py3k strings having gained an __iter__ method...

if not hasattr(names_or_coords, '__iter__'):
if (not hasattr(names_or_coords, '__iter__') or
isinstance(names_or_coords, str)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could these checks be simplified to just checking if names_or_coords is a str? This will be true regardless of Python 2 or 3...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would also need to check for a Coord as well, I guess?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe so - this is just checking whether the input is of type string or list. The check on whether names_or_coords contains one or more Coords happens on lines 2315-2316.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method takes a str name, Coord or list/tuple/iterable of the same. This check makes it a list if it's not, so you can't just check if it's str.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with the new logic in these "iterable but not a string" cases -- i.e. it works and does what we want.
But I think the reason it works is a bit obscure, so I'd prefer :

    def is_iterable_but_not_string(x):
        if isinstance(x, six.string_types):
            result = False
        else:
            result = isinstance(x, collections.Iterable)
        return result

My reasoning is that this test really does just the same thing for either Python2 or 3.
Whereas, the proposed code relies on the slightly odd fact that strings don't provide an `__iter__` in Python 2 (even though you can use "iter(string)", and even though they do have an `__iter__` in Python 3): without that quirk, this would not work for a unicode object, because a unicode instance isn't a 'str'.

So, I'm suggesting we should really check for an instance of "basestring" first (equivalently, "six.string_types" for portability).
However, it then also occurred to me that "hasattr(x, '__iter__')" is not a terribly good test, as objects don't need to have an `__iter__` method to be iterable, as str+unicode themselves show: in fact, you can use "iter(obj)" if obj just has a `__getitem__` — though frustratingly, that does not make it pass the "isinstance(obj, collections.Iterable)" test!

Anyway, I believe the above form is more consistent with our usage elsewhere.
e.g. - https://github.com/QuLogic/iris/blob/py3k-unicode/lib/iris/fileformats/cf.py#L1077

names_or_coords = [names_or_coords]

coords = []
Expand Down Expand Up @@ -2348,7 +2353,8 @@ def slices_over(self, ref_to_slice):

"""
# Required to handle a mix between types.
if not hasattr(ref_to_slice, '__iter__'):
if (not hasattr(ref_to_slice, '__iter__') or
isinstance(ref_to_slice, str)):
ref_to_slice = [ref_to_slice]

slice_dims = set()
Expand Down Expand Up @@ -2408,7 +2414,8 @@ def slices(self, ref_to_slice, ordered=True):
raise TypeError("'ordered' argument to slices must be boolean.")

# Required to handle a mix between types
if not hasattr(ref_to_slice, '__iter__'):
if (not hasattr(ref_to_slice, '__iter__') or
isinstance(ref_to_slice, str)):
ref_to_slice = [ref_to_slice]

dim_to_slice = []
Expand Down
2 changes: 1 addition & 1 deletion lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ fc_extras
# Set the cube global attributes.
for attr_name, attr_value in six.iteritems(cf_var.cf_group.global_attributes):
try:
if isinstance(attr_value, unicode):
if six.PY2 and isinstance(attr_value, six.text_type):
try:
cube.attributes[str(attr_name)] = str(attr_value)
except UnicodeEncodeError:
Expand Down
20 changes: 14 additions & 6 deletions lib/iris/fileformats/cf.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@
ocean_s_coordinate_g2=['eta', 'depth'])


# NetCDF returns a different type for strings depending on Python version.
def _is_str_dtype(var):
return ((six.PY2 and np.issubdtype(var.dtype, np.str)) or
(six.PY3 and np.issubdtype(var.dtype, np.bytes_)))


################################################################################
class CFVariable(six.with_metaclass(ABCMeta, object)):
"""Abstract base class wrapper for a CF-netCDF variable."""
Expand Down Expand Up @@ -313,7 +319,7 @@ def identify(cls, variables, ignore=None, target=None, warn=True):
warnings.warn(message % (name, nc_var_name))
else:
# Restrict to non-string type i.e. not a CFLabelVariable.
if not np.issubdtype(variables[name].dtype, np.str):
if not _is_str_dtype(variables[name]):
result[name] = CFAuxiliaryCoordinateVariable(name, variables[name])

return result
Expand Down Expand Up @@ -478,7 +484,7 @@ def identify(cls, variables, ignore=None, target=None, warn=True, monotonic=Fals
if nc_var_name in ignore:
continue
# String variables can't be coordinates
if np.issubdtype(nc_var.dtype, np.str):
if _is_str_dtype(nc_var):
continue
# Restrict to one-dimensional with name as dimension OR zero-dimensional scalar
if not ((nc_var.ndim == 1 and nc_var_name in nc_var.dimensions) or (nc_var.ndim == 0)):
Expand Down Expand Up @@ -638,8 +644,9 @@ def identify(cls, variables, ignore=None, target=None, warn=True):
warnings.warn(message % (name, nc_var_name))
else:
# Restrict to only string type.
if np.issubdtype(variables[name].dtype, np.str):
result[name] = CFLabelVariable(name, variables[name])
if _is_str_dtype(variables[name]):
var = variables[name]
result[name] = CFLabelVariable(name, var)

return result

Expand Down Expand Up @@ -683,7 +690,7 @@ def cf_label_data(self, cf_data_var):

# Calculate new label data shape (without string dimension) and create payload array.
new_shape = tuple(dim_len for i, dim_len in enumerate(self.shape) if i != str_dim)
data = np.empty(new_shape, dtype='|S%d' % self.shape[str_dim])
data = np.empty(new_shape, dtype='|U%d' % self.shape[str_dim])

for index in np.ndindex(new_shape):
# Create the slice for the label data.
Expand All @@ -692,7 +699,8 @@ def cf_label_data(self, cf_data_var):
else:
label_index = index + (slice(None, None),)

data[index] = ''.join(label_data[label_index]).strip()
data[index] = b''.join(label_data[label_index]).strip().decode(
'utf8')

return data

Expand Down
14 changes: 8 additions & 6 deletions lib/iris/fileformats/grib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@
_load_rules = None


CENTRE_TITLES = {'egrr': 'U.K. Met Office - Exeter',
'ecmf': 'European Centre for Medium Range Weather Forecasts',
'rjtd': 'Tokyo, Japan Meteorological Agency',
'55' : 'San Francisco',
'kwbc': 'US National Weather Service, National Centres for Environmental Prediction'}
CENTRE_TITLES = {
'egrr': u'U.K. Met Office - Exeter',
'ecmf': u'European Centre for Medium Range Weather Forecasts',
'rjtd': u'Tokyo, Japan Meteorological Agency',
'55': u'San Francisco',
'kwbc': u'US National Weather Service, National Centres for Environmental '
u'Prediction'}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a change of behaviour for Python 2.


TIME_RANGE_INDICATORS = {0:'none', 1:'none', 3:'time mean', 4:'time sum',
5:'time _difference', 10:'none',
Expand Down Expand Up @@ -449,7 +451,7 @@ def _compute_extra_keys(self):
#originating centre
#TODO #574 Expand to include sub-centre
self.extra_keys['_originatingCentre'] = CENTRE_TITLES.get(
centre, "unknown centre %s" % centre)
centre, u'unknown centre %s' % centre)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a change of behaviour for Python 2.


#forecast time unit as a cm string
#TODO #575 Do we want PP or GRIB style forecast delta?
Expand Down
2 changes: 1 addition & 1 deletion lib/iris/fileformats/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def _get_NAME_loader(filename):
import iris.fileformats.name_loaders as name_loaders

load = None
with open(filename, 'r') as file_handle:
with open(filename, 'rb') as file_handle:
header = name_loaders.read_header(file_handle)

# Infer file type based on contents of header.
Expand Down
49 changes: 29 additions & 20 deletions lib/iris/fileformats/name_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ def read_header(file_handle):
header = {}
header['NAME Version'] = next(file_handle).strip()
for line in file_handle:
words = line.split(':', 1)
words = line.split(b':', 1)
if len(words) != 2:
break
key, value = [word.strip() for word in words]
header[key] = value
header[key.decode()] = value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It applies to most/all of the changes to the NAME loader, but this is a change to the Python 2 behaviour.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the keys are always strings, I'm not sure it makes much difference in this case. Python 2 is very lax about mixing str and unicode keys, e.g. `'foo' in {u'foo': 1}` evaluates to True.


# Cast some values into floats or integers if they match a
# given name. Set any empty string values to None.
Expand All @@ -97,6 +97,8 @@ def read_header(file_handle):
'Number of fields',
'Number of series']:
header[key] = int(value)
else:
header[key] = value.decode()
else:
header[key] = None

Expand All @@ -118,7 +120,7 @@ def _read_data_arrays(file_handle, n_arrays, shape):
for line in file_handle:
# Split the line by comma, removing the last empty column
# caused by the trailing comma
vals = line.split(',')[:-1]
vals = line.split(b',')[:-1]

# Cast the x and y grid positions to integers and convert
# them to zero based indices
Expand Down Expand Up @@ -518,7 +520,7 @@ def load_NAMEIII_field(filename):
# Loading a file gives a generator of lines which can be progressed using
# the next() function. This will come in handy as we wish to progress
# through the file line by line.
with open(filename, 'r') as file_handle:
with open(filename, 'rb') as file_handle:
# Create a dictionary which can hold the header metadata about this
# file.
header = read_header(file_handle)
Expand All @@ -536,7 +538,8 @@ def load_NAMEIII_field(filename):
'Vertical Av or Int', 'Prob Perc',
'Prob Perc Ens', 'Prob Perc Time',
'Time', 'Z', 'D']:
cols = [col.strip() for col in next(file_handle).split(',')]
cols = [col.strip()
for col in next(file_handle).decode().split(',')]
column_headings[column_header_name] = cols[4:-1]

# Convert the time to python datetimes.
Expand Down Expand Up @@ -588,7 +591,7 @@ def load_NAMEII_field(filename):
A generator :class:`iris.cube.Cube` instances.

"""
with open(filename, 'r') as file_handle:
with open(filename, 'rb') as file_handle:
# Create a dictionary which can hold the header metadata about this
# file.
header = read_header(file_handle)
Expand All @@ -607,7 +610,8 @@ def load_NAMEII_field(filename):
for column_header_name in ['Species Category', 'Species',
'Time Av or Int', 'Quantity',
'Unit', 'Z', 'Time']:
cols = [col.strip() for col in next(file_handle).split(',')]
cols = [col.strip()
for col in next(file_handle).decode().split(',')]
column_headings[column_header_name] = cols[4:-1]

# Convert the time to python datetimes
Expand Down Expand Up @@ -667,7 +671,7 @@ def load_NAMEIII_timeseries(filename):
A generator :class:`iris.cube.Cube` instances.

"""
with open(filename, 'r') as file_handle:
with open(filename, 'rb') as file_handle:
# Create a dictionary which can hold the header metadata about this
# file.
header = read_header(file_handle)
Expand All @@ -683,7 +687,8 @@ def load_NAMEIII_timeseries(filename):
'Vertical Av or Int', 'Prob Perc',
'Prob Perc Ens', 'Prob Perc Time',
'Location', 'X', 'Y', 'Z', 'D']:
cols = [col.strip() for col in next(file_handle).split(',')]
cols = [col.strip()
for col in next(file_handle).decode().split(',')]
column_headings[column_header_name] = cols[1:-1]

# Determine the coordinates of the data and store in namedtuples.
Expand All @@ -707,10 +712,10 @@ def load_NAMEIII_timeseries(filename):
for line in file_handle:
# Split the line by comma, removing the last empty column caused
# by the trailing comma.
vals = line.split(',')[:-1]
vals = line.split(b',')[:-1]

# Time is stored in the first column.
t = vals[0].strip()
t = vals[0].decode().strip()
dt = datetime.datetime.strptime(t, NAMEIII_DATETIME_FORMAT)
time_list.append(dt)

Expand Down Expand Up @@ -741,7 +746,7 @@ def load_NAMEII_timeseries(filename):
A generator :class:`iris.cube.Cube` instances.

"""
with open(filename, 'r') as file_handle:
with open(filename, 'rb') as file_handle:
# Create a dictionary which can hold the header metadata about this
# file.
header = read_header(file_handle)
Expand All @@ -751,7 +756,8 @@ def load_NAMEII_timeseries(filename):
for column_header_name in ['Y', 'X', 'Location',
'Species Category', 'Species',
'Quantity', 'Z', 'Unit']:
cols = [col.strip() for col in next(file_handle).split(',')]
cols = [col.strip()
for col in next(file_handle).decode().split(',')]
column_headings[column_header_name] = cols[1:-1]

# Determine the coordinates of the data and store in namedtuples.
Expand All @@ -771,10 +777,10 @@ def load_NAMEII_timeseries(filename):
for line in file_handle:
# Split the line by comma, removing the last empty column caused
# by the trailing comma.
vals = line.split(',')[:-1]
vals = line.split(b',')[:-1]

# Time is stored in the first two columns.
t = (vals[0].strip() + ' ' + vals[1].strip())
t = (vals[0].strip() + b' ' + vals[1].strip()).decode()
dt = datetime.datetime.strptime(
t, NAMEII_TIMESERIES_DATETIME_FORMAT)
time_list.append(dt)
Expand Down Expand Up @@ -809,21 +815,22 @@ def load_NAMEIII_trajectory(filename):
time_unit = iris.unit.Unit('hours since epoch',
calendar=iris.unit.CALENDAR_GREGORIAN)

with open(filename, 'r') as infile:
with open(filename, 'rb') as infile:
header = read_header(infile)

# read the column headings
for line in infile:
if line.startswith(" "):
if line.startswith(b' '):
break
headings = [heading.strip() for heading in line.split(",")]
headings = [heading.strip() for heading in line.decode().split(',')]

# read the columns
columns = [[] for i in range(len(headings))]
for line in infile:
values = [v.strip() for v in line.split(",")]
values = [v.strip() for v in line.split(b',')]
for c, v in enumerate(values):
if "UTC" in v:
if b'UTC' in v:
v = v.decode()
v = v.replace(":00 ", " ") # Strip out milliseconds.
v = datetime.datetime.strptime(v, NAMEIII_DATETIME_FORMAT)
else:
Expand Down Expand Up @@ -872,6 +879,8 @@ def load_NAMEIII_trajectory(filename):
elif name == "Z (FL)":
name = "flight_level"
long_name = name
elif values[0].dtype.kind == 'S':
values = [v.decode() for v in values]

try:
coord = DimCoord(values, units=units)
Expand Down
4 changes: 3 additions & 1 deletion lib/iris/fileformats/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def _pyke_stats(engine, cf_name):
def _set_attributes(attributes, key, value):
"""Set attributes dictionary, converting unicode strings appropriately."""

if isinstance(value, unicode):
if isinstance(value, six.text_type):
try:
attributes[str(key)] = str(value)
except UnicodeEncodeError:
Expand Down Expand Up @@ -1236,6 +1236,8 @@ def _create_cf_variable(self, cube, dimension_names, coord):

if np.issubdtype(coord.points.dtype, np.str):
string_dimension_depth = coord.points.dtype.itemsize
if coord.points.dtype.kind == 'U':
string_dimension_depth //= 4
string_dimension_name = 'string%d' % string_dimension_depth

# Determine whether to create the string length dimension.
Expand Down
Loading