diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index b37bb3e1..497a17bb 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -528,7 +528,7 @@ def update_ocat_local(datafile, **params):
         dat[name] = np.char.encode(col, 'utf-8')
 
     dat.write(datafile, path='data', serialize_meta=True, overwrite=True,
-              format='hdf5', compression=True)
+              format='hdf5', compression=False)
 
 
 def get_ocat_local(obsid=None, *,
@@ -596,7 +596,27 @@ def get_ocat_local(obsid=None, *,
     for name, col in dat.columns.items():
         zero_length = len(dat) == 0
         if col.info.dtype.kind == 'S':
-            dat[name] = col.astype('U') if zero_length else np.char.decode(col, 'utf-8')
+            if zero_length:
+                dat[name] = col.astype('U')
+            else:
+                # Try a faster way of converting to unicode for ASCII input.
+                # View the column as a numpy array of bytes and look for values
+                # of 128 or more that signify a non-ASCII character.
+                itemsize = col.dtype.itemsize
+                col_bytes = col.view((np.uint8, (itemsize,)))
+                if np.all(col_bytes < 128):
+                    # This is ASCII so the numpy unicode (UCS-4) version is the
+                    # same, with only the leading byte of each 4-byte char set.
+                    col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
+                    for ii in range(itemsize):
+                        col_utf8[:, ii * 4] = col_bytes[:, ii]
+                    dat[name] = col_utf8.view(('U', itemsize)).flatten()
+                else:
+                    # Use the standard slow way for non-ASCII input. This is
+                    # commonly run for pi_name and observer columns that have
+                    # names with unicode characters, but it might run for other
+                    # columns since we have no spec on OCAT data content.
+                    dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
     if len(dat) > 0 and target_name is not None and not resolve_name:
diff --git a/mica/archive/tests/test_cda.py b/mica/archive/tests/test_cda.py
index 0b15f30d..bcd7ceb8 100644
--- a/mica/archive/tests/test_cda.py
+++ b/mica/archive/tests/test_cda.py
@@ -94,6 +94,13 @@ def test_ocat_details_local_where(datafile):
     assert all('JET' in row['target_name'] for row in dat)
 
 
+def test_ocat_local_unicode():
+    """Unicode conversion from HDF5 bytes (UTF-8) to str is working"""
+    dat = get_ocat_local()
+    for key in ('observer', 'pi_name'):
+        assert any('ü' in val for val in dat[key])
+
+
 def test_ocat_special_query():
     dat = get_ocat_web(obsid=8008, rollReqs='true')
     assert len(dat) == 1
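
For reference, a minimal standalone sketch of the fast ASCII-to-unicode conversion used in get_ocat_local above. The column values here are made up for illustration (not real OCAT data), and the sketch assumes a little-endian platform, where numpy 'U' strings are UCS-4 with the low (ASCII) byte of each character stored first.

import numpy as np

# Hypothetical bytes column standing in for an OCAT string column read from HDF5.
col = np.array([b'Weisskopf', b'Garmire', b'Murray'], dtype='S12')

itemsize = col.dtype.itemsize
col_bytes = col.view((np.uint8, (itemsize,)))  # shape (n_rows, itemsize)

if np.all(col_bytes < 128):
    # Pure ASCII: build the UCS-4 buffer directly instead of decoding each value.
    # Each character becomes a 4-byte group with only the leading byte set.
    out = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
    out[:, ::4] = col_bytes
    col_str = out.view(('U', itemsize)).flatten()
else:
    # Fall back to the standard (slower) elementwise decode for non-ASCII input.
    col_str = np.char.decode(col, 'utf-8')

print(col_str)  # ['Weisskopf' 'Garmire' 'Murray']

The win comes from replacing a per-element Python-level decode with a couple of vectorized byte-level operations; the non-ASCII fallback keeps the behavior identical for columns that do contain unicode characters.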