sot · taldcroft · May 17, 2022 · May 9, 2022 · May 9, 2022 · May 12, 2022
diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
@@ -528,7 +528,7 @@ def update_ocat_local(datafile, **params):
             dat[name] = np.char.encode(col, 'utf-8')
 
     dat.write(datafile, path='data', serialize_meta=True, overwrite=True,
-              format='hdf5', compression=True)
+              format='hdf5', compression=False)
 
 
 def get_ocat_local(obsid=None, *,
@@ -596,7 +596,27 @@ def get_ocat_local(obsid=None, *,
     for name, col in dat.columns.items():
         zero_length = len(dat) == 0
         if col.info.dtype.kind == 'S':
-            dat[name] = col.astype('U') if zero_length else np.char.decode(col, 'utf-8')
+            if zero_length:
+                dat[name] = col.astype('U')
+            else:
+                # Try a faster way of converting to unicode for ASCII input.
+                # View the column as a numpy array of bytes and look for values
+                # of 128 or above that signify a non-ASCII character.
+                itemsize = col.dtype.itemsize
+                col_bytes = col.view((np.uint8, (itemsize,)))
+                if np.all(col_bytes < 128):
+                    # This is ASCII so numpy's unicode form (4 bytes per character) is
+                    # the same with only the first (low, little-endian) byte set.
+                    col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
+                    for ii in range(itemsize):
+                        col_utf8[:, ii * 4] = col_bytes[:, ii]
+                    dat[name] = col_utf8.view(('U', itemsize)).flatten()
+                else:
+                    # Use the standard slow way for non-ASCII input. This is
+                    # commonly run for pi_name and observer columns that have
+                    # names with unicode characters, but it might run for other
+                    # columns since we have no spec on OCAT data content.
+                    dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
     if len(dat) > 0 and target_name is not None and not resolve_name:

diff --git a/mica/archive/tests/test_cda.py b/mica/archive/tests/test_cda.py
@@ -94,6 +94,13 @@ def test_ocat_details_local_where(datafile):
     assert all('JET' in row['target_name'] for row in dat)
 
 
+def test_ocat_local_unicode():
+    """Non-ASCII names survive the UTF-8 bytes to str conversion in get_ocat_local"""
+    dat = get_ocat_local()
+    for key in ('observer', 'pi_name'):
+        assert any('ü' in val for val in dat[key])  # builtin any: np.any won't iterate a generator
+
+
 def test_ocat_special_query():
     dat = get_ocat_web(obsid=8008, rollReqs='true')
     assert len(dat) == 1