From f18dcf9f45f72123f415cc8be4c5c4d220cf246c Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Mon, 9 May 2022 16:51:59 -0400
Subject: [PATCH 1/3] Improve speed of getting local OCAT data

---
 mica/archive/cda/services.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index b37bb3e1..9bc7652e 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -596,7 +596,25 @@ def get_ocat_local(obsid=None, *,
     for name, col in dat.columns.items():
         zero_length = len(dat) == 0
         if col.info.dtype.kind == 'S':
-            dat[name] = col.astype('U') if zero_length else np.char.decode(col, 'utf-8')
+            if zero_length:
+                dat[name] = col.astype('U')
+            else:
+                # Try a faster way of converting to unicode for ASCII input.
+                # View the column as a numpy array of bytes and look for values
+                # above 128 that signify a non-ASCII character.
+                itemsize = col.dtype.itemsize
+                col_bytes = col.view((np.uint8, (itemsize,)))
+                if np.all(col_bytes.flatten() < 128):
+                    # This is ASCII so the numpy UTF-8 version is just the same
+                    # but with the single leading byte set.
+                    col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
+                    for ii in range(itemsize):
+                        col_utf8[:, ii * 4] = col_bytes[:, ii]
+                    dat[name] = col_utf8.view(('U', itemsize)).flatten()
+                else:
+                    # Use the standard slow way for non-ASCII input. This can
+                    # run for pi_name and observer columns.
+                    dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
     if len(dat) > 0 and target_name is not None and not resolve_name:

From cd10ddff766480db9bb7ec7125aba941e92770cd Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Mon, 9 May 2022 17:08:26 -0400
Subject: [PATCH 2/3] Do not use compression saving OCAT local HDF5 file

---
 mica/archive/cda/services.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index 9bc7652e..a6750dcf 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -528,7 +528,7 @@ def update_ocat_local(datafile, **params):
             dat[name] = np.char.encode(col, 'utf-8')
 
     dat.write(datafile, path='data', serialize_meta=True, overwrite=True,
-              format='hdf5', compression=True)
+              format='hdf5', compression=False)
 
 
 def get_ocat_local(obsid=None, *,

From db7b2e0bf578284920d7ae358d65cffea18ecec4 Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Thu, 12 May 2022 08:21:36 -0400
Subject: [PATCH 3/3] Address review comments and add a test

---
 mica/archive/cda/services.py   | 8 +++++---
 mica/archive/tests/test_cda.py | 7 +++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index a6750dcf..497a17bb 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -604,7 +604,7 @@ def get_ocat_local(obsid=None, *,
                 # above 128 that signify a non-ASCII character.
                 itemsize = col.dtype.itemsize
                 col_bytes = col.view((np.uint8, (itemsize,)))
-                if np.all(col_bytes.flatten() < 128):
+                if np.all(col_bytes < 128):
                     # This is ASCII so the numpy UTF-8 version is just the same
                     # but with the single leading byte set.
                     col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
@@ -612,8 +612,10 @@ def get_ocat_local(obsid=None, *,
                         col_utf8[:, ii * 4] = col_bytes[:, ii]
                     dat[name] = col_utf8.view(('U', itemsize)).flatten()
                 else:
-                    # Use the standard slow way for non-ASCII input. This can
-                    # run for pi_name and observer columns.
+                    # Use the standard slow way for non-ASCII input. This is
+                    # commonly run for pi_name and observer columns that have
+                    # names with unicode characters, but it might run for other
+                    # columns since we have no spec on OCAT data content.
                     dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
diff --git a/mica/archive/tests/test_cda.py b/mica/archive/tests/test_cda.py
index 0b15f30d..bcd7ceb8 100644
--- a/mica/archive/tests/test_cda.py
+++ b/mica/archive/tests/test_cda.py
@@ -94,6 +94,13 @@ def test_ocat_details_local_where(datafile):
     assert all('JET' in row['target_name'] for row in dat)
 
 
+def test_ocat_local_unicode():
+    """Unicode conversion from H5 encoding to UTF-8 is working"""
+    dat = get_ocat_local()
+    for key in ('observer', 'pi_name'):
+        assert np.any('ü' in val for val in dat[key])
+
+
 def test_ocat_special_query():
     dat = get_ocat_web(obsid=8008, rollReqs='true')
     assert len(dat) == 1
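
For readers who want to try the conversion strategy from PATCH 1/3 outside of mica, the sketch below is a minimal standalone version of the same byte-spreading trick. The function name fast_bytes_to_unicode and the sample array are illustrative only and do not appear in the mica code base; the fast path assumes a little-endian platform, where numpy's 'U' (UCS-4) dtype stores the ASCII byte of each character first.

import numpy as np


def fast_bytes_to_unicode(col):
    """Convert a 1-D fixed-width numpy bytes array to a unicode array.

    Pure-ASCII input is converted with a vectorized byte-spreading view;
    anything else falls back to np.char.decode.
    """
    itemsize = col.dtype.itemsize
    # View each fixed-width bytes value as a row of raw uint8 bytes.
    col_bytes = col.view((np.uint8, (itemsize,)))
    if np.all(col_bytes < 128):
        # ASCII only: place each byte at the start of a 4-byte UCS-4 code
        # unit (little-endian layout) and reinterpret the buffer as unicode.
        col_ucs4 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
        for ii in range(itemsize):
            col_ucs4[:, ii * 4] = col_bytes[:, ii]
        return col_ucs4.view(('U', itemsize)).flatten()
    # Non-ASCII content (e.g. accented names): use the standard decode.
    return np.char.decode(col, 'utf-8')


names = np.array([b'SMITH', b'JONES', b'LEE'], dtype='S20')
print(fast_bytes_to_unicode(names))  # ['SMITH' 'JONES' 'LEE'] with dtype <U20

The speedup in PATCH 1/3 comes from staying in vectorized numpy operations for ASCII-only columns and reserving the slower np.char.decode for columns such as pi_name and observer that can contain non-ASCII characters.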