From f18dcf9f45f72123f415cc8be4c5c4d220cf246c Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Mon, 9 May 2022 16:51:59 -0400
Subject: [PATCH 1/3] Improve speed of getting local OCAT data

---
 mica/archive/cda/services.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index b37bb3e1..9bc7652e 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -596,7 +596,25 @@ def get_ocat_local(obsid=None, *,
     for name, col in dat.columns.items():
         zero_length = len(dat) == 0
         if col.info.dtype.kind == 'S':
-            dat[name] = col.astype('U') if zero_length else np.char.decode(col, 'utf-8')
+            if zero_length:
+                dat[name] = col.astype('U')
+            else:
+                # Try a faster way of converting to unicode for ASCII input.
+                # View the column as a numpy array of bytes and look for values
+                # above 128 that signify a non-ASCII character.
+                itemsize = col.dtype.itemsize
+                col_bytes = col.view((np.uint8, (itemsize,)))
+                if np.all(col_bytes.flatten() < 128):
+                    # This is ASCII so the numpy UTF-8 version is just the same
+                    # but with the single leading byte set.
+                    col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
+                    for ii in range(itemsize):
+                        col_utf8[:, ii * 4] = col_bytes[:, ii]
+                    dat[name] = col_utf8.view(('U', itemsize)).flatten()
+                else:
+                    # Use the standard slow way for non-ASCII input. This can
+                    # run for pi_name and observer columns.
+                    dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
     if len(dat) > 0 and target_name is not None and not resolve_name:

From cd10ddff766480db9bb7ec7125aba941e92770cd Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Mon, 9 May 2022 17:08:26 -0400
Subject: [PATCH 2/3] Do not use compression saving OCAT local HDF5 file

---
 mica/archive/cda/services.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index 9bc7652e..a6750dcf 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -528,7 +528,7 @@ def update_ocat_local(datafile, **params):
             dat[name] = np.char.encode(col, 'utf-8')
 
     dat.write(datafile, path='data', serialize_meta=True, overwrite=True,
-              format='hdf5', compression=True)
+              format='hdf5', compression=False)
 
 
 def get_ocat_local(obsid=None, *,

From db7b2e0bf578284920d7ae358d65cffea18ecec4 Mon Sep 17 00:00:00 2001
From: Tom Aldcroft
Date: Thu, 12 May 2022 08:21:36 -0400
Subject: [PATCH 3/3] Address review comments and add a test

---
 mica/archive/cda/services.py   | 8 +++++---
 mica/archive/tests/test_cda.py | 7 +++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/mica/archive/cda/services.py b/mica/archive/cda/services.py
index a6750dcf..497a17bb 100644
--- a/mica/archive/cda/services.py
+++ b/mica/archive/cda/services.py
@@ -604,7 +604,7 @@ def get_ocat_local(obsid=None, *,
                 # above 128 that signify a non-ASCII character.
                 itemsize = col.dtype.itemsize
                 col_bytes = col.view((np.uint8, (itemsize,)))
-                if np.all(col_bytes.flatten() < 128):
+                if np.all(col_bytes < 128):
                     # This is ASCII so the numpy UTF-8 version is just the same
                     # but with the single leading byte set.
                     col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
@@ -612,8 +612,10 @@ def get_ocat_local(obsid=None, *,
                         col_utf8[:, ii * 4] = col_bytes[:, ii]
                     dat[name] = col_utf8.view(('U', itemsize)).flatten()
                 else:
-                    # Use the standard slow way for non-ASCII input. This can
-                    # run for pi_name and observer columns.
+                    # Use the standard slow way for non-ASCII input. This is
+                    # commonly run for pi_name and observer columns that have
+                    # names with unicode characters, but it might run for other
+                    # columns since we have no spec on OCAT data content.
                     dat[name] = np.char.decode(col, 'utf-8')
 
     # Match target_name as a substring of the table target_name column.
diff --git a/mica/archive/tests/test_cda.py b/mica/archive/tests/test_cda.py
index 0b15f30d..bcd7ceb8 100644
--- a/mica/archive/tests/test_cda.py
+++ b/mica/archive/tests/test_cda.py
@@ -94,6 +94,13 @@ def test_ocat_details_local_where(datafile):
     assert all('JET' in row['target_name'] for row in dat)
 
 
+def test_ocat_local_unicode():
+    """Unicode conversion from H5 encoding to UTF-8 is working"""
+    dat = get_ocat_local()
+    for key in ('observer', 'pi_name'):
+        assert np.any('ü' in val for val in dat[key])
+
+
 def test_ocat_special_query():
     dat = get_ocat_web(obsid=8008, rollReqs='true')
     assert len(dat) == 1
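
For readers who want to try the conversion strategy from PATCH 1/3 outside of mica, the sketch below is a minimal standalone version of the same byte-spreading trick. The function name fast_bytes_to_unicode and the sample array are illustrative only and do not appear in the mica code base; the fast path assumes a little-endian platform, where numpy's 'U' (UCS-4) dtype stores the ASCII byte of each character first.

import numpy as np


def fast_bytes_to_unicode(col):
    """Convert a 1-D fixed-width numpy bytes array to a unicode array.

    Pure-ASCII input is converted with a vectorized byte-spreading view;
    anything else falls back to np.char.decode.
    """
    itemsize = col.dtype.itemsize
    # View each fixed-width bytes value as a row of raw uint8 bytes.
    col_bytes = col.view((np.uint8, (itemsize,)))
    if np.all(col_bytes < 128):
        # ASCII only: place each byte at the start of a 4-byte UCS-4 code
        # unit (little-endian layout) and reinterpret the buffer as unicode.
        col_ucs4 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
        for ii in range(itemsize):
            col_ucs4[:, ii * 4] = col_bytes[:, ii]
        return col_ucs4.view(('U', itemsize)).flatten()
    # Non-ASCII content (e.g. accented names): use the standard decode.
    return np.char.decode(col, 'utf-8')


names = np.array([b'SMITH', b'JONES', b'LEE'], dtype='S20')
print(fast_bytes_to_unicode(names))  # ['SMITH' 'JONES' 'LEE'] with dtype <U20

The speedup in PATCH 1/3 comes from staying in vectorized numpy operations for ASCII-only columns and reserving the slower np.char.decode for columns such as pi_name and observer that can contain non-ASCII characters.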