Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve speed of getting local OCAT data #272

Merged
merged 3 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions mica/archive/cda/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def update_ocat_local(datafile, **params):
dat[name] = np.char.encode(col, 'utf-8')

dat.write(datafile, path='data', serialize_meta=True, overwrite=True,
format='hdf5', compression=True)
format='hdf5', compression=False)


def get_ocat_local(obsid=None, *,
Expand Down Expand Up @@ -596,7 +596,27 @@ def get_ocat_local(obsid=None, *,
for name, col in dat.columns.items():
zero_length = len(dat) == 0
if col.info.dtype.kind == 'S':
dat[name] = col.astype('U') if zero_length else np.char.decode(col, 'utf-8')
if zero_length:
dat[name] = col.astype('U')
else:
# Try a faster way of converting to unicode for ASCII input.
# View the column as a numpy array of bytes and look for values
# of 128 or above that signify a non-ASCII character.
itemsize = col.dtype.itemsize
col_bytes = col.view((np.uint8, (itemsize,)))
if np.all(col_bytes < 128):
# This is ASCII so the numpy unicode version (4 bytes per character) is
# just the same but with only the first byte of each character set.
col_utf8 = np.zeros((col_bytes.shape[0], itemsize * 4), dtype=np.uint8)
for ii in range(itemsize):
col_utf8[:, ii * 4] = col_bytes[:, ii]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this loop the same as this?

col_utf8_2[:,::4] = col_bytes

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, but it requires a little thinking to be sure it will be right. Writing it out in a loop makes the intent blindingly obvious and is effectively just as fast (given all the other overhead).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made this comment after testing that it was equivalent, so my question was a bit rhetorical, but ok.

dat[name] = col_utf8.view(('U', itemsize)).flatten()
else:
# Use the standard slow way for non-ASCII input. This is
# commonly run for pi_name and observer columns that have
# names with unicode characters, but it might run for other
# columns since we have no spec on OCAT data content.
dat[name] = np.char.decode(col, 'utf-8')

# Match target_name as a substring of the table target_name column.
if len(dat) > 0 and target_name is not None and not resolve_name:
Expand Down
7 changes: 7 additions & 0 deletions mica/archive/tests/test_cda.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def test_ocat_details_local_where(datafile):
assert all('JET' in row['target_name'] for row in dat)


def test_ocat_local_unicode():
    """Unicode conversion of the locally stored OCAT string columns is working.

    Checks that at least one value in each of the ``observer`` and ``pi_name``
    columns returned by ``get_ocat_local()`` contains the non-ASCII character
    'ü'.
    """
    dat = get_ocat_local()
    for key in ('observer', 'pi_name'):
        # Use the builtin any() here: np.any() does not iterate a generator
        # expression, it just returns a truthy object, so the assertion
        # would pass regardless of the column contents.
        assert any('ü' in val for val in dat[key])


def test_ocat_special_query():
dat = get_ocat_web(obsid=8008, rollReqs='true')
assert len(dat) == 1
Expand Down