Skip to content

Commit

Permalink
MDP-1754 Improve the memory and cpu performance of reading a subset of
Browse files Browse the repository at this point in the history
columns.

By using the index bitfield masks we can return a sparse dataframe.

This is a behaviour change, as we don't return rows for timestamps where
the field wasn't updated.

Old code:
=========

# All columns
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206))
1 loops, best of 3: 1.99 s per loop

# Multiple columns
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206),
columns=['DISC_BID1', 'BID'])
10 loops, best of 3: 82.2 ms per loop

# Single very sparse column
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206),
columns=['DISC_BID1'])
10 loops, best of 3: 76.4 ms per loop


New code:
=========

# All columns
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206))
1 loop, best of 3: 2.29 s per loop

# Multiple columns
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206),
columns=['DISC_BID1', 'BID'])
10 loops, best of 3: 75.4 ms per loop

# Single very sparse column
%timeit l.read('3284.JP', date_range=adu.DateRange(20170101, 20170206),
columns=['DISC_BID1'])
10 loops, best of 3: 47.4 ms per loop

Fixes pandas-dev#290
  • Loading branch information
jamesblackburn committed Feb 16, 2017
1 parent 717fd6d commit 1d85fc9
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
20 changes: 18 additions & 2 deletions arctic/tickstore/tickstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,25 @@ def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_i
raise ArcticException("Unhandled document version: %s" % doc[VERSION])
rtn[INDEX] = np.cumsum(np.fromstring(lz4.decompress(doc[INDEX]), dtype='uint64'))
doc_length = len(rtn[INDEX])
rtn_length = len(rtn[INDEX])
column_set.update(doc[COLUMNS].keys())

# get the mask for the columns we're about to load
union_mask = np.zeros((doc_length + 7) // 8, dtype='uint8')
for c in column_set:
try:
coldata = doc[COLUMNS][c]
mask = np.fromstring(lz4.decompress(coldata[ROWMASK]), dtype='uint8')
union_mask = union_mask | mask
except KeyError:
rtn[c] = None
union_mask = np.unpackbits(union_mask)[:doc_length].astype('bool')
rtn_length = np.sum(union_mask)

rtn[INDEX] = rtn[INDEX][union_mask]
if include_symbol:
rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length
column_set.update(doc[COLUMNS].keys())

# Unpack each requested column in turn
for c in column_set:
try:
coldata = doc[COLUMNS][c]
Expand All @@ -427,6 +442,7 @@ def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_i
rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
rowmask = np.unpackbits(np.fromstring(lz4.decompress(coldata[ROWMASK]),
dtype='uint8'))[:doc_length].astype('bool')
rowmask = rowmask[union_mask]
rtn[c][rowmask] = values
except KeyError:
rtn[c] = None
Expand Down
15 changes: 6 additions & 9 deletions tests/integration/tickstore/test_ts_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def test_read_allow_secondary(tickstore_lib):

def test_read_symbol_as_column(tickstore_lib):
data = [{'ASK': 1545.25,
'index': 1185076787070},
{'CUMVOL': 354.0,
'index': 1185141600600}]
'index': 1185076787070},
{'CUMVOL': 354.0,
'index': 1185141600600}]
tickstore_lib.write('FEED::SYMBOL', data)

df = tickstore_lib.read('FEED::SYMBOL', columns=['SYMBOL'])
df = tickstore_lib.read('FEED::SYMBOL', columns=['SYMBOL', 'CUMVOL'])
assert all(df['SYMBOL'].values == ['FEED::SYMBOL'])


Expand Down Expand Up @@ -596,15 +596,12 @@ def test_read_with_image(tickstore_lib):
# Read one column from the updates
df = tickstore_lib.read('SYM', columns=('a',), date_range=dr, include_images=True)
assert set(df.columns) == set(('a',))
assert_array_equal(df['a'].values, np.array([37, 1, np.nan]))
assert_array_equal(df['a'].values, np.array([37, 1]))
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))

# Read just the image column
df = tickstore_lib.read('SYM', columns=['c'], date_range=dr, include_images=True)
assert set(df.columns) == set(['c'])
assert_array_equal(df['c'].values, np.array([2, np.nan, np.nan]))
assert_array_equal(df['c'].values, np.array([2]))
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))

0 comments on commit 1d85fc9

Please sign in to comment.