Skip to content

Commit

Permalink
BUG: Bug in selection from a HDFStore with a fixed format and start a…
Browse files Browse the repository at this point in the history
…nd/or stop will now return the selected range

closes #8287

Author: Jeff Reback <jeff@reback.net>

Closes #13267 from jreback/stop and squashes the following commits:

39faa23 [Jeff Reback] BUG: Bug in selection from a HDFStore with a fixed format and start and/or stop specified will now return the selected range
  • Loading branch information
jreback committed May 24, 2016
1 parent 75714de commit 69ad08b
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 44 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ Other enhancements
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)


.. _whatsnew_0182.api:

API changes
Expand Down Expand Up @@ -207,6 +208,7 @@ Bug Fixes
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`)


- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
Expand Down
120 changes: 79 additions & 41 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1314,12 +1314,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None,
self.s = s
self.func = func
self.where = where
self.nrows = nrows or 0
self.start = start or 0

if stop is None:
stop = self.nrows
self.stop = min(self.nrows, stop)
# set start/stop if they are not set if we are a table
if self.s.is_table:
if nrows is None:
nrows = 0
if start is None:
start = 0
if stop is None:
stop = nrows
stop = min(nrows, stop)

self.nrows = nrows
self.start = start
self.stop = stop

self.coordinates = None
if iterator or chunksize is not None:
Expand Down Expand Up @@ -2303,14 +2311,23 @@ def f(values, freq=None, tz=None):
return klass

def validate_read(self, kwargs):
if kwargs.get('columns') is not None:
"""
remove table keywords from kwargs and return
raise if any keywords are passed which are not-None
"""
kwargs = copy.copy(kwargs)

columns = kwargs.pop('columns', None)
if columns is not None:
raise TypeError("cannot pass a column specification when reading "
"a Fixed format store. this store must be "
"selected in its entirety")
if kwargs.get('where') is not None:
where = kwargs.pop('where', None)
if where is not None:
raise TypeError("cannot pass a where specification when reading "
"from a Fixed format store. this store must be "
"selected in its entirety")
return kwargs

@property
def is_exists(self):
Expand All @@ -2329,11 +2346,11 @@ def get_attrs(self):
def write(self, obj, **kwargs):
self.set_attrs()

def read_array(self, key):
def read_array(self, key, start=None, stop=None):
""" read an array for the specified node (off of group """
import tables
node = getattr(self.group, key)
data = node[:]
data = node[start:stop]
attrs = node._v_attrs

transposed = getattr(attrs, 'transposed', False)
Expand Down Expand Up @@ -2363,17 +2380,17 @@ def read_array(self, key):
else:
return ret

def read_index(self, key):
def read_index(self, key, **kwargs):
variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key))

if variety == u('multi'):
return self.read_multi_index(key)
return self.read_multi_index(key, **kwargs)
elif variety == u('block'):
return self.read_block_index(key)
return self.read_block_index(key, **kwargs)
elif variety == u('sparseint'):
return self.read_sparse_intindex(key)
return self.read_sparse_intindex(key, **kwargs)
elif variety == u('regular'):
_, index = self.read_index_node(getattr(self.group, key))
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
return index
else: # pragma: no cover
raise TypeError('unrecognized index variety: %s' % variety)
Expand Down Expand Up @@ -2411,19 +2428,19 @@ def write_block_index(self, key, index):
self.write_array('%s_blengths' % key, index.blengths)
setattr(self.attrs, '%s_length' % key, index.length)

def read_block_index(self, key):
def read_block_index(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
blocs = self.read_array('%s_blocs' % key)
blengths = self.read_array('%s_blengths' % key)
blocs = self.read_array('%s_blocs' % key, **kwargs)
blengths = self.read_array('%s_blengths' % key, **kwargs)
return BlockIndex(length, blocs, blengths)

def write_sparse_intindex(self, key, index):
self.write_array('%s_indices' % key, index.indices)
setattr(self.attrs, '%s_length' % key, index.length)

def read_sparse_intindex(self, key):
def read_sparse_intindex(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
indices = self.read_array('%s_indices' % key)
indices = self.read_array('%s_indices' % key, **kwargs)
return IntIndex(length, indices)

def write_multi_index(self, key, index):
Expand All @@ -2448,27 +2465,28 @@ def write_multi_index(self, key, index):
label_key = '%s_label%d' % (key, i)
self.write_array(label_key, lab)

def read_multi_index(self, key):
def read_multi_index(self, key, **kwargs):
nlevels = getattr(self.attrs, '%s_nlevels' % key)

levels = []
labels = []
names = []
for i in range(nlevels):
level_key = '%s_level%d' % (key, i)
name, lev = self.read_index_node(getattr(self.group, level_key))
name, lev = self.read_index_node(getattr(self.group, level_key),
**kwargs)
levels.append(lev)
names.append(name)

label_key = '%s_label%d' % (key, i)
lab = self.read_array(label_key)
lab = self.read_array(label_key, **kwargs)
labels.append(lab)

return MultiIndex(levels=levels, labels=labels, names=names,
verify_integrity=True)

def read_index_node(self, node):
data = node[:]
def read_index_node(self, node, start=None, stop=None):
data = node[start:stop]
# If the index was an empty array write_array_empty() will
# have written a sentinel. Here we relace it with the original.
if ('shape' in node._v_attrs and
Expand Down Expand Up @@ -2607,17 +2625,17 @@ def write_array(self, key, value, items=None):

class LegacyFixed(GenericFixed):

def read_index_legacy(self, key):
def read_index_legacy(self, key, start=None, stop=None):
node = getattr(self.group, key)
data = node[:]
data = node[start:stop]
kind = node._v_attrs.kind
return _unconvert_index_legacy(data, kind, encoding=self.encoding)


class LegacySeriesFixed(LegacyFixed):

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
values = self.read_array('values')
return Series(values, index=index)
Expand All @@ -2626,7 +2644,7 @@ def read(self, **kwargs):
class LegacyFrameFixed(LegacyFixed):

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
columns = self.read_index_legacy('columns')
values = self.read_array('values')
Expand All @@ -2645,9 +2663,9 @@ def shape(self):
return None

def read(self, **kwargs):
self.validate_read(kwargs)
index = self.read_index('index')
values = self.read_array('values')
kwargs = self.validate_read(kwargs)
index = self.read_index('index', **kwargs)
values = self.read_array('values', **kwargs)
return Series(values, index=index, name=self.name)

def write(self, obj, **kwargs):
Expand All @@ -2657,12 +2675,25 @@ def write(self, obj, **kwargs):
self.attrs.name = obj.name


class SparseSeriesFixed(GenericFixed):
class SparseFixed(GenericFixed):

def validate_read(self, kwargs):
"""
we don't support start, stop kwds in Sparse
"""
kwargs = super(SparseFixed, self).validate_read(kwargs)
if 'start' in kwargs or 'stop' in kwargs:
raise NotImplementedError("start and/or stop are not supported "
"in fixed Sparse reading")
return kwargs


class SparseSeriesFixed(SparseFixed):
pandas_kind = u('sparse_series')
attributes = ['name', 'fill_value', 'kind']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
index = self.read_index('index')
sp_values = self.read_array('sp_values')
sp_index = self.read_index('sp_index')
Expand All @@ -2681,12 +2712,12 @@ def write(self, obj, **kwargs):
self.attrs.kind = obj.kind


class SparseFrameFixed(GenericFixed):
class SparseFrameFixed(SparseFixed):
pandas_kind = u('sparse_frame')
attributes = ['default_kind', 'default_fill_value']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
columns = self.read_index('columns')
sdict = {}
for c in columns:
Expand Down Expand Up @@ -2714,12 +2745,12 @@ def write(self, obj, **kwargs):
self.write_index('columns', obj.columns)


class SparsePanelFixed(GenericFixed):
class SparsePanelFixed(SparseFixed):
pandas_kind = u('sparse_panel')
attributes = ['default_kind', 'default_fill_value']

def read(self, **kwargs):
self.validate_read(kwargs)
kwargs = self.validate_read(kwargs)
items = self.read_index('items')

sdict = {}
Expand Down Expand Up @@ -2782,19 +2813,26 @@ def shape(self):
except:
return None

def read(self, **kwargs):
self.validate_read(kwargs)
def read(self, start=None, stop=None, **kwargs):
# start, stop applied to rows, so 0th axis only

kwargs = self.validate_read(kwargs)
select_axis = self.obj_type()._get_block_manager_axis(0)

axes = []
for i in range(self.ndim):
ax = self.read_index('axis%d' % i)

_start, _stop = (start, stop) if i == select_axis else (None, None)
ax = self.read_index('axis%d' % i, start=_start, stop=_stop)
axes.append(ax)

items = axes[0]
blocks = []
for i in range(self.nblocks):

blk_items = self.read_index('block%d_items' % i)
values = self.read_array('block%d_values' % i)
values = self.read_array('block%d_values' % i,
start=_start, stop=_stop)
blk = make_block(values,
placement=items.get_indexer(blk_items))
blocks.append(blk)
Expand Down
54 changes: 51 additions & 3 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self):
result = store.select('df', where='values>2.0')
assert_frame_equal(result, expected)

def test_start_stop(self):
def test_start_stop_table(self):

with ensure_clean_store(self.path) as store:

# table
df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
store.append('df', df)

Expand All @@ -4143,8 +4144,55 @@ def test_start_stop(self):
# out of range
result = store.select(
'df', [Term("columns=['A']")], start=30, stop=40)
assert(len(result) == 0)
assert(type(result) == DataFrame)
self.assertTrue(len(result) == 0)
expected = df.ix[30:40, ['A']]
tm.assert_frame_equal(result, expected)

def test_start_stop_fixed(self):

with ensure_clean_store(self.path) as store:

# fixed, GH 8287
df = DataFrame(dict(A=np.random.rand(20),
B=np.random.rand(20)),
index=pd.date_range('20130101', periods=20))
store.put('df', df)

result = store.select(
'df', start=0, stop=5)
expected = df.iloc[0:5, :]
tm.assert_frame_equal(result, expected)

result = store.select(
'df', start=5, stop=10)
expected = df.iloc[5:10, :]
tm.assert_frame_equal(result, expected)

# out of range
result = store.select(
'df', start=30, stop=40)
expected = df.iloc[30:40, :]
tm.assert_frame_equal(result, expected)

# series
s = df.A
store.put('s', s)
result = store.select('s', start=0, stop=5)
expected = s.iloc[0:5]
tm.assert_series_equal(result, expected)

result = store.select('s', start=5, stop=10)
expected = s.iloc[5:10]
tm.assert_series_equal(result, expected)

# sparse; not implemented
df = tm.makeDataFrame()
df.ix[3:5, 1:3] = np.nan
df.ix[8:10, -2] = np.nan
dfs = df.to_sparse()
store.put('dfs', dfs)
with self.assertRaises(NotImplementedError):
store.select('dfs', start=0, stop=5)

def test_select_filter_corner(self):

Expand Down

0 comments on commit 69ad08b

Please sign in to comment.