Skip to content

Commit

Permalink
ENH: allow multiple table selection. retrieve multiple tables based o…
Browse files Browse the repository at this point in the history
…n the results from a selector table.

     this allows one to potentially put the data you really want to index in a single table, and your actual (wide)
     data in another to speed queries
  • Loading branch information
jreback committed Dec 28, 2012
1 parent 04a1aa9 commit 1c32ebf
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 26 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pandas 0.10.1
- support ``start`` and ``stop`` keywords in select to limit the row selection space
- added ``get_store`` context manager to automatically import with pandas
- added column filtering via ``columns`` keyword in select
- added methods select_multiple/select_as_coordinates to do multiple-table selection

**Bug fixes**

Expand Down
18 changes: 18 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,24 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u
store.get_table('df_dc').nrows
Multiple Table Queries
~~~~~~~~~~~~~~~~~~~~~~

New in 0.10.1 is the method ``select_multiple``, which can perform selections from multiple tables and return a combined result by applying ``where`` to a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria are contained in a relatively small table, and put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means: append to the tables in the same order, so that every table has the same number of rows. You can pass the ``axis`` parameter to control concatenation; the default is the ``columns`` axis (``axis=1``).

.. ipython:: python
index = date_range('1/1/2000', periods=8)
df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C'])
df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F'])
df2_mt['foo'] = 'bar'
# you can use data columns as well
store.append('df1_mt',df1_mt, columns = ['A','B'])
store.append('df2_mt',df2_mt)
store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt')
Delete from a Table
~~~~~~~~~~~~~~~~~~~
Expand Down
39 changes: 27 additions & 12 deletions doc/source/v0.10.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,36 @@ You can pass ``columns`` keyword to select to filter a list of the return column

.. ipython:: python

index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['foo', 'bar'])
df = DataFrame(np.random.randn(10, 3), index=index,
columns=['A', 'B', 'C'])
df
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['foo', 'bar'])
df = DataFrame(np.random.randn(10, 3), index=index,
columns=['A', 'B', 'C'])
df

store.append('mi',df)
store.select('mi')

# the levels are automatically included as data columns
store.select('mi', Term('foo=bar'))

Multi-table selection via ``select_multiple``: perform selections from multiple tables and return a combined result by applying ``where`` to a selector table.

.. ipython:: python

store.append('mi',df)
store.select('mi')
index = date_range('1/1/2000', periods=8)
df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C'])
df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F'])
df2_mt['foo'] = 'bar'

# the levels are automatically included as data columns
store.select('mi', Term('foo=bar'))
# you can use data columns as well
store.append('df1_mt',df1_mt, columns = ['A','B'])
store.append('df2_mt',df2_mt)

store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0' ], axis = 1, selector = 'df1_mt')

.. ipython:: python
:suppress:

Expand Down
122 changes: 108 additions & 14 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,65 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs)
raise KeyError('No object named %s in the file' % key)
return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs)

def select_as_coordinates(self, key, where=None, **kwargs):
    """
    Return the selection as a Coordinates, i.e. whatever the table's
    ``read_coordinates`` produces for the given criteria.

    Note: the start/stop/columns parameters are documented as inapplicable here.

    Parameters
    ----------
    key : object

    Optional Parameters
    -------------------
    where : list of Term (or convertible) objects, optional
    **kwargs : passed through unchanged to the table's ``read_coordinates``
    """
    table = self.get_table(key)
    return table.read_coordinates(where=where, **kwargs)

def select_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs):
    """ Retrieve pandas objects from multiple tables

    The ``where`` criteria are evaluated once, against the selector table;
    the resulting coordinates are then used to read every table in ``keys``
    and the pieces are concatenated.

    Parameters
    ----------
    keys : a list (or tuple) of the tables
    where : the selection criteria, applied to the selector table only
    selector : the table to apply the where criteria (defaults to keys[0] if not supplied)
    columns : the columns I want back
    axis : the concatenation axis (defaults to 1)
    **kwargs : only used when a single key is given, in which case they are
        passed on to ``select``

    Exceptions
    ----------
    TypeError if keys is not a list/tuple
    ValueError if keys is empty, or if the tables do not ALL have the same nrows
    (both are subclasses of Exception, so ``except Exception`` callers still work)
    NOTE(review): keys that don't refer to tables are presumably rejected by
    ``get_table`` -- not visible from here, confirm.
    """

    if not isinstance(keys, (list, tuple)):
        raise TypeError("keys must be a list/tuple")

    if not keys:
        raise ValueError("keys must have a non-zero length")

    # a single table needs no shared coordinates: defer to a plain select
    if len(keys) == 1:
        return self.select(key=keys[0], where=where, columns=columns, **kwargs)

    if selector is None:
        selector = keys[0]

    # collect the tables
    tbls = [self.get_table(k) for k in keys]

    # validate rows: coordinates taken from the selector are only meaningful
    # in the other tables if every table has the same number of rows
    nrows = tbls[0].nrows
    if any(t.nrows != nrows for t in tbls[1:]):
        raise ValueError("all tables must have exactly the same nrows!")

    # select coordinates from the selector table
    c = self.select_as_coordinates(selector, where)

    # collect the returns objs
    objs = [t.read(where=c, columns=columns) for t in tbls]

    # concat and return
    return concat(objs, axis=axis, verify_integrity=True)

def put(self, key, value, table=False, append=False,
compression=None, **kwargs):
"""
Expand Down Expand Up @@ -1318,7 +1377,7 @@ def pandas_type(self):
def __repr__(self):
""" return a pretty representatgion of myself """
self.infer_axes()
dc = ",dc->%s" % ','.join(self.data_columns) if len(self.data_columns) else ''
dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else ''
return "%s (typ->%s,nrows->%s,indexers->[%s]%s)" % (self.pandas_type,
self.table_type_short,
self.nrows,
Expand Down Expand Up @@ -1730,6 +1789,18 @@ def create_description(self, compression = None, complevel = None, expectedrows
def read(self, **kwargs):
    """ placeholder on the abstract table: always raises, the message asks subclasses to implement it """
    message = "cannot read on an abstract table: subclasses should implement"
    raise NotImplementedError(message)

def read_coordinates(self, where=None, **kwargs):
    """ build a Selection from ``where`` and return the coordinates it selects, wrapped in a Coordinates

    Returns False when ``infer_axes`` reports failure. Extra keyword
    arguments are handed to the Selection constructor. The Selection is also
    kept on ``self.selection``.
    """

    # validate the version
    self.validate_version(where)

    # infer the data kind; nothing to select from if that fails
    axes_inferred = self.infer_axes()
    if not axes_inferred:
        return False

    # create the selection
    self.selection = Selection(self, where=where, **kwargs)
    coords = self.selection.select_coords()
    return Coordinates(coords, group=self.group, where=where)

def write(self, **kwargs):
    """ placeholder on the abstract table: always raises NotImplementedError """
    message = "cannot write on an abstract table"
    raise NotImplementedError(message)

Expand Down Expand Up @@ -2475,6 +2546,19 @@ def convert_value(self, v):
# string quoting
return ["'" + v + "'", v]

class Coordinates(object):
    """ holds a returned coordinates list, useful to select the same rows from different tables

        values : holds the array of coordinates
        group  : the source group
        where  : the source where
        """

    def __init__(self, values, group, where, **kwargs):
        # any extra keyword arguments are accepted but not stored
        self.values, self.group, self.where = values, group, where

class Selection(object):
"""
Carries out a selection operation on a tables.Table object.
Expand All @@ -2493,17 +2577,23 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
self.stop = stop
self.condition = None
self.filter = None
self.terms = self.generate(where)

# create the numexpr & the filter
if self.terms:
conds = [ t.condition for t in self.terms if t.condition is not None ]
if len(conds):
self.condition = "(%s)" % ' & '.join(conds)
self.filter = []
for t in self.terms:
if t.filter is not None:
self.filter.append(t.filter)
self.terms = None
self.coordinates = None

if isinstance(where, Coordinates):
self.coordinates = where.values
else:
self.terms = self.generate(where)

# create the numexpr & the filter
if self.terms:
conds = [ t.condition for t in self.terms if t.condition is not None ]
if len(conds):
self.condition = "(%s)" % ' & '.join(conds)
self.filter = []
for t in self.terms:
if t.filter is not None:
self.filter.append(t.filter)

def generate(self, where):
""" where can be a : dict,list,tuple,string """
Expand All @@ -2528,13 +2618,17 @@ def select(self):
"""
if self.condition is not None:
return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
else:
return self.table.table.read(start=self.start,stop=self.stop)
elif self.coordinates is not None:
return self.table.table.readCoordinates(self.coordinates)
return self.table.table.read(start=self.start,stop=self.stop)

def select_coords(self):
    """
    generate the selection as an array of row coordinates

    Returns
    -------
    - the result of ``getWhereList`` on the underlying table when a
      condition was generated from the where
    - the stored coordinates when the selection was built from a
      Coordinates (previously every row was returned in that case)
    - otherwise every row number inside the start/stop window (all rows
      when neither is set; previously start/stop were ignored here)
    """
    if self.condition is not None:
        return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True)

    # a selection built from a Coordinates already knows its rows
    if self.coordinates is not None:
        return self.coordinates

    # no criteria: slice.indices resolves None / negative / out-of-range
    # bounds against nrows, exactly as slicing a sequence would
    start, stop, _ = slice(self.start, self.stop).indices(self.table.nrows)
    return np.arange(start, stop)


Expand Down
81 changes: 81 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1365,6 +1365,82 @@ def test_frame_select(self):
#self.assertRaises(Exception, self.store.select,
# 'frame', [crit1, crit2])

def test_coordinates(self):
    """ coordinates selected from a table can be fed back as ``where`` to read the same rows, from that table or from a second one """
    store = self.store

    ts_frame = tm.makeTimeDataFrame()
    store.remove('df')
    store.append('df', ts_frame)

    # all
    coords = store.select_as_coordinates('df')
    assert (coords.values == np.arange(len(ts_frame.index))).all()

    # get coordinates back & test vs frame
    store.remove('df')
    small = DataFrame(dict(A = range(5), B = range(5)))
    store.append('df', small)

    coords = store.select_as_coordinates('df', ['index<3'])
    assert (coords.values == np.arange(3)).all()
    tm.assert_frame_equal(store.select('df', where=coords), small.ix[0:2, :])

    coords = store.select_as_coordinates('df', ['index>=3', 'index<=4'])
    assert (coords.values == np.arange(2) + 3).all()
    tm.assert_frame_equal(store.select('df', where=coords), small.ix[3:4, :])

    # multiple tables
    store.remove('df1')
    store.remove('df2')
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
    store.append('df1', left, columns=['A', 'B'])
    store.append('df2', right)

    # coordinates come from df1 only, then drive the read of both tables
    coords = store.select_as_coordinates('df1', ['A>0', 'B>0'])
    pieces = [store.select(name, coords) for name in ('df1', 'df2')]
    result = concat(pieces, axis=1)

    both = concat([left, right], axis=1)
    expected = both[(both.A > 0) & (both.B > 0)]
    tm.assert_frame_equal(result, expected)

def test_select_multiple(self):
    """ select_multiple: argument validation, the single-key shortcut, and multi-table selection with either table as selector """
    store = self.store

    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
    right['foo'] = 'bar'
    store.append('df1', left, columns=['A', 'B'])
    store.append('df2', right)

    # exceptions
    for bad_keys in (None, [None]):
        self.assertRaises(Exception, store.select_multiple, bad_keys, where=['A>0', 'B>0'], selector='df1')

    # default select
    result = store.select('df1', ['A>0', 'B>0'])
    expected = store.select_multiple(['df1'], where=['A>0', 'B>0'], selector='df1')
    tm.assert_frame_equal(result, expected)

    # multiple
    result = store.select_multiple(['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
    combined = concat([left, right], axis=1)
    expected = combined[(combined.A > 0) & (combined.B > 0)]
    tm.assert_frame_equal(result, expected)

    # multiple (diff selector)
    result = store.select_multiple(['df1', 'df2'], where=[Term('index', '>', right.index[4])], selector='df2')
    combined = concat([left, right], axis=1)
    tm.assert_frame_equal(result, combined[5:])

    # test exception for diff rows
    store.append('df3', tm.makeTimeDataFrame(nper=50))
    self.assertRaises(Exception, store.select_multiple, ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1')

def test_start_stop(self):

df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20)))
Expand All @@ -1374,6 +1450,11 @@ def test_start_stop(self):
expected = df.ix[0:4,['A']]
tm.assert_frame_equal(result, expected)

# out of range
result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=30, stop=40)
assert(len(result) == 0)
assert(type(result) == DataFrame)

def test_select_filter_corner(self):
df = DataFrame(np.random.randn(50, 100))
df.index = ['%.3d' % c for c in df.index]
Expand Down

0 comments on commit 1c32ebf

Please sign in to comment.