ENH: allow multiple table selection. Retrieve multiple tables based on the results from a selector table. This allows one to potentially put the data you really want to index in a single table, and your actual (wide) data in another, to speed up queries.
pandas-dev · Dec 28, 2012 · 1c32ebf · 1c32ebf
1 parent 04a1aa9
commit 1c32ebf
Show file tree

Hide file tree

Showing 5 changed files with 235 additions and 26 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -40,6 +40,7 @@ pandas 0.10.1
     - support ``start`` and ``stop`` keywords in select to limit the row selection space
     - added ``get_store`` context manager to automatically import with pandas
     - added column filtering via ``columns`` keyword in select
+    - added methods select_multiple/select_as_coordinates to do multiple-table selection
 
 **Bug fixes**
 

diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1254,6 +1254,24 @@ If you want to inspect the table object, retrieve via ``get_table``. You could u
 
    store.get_table('df_dc').nrows
 
+Multiple Table Queries
+~~~~~~~~~~~~~~~~~~~~~~
+
+New in 0.10.1 is the method ``select_multiple``, which can perform selections from multiple tables and return a combined result by using ``where`` on a selector table. The purpose is to allow fast selection from really wide tables. Construct 2 (or more) tables, where your indexing criteria are contained in a relatively small table, then put your data in another table. Queries will be quite fast, yet you can allow your tables to grow (in column space). **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This means you must append to the tables in the same order. You can pass the ``axis`` parameter to control concatenation; the default is the ``columns`` axis.
+
+.. ipython:: python
+
+   index = date_range('1/1/2000', periods=8)
+   df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C'])
+   df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F'])
+   df2_mt['foo'] = 'bar'
+
+   # you can use data columns as well
+   store.append('df1_mt',df1_mt, columns = ['A','B'])
+   store.append('df2_mt',df2_mt)
+
+   store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], axis = 1, selector = 'df1_mt')
+  
 
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt
@@ -57,21 +57,36 @@ You can pass ``columns`` keyword to select to filter a list of the return column
 
 .. ipython:: python
 
-        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
-                                   ['one', 'two', 'three']],
-                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
-                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
-                           names=['foo', 'bar'])
-        df = DataFrame(np.random.randn(10, 3), index=index,
-                       columns=['A', 'B', 'C'])
-        df
+   index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+                              ['one', 'two', 'three']],
+                      labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+                      names=['foo', 'bar'])
+   df = DataFrame(np.random.randn(10, 3), index=index,
+                  columns=['A', 'B', 'C'])
+   df
+
+   store.append('mi',df)
+   store.select('mi')
+
+   # the levels are automatically included as data columns
+   store.select('mi', Term('foo=bar'))
+
+Multi-table selection via ``select_multiple`` performs selections from multiple tables and returns a combined result, by applying ``where`` to a selector table.
+
+.. ipython:: python
 
-        store.append('mi',df)
-        store.select('mi')
+   index = date_range('1/1/2000', periods=8)
+   df1_mt = DataFrame(randn(8, 3), index=index, columns=['A', 'B', 'C'])
+   df2_mt = DataFrame(randn(8, 3), index=index, columns=['D', 'E', 'F'])
+   df2_mt['foo'] = 'bar'
 
-	# the levels are automatically included as data columns
-        store.select('mi', Term('foo=bar'))
+   # you can use data columns as well
+   store.append('df1_mt',df1_mt, columns = ['A','B'])
+   store.append('df2_mt',df2_mt)
 
+   store.select_multiple(['df1_mt','df2_mt'], where = [ 'A>0' ], axis = 1, selector = 'df1_mt')
+
 .. ipython:: python
    :suppress:
 

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -358,6 +358,65 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs)
             raise KeyError('No object named %s in the file' % key)
         return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs)
 
+    def select_as_coordinates(self, key, where=None, **kwargs):
+        """
+        Return the selection as a Coordinates object.
+
+        Looks up the table stored under ``key`` via ``get_table`` and
+        delegates to that table's ``read_coordinates`` method.
+
+        Parameters
+        ----------
+        key : object
+            identifies the table; passed to ``get_table``
+
+        Optional Parameters
+        -------------------
+        where : list of Term (or convertible) objects, optional
+        **kwargs : forwarded unchanged to ``read_coordinates``
+
+        Notes
+        -----
+        The original author notes that the start/stop/columns parameters
+        are inapplicable here.
+        """
+        return self.get_table(key).read_coordinates(where = where, **kwargs)
+
+    def select_multiple(self, keys, where=None, selector=None, columns=None, axis=1, **kwargs):
+        """ Retrieve pandas objects from multiple tables
+
+        The where criteria are evaluated against the selector table only; the
+        resulting coordinates are then used to read every table in keys.
+
+        Parameters
+        ----------
+        keys : a list/tuple of the table keys
+        where : the selection criteria, applied to the selector table
+        selector : the table to apply the where criteria (defaults to keys[0] if not supplied)
+        columns : the columns I want back
+        axis : the concatenation axis (defaults to 1)
+        **kwargs : only used when a single key is given, where they are forwarded to ``select``
+
+        Returns
+        -------
+        the result of ``select`` when a single key is given; otherwise the
+        objects read from each table, concatenated along ``axis``
+
+        Exceptions
+        ----------
+        raise if keys is not a list/tuple or is empty
+        raise if any of the keys don't refer to tables or if they are not ALL THE SAME DIMENSIONS
+        """
+
+        if not isinstance(keys, (list,tuple)):
+            raise Exception("keys must be a list/tuple")
+
+        if len(keys) == 0:
+            raise Exception("keys must have a non-zero length")
+
+        # a single key needs no coordination between tables: defer to select
+        if len(keys) == 1:
+            return self.select(key = keys[0], where=where, columns = columns, **kwargs)
+
+        if selector is None:
+            selector = keys[0]
+
+        # collect the tables
+        tbls = [ self.get_table(k) for k in keys ]
+
+        # validate rows: every table is compared against the first one
+        nrows = tbls[0].nrows
+        for t in tbls:
+            if t.nrows != nrows:
+                raise Exception("all tables must have exactly the same nrows!")
+
+        # select coordinates from the selector table
+        c = self.select_as_coordinates(selector, where)
+
+        # collect the returns objs (the same coordinates are read from each table)
+        objs = [ t.read(where = c, columns = columns) for t in tbls ]
+
+        # concat and return
+        return concat(objs, axis = axis, verify_integrity = True)
+
     def put(self, key, value, table=False, append=False,
             compression=None, **kwargs):
         """
@@ -1318,7 +1377,7 @@ def pandas_type(self):
     def __repr__(self):
         """ return a pretty representatgion of myself """
         self.infer_axes()
-        dc = ",dc->%s" % ','.join(self.data_columns) if len(self.data_columns) else ''
+        dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else ''
         return "%s (typ->%s,nrows->%s,indexers->[%s]%s)" % (self.pandas_type,
                                                                      self.table_type_short,
                                                                      self.nrows,
@@ -1730,6 +1789,18 @@ def create_description(self, compression = None, complevel = None, expectedrows
     def read(self, **kwargs):
         raise NotImplementedError("cannot read on an abstract table: subclasses should implement")
 
+    def read_coordinates(self, where=None, **kwargs):
+        """ select the coordinates described by where and wrap them in a Coordinates
+
+        Parameters
+        ----------
+        where : the selection criteria, passed to validate_version and to Selection
+        **kwargs : forwarded unchanged to the Selection constructor
+
+        Returns
+        -------
+        False if infer_axes() returns a false value; otherwise a Coordinates
+        holding the result of select_coords(), this table's group and where
+        """
+
+        # validate the version
+        self.validate_version(where)
+
+        # infer the data kind
+        if not self.infer_axes(): return False
+
+        # create the selection (kept on the instance as self.selection)
+        self.selection = Selection(self, where = where, **kwargs)
+        return Coordinates(self.selection.select_coords(), group = self.group, where = where)
+
     def write(self, **kwargs):
         raise NotImplementedError("cannot write on an abstract table")
 
@@ -2475,6 +2546,19 @@ def convert_value(self, v):
         # string quoting
         return ["'" + v + "'", v]
 
+class Coordinates(object):
+    """ holds a returned coordinates list, useful to select the same rows from different tables
+
+    values      : holds the array of coordinates
+    group       : the source group
+    where       : the source where
+    """
+
+    def __init__(self, values, group, where, **kwargs):
+        # extra keyword arguments are accepted but not used
+        self.values = values
+        self.group  = group
+        self.where  = where
+
 class Selection(object):
     """
     Carries out a selection operation on a tables.Table object.
@@ -2493,17 +2577,23 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.stop       = stop
         self.condition  = None
         self.filter     = None
-        self.terms      = self.generate(where)
-
-        # create the numexpr & the filter
-        if self.terms:
-            conds = [ t.condition for t in self.terms if t.condition is not None ]
-            if len(conds):
-                self.condition = "(%s)" % ' & '.join(conds)
-            self.filter = []
-            for t in self.terms:
-                if t.filter is not None:
-                    self.filter.append(t.filter)
+        self.terms      = None
+        self.coordinates = None
+
+        if isinstance(where, Coordinates):
+            self.coordinates = where.values
+        else:
+            self.terms      = self.generate(where)
+
+            # create the numexpr & the filter
+            if self.terms:
+                conds = [ t.condition for t in self.terms if t.condition is not None ]
+                if len(conds):
+                    self.condition = "(%s)" % ' & '.join(conds)
+                self.filter = []
+                for t in self.terms:
+                    if t.filter is not None:
+                        self.filter.append(t.filter)
 
     def generate(self, where):
         """ where can be a : dict,list,tuple,string """
@@ -2528,13 +2618,17 @@ def select(self):
         """
         if self.condition is not None:
             return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
-        else:
-            return self.table.table.read(start=self.start,stop=self.stop)
+        elif self.coordinates is not None:
+            return self.table.table.readCoordinates(self.coordinates)
+        return self.table.table.read(start=self.start,stop=self.stop)
 
     def select_coords(self):
         """
         generate the selection
         """
+        # no condition to evaluate: return the coordinate of every row
+        # (start/stop are not applied on this path)
+        if self.condition is None:
+            return np.arange(self.table.nrows)
+
         return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort = True)
 
 

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -1365,6 +1365,82 @@ def test_frame_select(self):
         #self.assertRaises(Exception, self.store.select,
         #                  'frame', [crit1, crit2])
 
+    def test_coordinates(self):
+        """ check select_as_coordinates, and select with a Coordinates passed as where """
+        df = tm.makeTimeDataFrame()
+
+        self.store.remove('df')
+        self.store.append('df', df)
+
+        # all: with no where, the coordinate of every row comes back
+        c = self.store.select_as_coordinates('df')
+        assert((c.values == np.arange(len(df.index))).all() == True)
+
+        # get coordinates back & test vs frame
+        self.store.remove('df')
+
+        df = DataFrame(dict(A = range(5), B = range(5)))
+        self.store.append('df', df)
+        c = self.store.select_as_coordinates('df',[ 'index<3' ])
+        assert((c.values == np.arange(3)).all() == True)
+        result = self.store.select('df', where = c)
+        expected = df.ix[0:2,:]
+        tm.assert_frame_equal(result,expected)
+
+        # two terms combined
+        c = self.store.select_as_coordinates('df', [ 'index>=3', 'index<=4' ])
+        assert((c.values == np.arange(2)+3).all() == True)
+        result = self.store.select('df', where = c)
+        expected = df.ix[3:4,:]
+        tm.assert_frame_equal(result,expected)
+
+        # multiple tables: coordinates selected from df1 are reused to read df2
+        self.store.remove('df1')
+        self.store.remove('df2')
+        df1 = tm.makeTimeDataFrame()
+        df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x)
+        self.store.append('df1',df1, columns = ['A','B'])
+        self.store.append('df2',df2)
+
+        c = self.store.select_as_coordinates('df1', [ 'A>0','B>0' ])
+        df1_result = self.store.select('df1',c)
+        df2_result = self.store.select('df2',c)
+        result = concat([ df1_result, df2_result ], axis=1)
+
+        expected = concat([ df1, df2 ], axis=1)
+        expected = expected[(expected.A > 0) & (expected.B > 0)]
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_multiple(self):
+        """ check select_multiple: invalid keys, a single key, multiple keys, and mismatched tables """
+        df1 = tm.makeTimeDataFrame()
+        df2 = tm.makeTimeDataFrame().rename(columns = lambda x: "%s_2" % x)
+        df2['foo'] = 'bar'
+        self.store.append('df1',df1, columns = ['A','B'])
+        self.store.append('df2',df2)
+
+        # exceptions: invalid keys arguments
+        self.assertRaises(Exception, self.store.select_multiple, None, where = [ 'A>0','B>0' ], selector = 'df1')
+        self.assertRaises(Exception, self.store.select_multiple, [ None ], where = [ 'A>0','B>0' ], selector = 'df1')
+
+        # default select: a single key gives the same result as a plain select
+        result = self.store.select('df1', ['A>0','B>0'])
+        expected = self.store.select_multiple([ 'df1' ], where = [ 'A>0','B>0' ], selector = 'df1')
+        tm.assert_frame_equal(result, expected)
+
+        # multiple
+        result = self.store.select_multiple(['df1','df2'], where = [ 'A>0','B>0' ], selector = 'df1')
+        expected = concat([ df1, df2 ], axis=1)
+        expected = expected[(expected.A > 0) & (expected.B > 0)]
+        tm.assert_frame_equal(result, expected)
+
+        # multiple (diff selector)
+        result = self.store.select_multiple(['df1','df2'], where = [ Term('index', '>', df2.index[4]) ], selector = 'df2')
+        expected = concat([ df1, df2 ], axis=1)
+        expected = expected[5:]
+        tm.assert_frame_equal(result, expected)
+
+        # test exception for diff rows (df3 is built with nper=50)
+        self.store.append('df3',tm.makeTimeDataFrame(nper=50))
+        self.assertRaises(Exception, self.store.select_multiple, ['df1','df3'], where = [ 'A>0','B>0' ], selector = 'df1')
+
     def test_start_stop(self):
 
         df = DataFrame(dict(A = np.random.rand(20), B = np.random.rand(20)))
@@ -1374,6 +1450,11 @@ def test_start_stop(self):
         expected = df.ix[0:4,['A']]
         tm.assert_frame_equal(result, expected)
 
+        # out of range
+        result = self.store.select('df', [ Term("columns", "=", ["A"]) ], start=30, stop=40)
+        assert(len(result) == 0)
+        assert(type(result) == DataFrame)
+
     def test_select_filter_corner(self):
         df = DataFrame(np.random.randn(50, 100))
         df.index = ['%.3d' % c for c in df.index]