No longer confuse batching/caching; add Table.scan(scan_batching=...)
For details, see the comments added in this commit, and issues #54 and #56.
wbolster committed Feb 25, 2014
1 parent da109ab commit 106dcf0
Showing 3 changed files with 49 additions and 11 deletions.
8 changes: 8 additions & 0 deletions NEWS.rst
@@ -18,6 +18,14 @@ Release date: *not yet released*
Python 2.6 a separate ``ordereddict`` package has to be installed from PyPI.
(`issue #39 <https://github.com/wbolster/happybase/issues/39>`_)

* The `batch_size` argument to :py:meth:`Table.scan` is no longer propagated to
`Scan.setBatching()` at the Java side (inside the Thrift server). To influence
the `Scan.setBatching()` behaviour (which may split rows into partial rows), a new
`scan_batching` argument to :py:meth:`Table.scan` has been added. See `issue
#54 <https://github.com/wbolster/happybase/issues/54>`_, `issue #56
<https://github.com/wbolster/happybase/issues/56>`_, and the HBase docs for
`Scan.setBatching()` for more details; a usage sketch follows below.


HappyBase 0.7
-------------
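To illustrate the change described in the NEWS entry above, here is a minimal usage sketch; the connection settings and the table name `mytable` are illustrative assumptions, not part of this commit:

```python
import happybase

connection = happybase.Connection('localhost')  # assumed Thrift server host
table = connection.table('mytable')             # hypothetical table name

# batch_size only tunes how many whole rows are fetched per round-trip
# (Scan.setCaching() on the Java side); rows are never split.
for key, data in table.scan(batch_size=100):
    print key, data

# scan_batching translates to Scan.setBatching() on the Java side and may
# split wide rows, so iteration can yield partial (incomplete) rows.
for key, data in table.scan(scan_batching=10):
    print key, data
```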
45 changes: 34 additions & 11 deletions happybase/table.py
@@ -214,8 +214,8 @@ def cells(self, row, column, versions=None, timestamp=None,

def scan(self, row_start=None, row_stop=None, row_prefix=None,
columns=None, filter=None, timestamp=None,
include_timestamp=False, batch_size=1000, limit=None,
sorted_columns=False):
include_timestamp=False, batch_size=1000, scan_batching=None,
limit=None, sorted_columns=False):
"""Create a scanner for data in the table.
This method returns an iterable that can be used for looping over the
@@ -245,15 +245,22 @@ def scan(self, row_start=None, row_stop=None, row_prefix=None,
If `limit` is given, at most `limit` results will be returned.
If `sorted_columns` is `True`, the columns in the rows returned
by this scanner will be retrieved in sorted order, and the data
will be stored in `OrderedDict` instances.
The `batch_size` argument specifies how many results should be
retrieved per batch when retrieving results from the scanner. Only set
this to a low value (or even 1) if your data is large, since a low
batch size results in added round-trips to the server.
The optional `scan_batching` argument is for advanced usage only; it
translates to `Scan.setBatching()` at the Java side (inside the
Thrift server). Setting this value may cause rows to be split into
partial rows, so result rows may be incomplete, and the number of
results returned by the scanner may no longer correspond to the
number of rows matched by the scan.
If `sorted_columns` is `True`, the columns in the rows returned
by this scanner will be retrieved in sorted order, and the data
will be stored in `OrderedDict` instances.
**Compatibility notes:**
* The `filter` argument is only available when using HBase 0.92
@@ -274,6 +281,7 @@ def scan(self, row_start=None, row_stop=None, row_prefix=None,
:param int timestamp: timestamp (optional)
:param bool include_timestamp: whether timestamps are returned
:param int batch_size: batch size for retrieving results
:param int scan_batching: server-side scan batching (optional)
:param int limit: max number of rows to return
:param bool sorted_columns: whether to return sorted columns
@@ -327,18 +335,33 @@ def scan(self, row_start=None, row_stop=None, row_prefix=None,
self.name, row_start, row_stop, columns, timestamp, {})

else:
# The scan's caching size is set to the batch_size, so that
# the HTable on the Java side retrieves rows from the region
# servers in the same chunk sizes that it sends out over
# Thrift.
# XXX: The "batch_size" can be slightly confusing to those
# familiar with the HBase Java API:
#
# * TScan.caching (Thrift API) translates to
# Scan.setCaching() (Java API)
#
# * TScan.batchSize (Thrift API) translates to
# Scan.setBatching() (Java API).
#
# However, we set Scan.setCaching() to what is called
# batch_size in the HappyBase API, so that the HTable on the
# Java side (inside the Thrift server) retrieves rows from
# the region servers in the same chunk sizes that it sends
# out again to Python (over Thrift). This cannot be tweaked
# (by design).
#
# The Scan.setBatching() value (Java API), which possibly
# cuts rows into multiple partial rows, can be set using the
# slightly strange name scan_batching.
scan = TScan(
startRow=row_start,
stopRow=row_stop,
timestamp=timestamp,
columns=columns,
caching=batch_size,
filterString=filter,
batchSize=batch_size,
batchSize=scan_batching,
sortColumns=sorted_columns,
)
scan_id = self.connection.client.scannerOpenWithScan(
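As a sketch of the mapping described in the comment above: the Thrift-side `TScan` object ends up carrying both values. The import path for happybase's bundled Thrift bindings is an assumption here:

```python
from happybase.hbase.ttypes import TScan  # generated Thrift bindings (path assumed)

# TScan.caching   -> Scan.setCaching()   (chunk size between Thrift server and
#                                         region servers; rows stay whole)
# TScan.batchSize -> Scan.setBatching()  (may split rows into partial rows)
scan = TScan(
    caching=1000,    # filled from HappyBase's batch_size argument
    batchSize=None,  # filled from scan_batching; None leaves it unset
)
```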
7 changes: 7 additions & 0 deletions tests/test_api.py
@@ -446,6 +446,13 @@ def test_scan_sorting():
row.items())


def test_scan_filter_and_batch_size():
    # Regression test for issues #54 and #56: scanning with a filter must
    # work regardless of how batch size is handled; just iterate.
    filter = "SingleColumnValueFilter ('cf1', 'qual1', =, 'binary:val1')"
    for k, v in table.scan(filter=filter):
        print v


def test_delete():
row_key = 'row-test-delete'
data = {'cf1:col1': 'v1',
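A companion test for the new argument could look like the following hypothetical sketch (not part of this commit); it merely exercises the code path, since partial-row behaviour depends on the table's data:

```python
def test_scan_batching():
    # Hypothetical: exercise the new scan_batching code path (issues #54/#56).
    # Server-side batching may return wide rows as partial rows, so this
    # only checks that iteration completes without errors.
    for k, v in table.scan(scan_batching=5):
        print v
```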
