ARROW-369: [Python] Convert multiple record batches at once to Pandas #216

Closed
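
For orientation: the new entry point added here, pyarrow.dataframe_from_batches, converts a list of RecordBatches that share a schema into a single pandas.DataFrame in one step. A minimal usage sketch, mirroring the test added below (the data values are illustrative):

import numpy as np
import pandas as pd
import pyarrow as pa

data1 = pd.DataFrame({'c1': np.array([1, 1, 2], dtype='uint32')})
data2 = pd.DataFrame({'c1': np.array([3, 5], dtype='uint32')})

batch1 = pa.RecordBatch.from_pandas(data1)
batch2 = pa.RecordBatch.from_pandas(data2)

# All batches must have equal schemas, otherwise ArrowException is raised.
df = pa.dataframe_from_batches([batch1, batch2])

# The result is expected to match concatenating the per-batch conversions,
# which is what the new unit test below asserts with assert_frame_equal:
# pd.concat([data1, data2], ignore_index=True)
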
4 changes: 3 additions & 1 deletion python/pyarrow/__init__.py
@@ -41,5 +41,7 @@
                            list_, struct, field,
                            DataType, Field, Schema, schema)

-from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe
+from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table,
+                           from_pandas_dataframe)

from pyarrow.version import version as __version__
3 changes: 3 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -158,6 +158,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
        CColumn(const shared_ptr[CField]& field,
                const shared_ptr[CArray]& data)

        CColumn(const shared_ptr[CField]& field,
                const vector[shared_ptr[CArray]]& chunks)

        int64_t length()
        int64_t null_count()
        const c_string& name()
47 changes: 47 additions & 0 deletions python/pyarrow/table.pyx
@@ -28,6 +28,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
import pyarrow.config

from pyarrow.array cimport Array, box_arrow_array
from pyarrow.error import ArrowException
from pyarrow.error cimport check_status
from pyarrow.schema cimport box_data_type, box_schema

@@ -414,6 +415,52 @@ cdef class RecordBatch:
        return result


def dataframe_from_batches(batches):
    """
    Convert a list of Arrow RecordBatches to a pandas.DataFrame

    Parameters
    ----------
    batches : list of RecordBatch
        RecordBatch list to be converted; all schemas must be equal
    """

    cdef:
        vector[shared_ptr[CArray]] c_array_chunks
        vector[shared_ptr[CColumn]] c_columns
        shared_ptr[CTable] c_table
        Array arr
        Schema schema

    import pandas as pd

    schema = batches[0].schema

    # check schemas are equal
    if any(not schema.equals(other.schema) for other in batches[1:]):
        raise ArrowException("Error converting list of RecordBatches to "
                             "DataFrame, not all schemas are equal")
Review comment (Member): Later we'll want to display the mismatched schemas in the error message, but this is ok for now.

Reply (Member Author): Yeah, I should have added that. I'll make a note to do that later.
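
A minimal sketch of that follow-up (hypothetical, not part of this diff): the check could be written as a loop that names the first batch whose schema differs, assuming Schema objects have a readable str() form.

# Hypothetical follow-up, not part of this PR: report which schema differs.
for i, other in enumerate(batches[1:], start=1):
    if not schema.equals(other.schema):
        raise ArrowException("Error converting list of RecordBatches to "
                             "DataFrame, schema of batch %d does not match "
                             "batch 0:\n%s\nvs.\n%s"
                             % (i, other.schema, schema))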


    cdef int K = batches[0].num_columns

    # create chunked columns from the batches
    c_columns.resize(K)
    for i in range(K):
        for batch in batches:
            arr = batch[i]
            c_array_chunks.push_back(arr.sp_array)
        c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
                                       c_array_chunks))
        c_array_chunks.clear()

    # create a Table from columns and convert to DataFrame
    c_table.reset(new CTable('', schema.sp_schema, c_columns))
    table = Table()
    table.init(c_table)
    return table.to_pandas()


cdef class Table:
    """
    A collection of top-level named, equal length Arrow arrays.
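
To make the chunk-gathering loop in dataframe_from_batches concrete: the outer loop walks column indices and the inner loop walks batches, so column i of the resulting Table holds one chunk per input batch, in batch order. A plain-Python stand-in for that gathering order (illustrative only, not pyarrow internals):

# Stand-in dicts of lists take the place of RecordBatches and Array chunks.
batches = [
    {'c1': [1, 1, 2], 'c2': [1.0, 2.0, 3.0]},   # "RecordBatch" 1
    {'c1': [3, 5],    'c2': [4.0, 5.0]},        # "RecordBatch" 2
]
names = list(batches[0])
columns = {name: [batch[name] for batch in batches] for name in names}
# columns == {'c1': [[1, 1, 2], [3, 5]], 'c2': [[1.0, 2.0, 3.0], [4.0, 5.0]]}
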
35 changes: 35 additions & 0 deletions python/pyarrow/tests/test_table.py
@@ -19,6 +19,7 @@

from pandas.util.testing import assert_frame_equal
import pandas as pd
import pytest

import pyarrow as pa

@@ -50,6 +51,40 @@ def test_recordbatch_from_to_pandas():
    assert_frame_equal(data, result)


def test_recordbatchlist_to_pandas():
    data1 = pd.DataFrame({
        'c1': np.array([1, 1, 2], dtype='uint32'),
        'c2': np.array([1.0, 2.0, 3.0], dtype='float64'),
        'c3': [True, None, False],
        'c4': ['foo', 'bar', None]
    })

    data2 = pd.DataFrame({
        'c1': np.array([3, 5], dtype='uint32'),
        'c2': np.array([4.0, 5.0], dtype='float64'),
        'c3': [True, True],
        'c4': ['baz', 'qux']
    })

    batch1 = pa.RecordBatch.from_pandas(data1)
    batch2 = pa.RecordBatch.from_pandas(data2)

    result = pa.dataframe_from_batches([batch1, batch2])
    data = pd.concat([data1, data2], ignore_index=True)
    assert_frame_equal(data, result)


def test_recordbatchlist_schema_equals():
    data1 = pd.DataFrame({'c1': np.array([1], dtype='uint32')})
    data2 = pd.DataFrame({'c1': np.array([4.0, 5.0], dtype='float64')})

    batch1 = pa.RecordBatch.from_pandas(data1)
    batch2 = pa.RecordBatch.from_pandas(data2)

    with pytest.raises(pa.ArrowException):
        pa.dataframe_from_batches([batch1, batch2])


def test_table_basics():
    data = [
        pa.from_pylist(range(5)),