ENH: add to/from_parquet with pyarrow & fastparquet (#15838)
jreback committed Aug 2, 2017
1 parent 3ed51c2 commit f433061
Showing 20 changed files with 703 additions and 12 deletions.
1 change: 1 addition & 0 deletions ci/install_travis.sh
@@ -153,6 +153,7 @@ fi
echo
echo "[removing installed pandas]"
conda remove pandas -y --force
pip uninstall -y pandas

if [ "$BUILD_TEST" ]; then

2 changes: 1 addition & 1 deletion ci/requirements-2.7.sh
@@ -4,4 +4,4 @@ source activate pandas

echo "install 27"

conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1
conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
4 changes: 2 additions & 2 deletions ci/requirements-3.5.sh
@@ -4,8 +4,8 @@ source activate pandas

echo "install 35"

conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1

# pip install python-dateutil to get latest
conda remove -n pandas python-dateutil --force
pip install python-dateutil

conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1
2 changes: 1 addition & 1 deletion ci/requirements-3.5_OSX.sh
@@ -4,4 +4,4 @@ source activate pandas

echo "install 35_OSX"

conda install -n pandas -c conda-forge feather-format==0.3.1
conda install -n pandas -c conda-forge feather-format==0.3.1 fastparquet
1 change: 1 addition & 0 deletions ci/requirements-3.6.pip
@@ -0,0 +1 @@
brotlipy
2 changes: 2 additions & 0 deletions ci/requirements-3.6.run
@@ -17,6 +17,8 @@ pymysql
feather-format
pyarrow
psycopg2
python-snappy
fastparquet
beautifulsoup4
s3fs
xarray
2 changes: 1 addition & 1 deletion ci/requirements-3.6_DOC.sh
@@ -6,6 +6,6 @@ echo "[install DOC_BUILD deps]"

pip install pandas-gbq

conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc
conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc fastparquet

conda install -n pandas -c r r rpy2 --yes
2 changes: 2 additions & 0 deletions ci/requirements-3.6_WIN.run
@@ -13,3 +13,5 @@ numexpr
pytables
matplotlib
blosc
fastparquet
pyarrow
1 change: 1 addition & 0 deletions doc/source/install.rst
@@ -237,6 +237,7 @@ Optional Dependencies
* `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
* `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
* ``Apache Parquet Format``, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.0.6), necessary for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
* `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

* `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
82 changes: 78 additions & 4 deletions doc/source/io.rst
@@ -43,6 +43,7 @@ object. The corresponding ``writer`` functions are object methods that are acces
binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
binary;`Msgpack <http://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
@@ -209,7 +210,7 @@ buffer_lines : int, default None
.. deprecated:: 0.19.0

Argument removed because its value is not respected by the parser

compact_ints : boolean, default False
.. deprecated:: 0.19.0

@@ -4087,7 +4088,7 @@ control compression: ``complevel`` and ``complib``.
``complevel`` specifies if and how hard data is to be compressed.
``complevel=0`` and ``complevel=None`` disables
compression and ``0<complevel<10`` enables compression.

``complib`` specifies which compression library to use. If nothing is
specified the default library ``zlib`` is used. A
compression library usually optimizes for either good
@@ -4102,9 +4103,9 @@ control compression: ``complevel`` and ``complib``.
- `blosc <http://www.blosc.org/>`_: Fast compression and decompression.

.. versionadded:: 0.20.2

Support for alternative blosc compressors:

- `blosc:blosclz <http://www.blosc.org/>`_ This is the
default compressor for ``blosc``
- `blosc:lz4
@@ -4545,6 +4546,79 @@ Read from a feather file.
   import os
   os.remove('example.feather')

.. _io.parquet:

Parquet
-------

.. versionadded:: 0.21.0

`Parquet <https://parquet.apache.org/>`__ provides a partitioned binary columnar serialization for data frames. It is designed to
make reading and writing data frames efficient, and to make sharing data across data analysis
languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible
while still maintaining good read performance.

Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, supporting all of the pandas
dtypes, including extension dtypes such as datetime with tz.

Several caveats:

- The format will NOT write an ``Index`` or ``MultiIndex`` for the ``DataFrame``, and will raise an
  error if a non-default one is provided. You can call ``.reset_index(drop=True)`` to drop the index before storing.
- Duplicate column names and non-string column names are not supported.
- Categorical dtypes are currently not supported (by ``pyarrow``).
- Unsupported types include ``Period`` and actual python object dtypes; these will raise a helpful error message
  on an attempt at serialization.
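The index caveat above can be handled by flattening the index before writing; a minimal sketch using only pandas itself (no parquet engine required):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]}, index=['x', 'y', 'z'])

# A non-default index would raise on to_parquet, so either keep it
# as an ordinary column ...
flat = df.reset_index()              # old index becomes the 'index' column
# ... or discard it entirely before writing.
dropped = df.reset_index(drop=True)  # a default RangeIndex remains
```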

You can specify an ``engine`` to direct the serialization: one of ``pyarrow``, ``fastparquet``, or ``auto``.
If the engine is not specified, the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
then ``pyarrow`` is tried first, falling back to ``fastparquet``.
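The selection rule reads, in effect, like the following sketch (a hypothetical helper; the actual logic lives in ``pandas/io/parquet.py``):

```python
def resolve_parquet_engine(engine='auto', option='auto'):
    """Sketch of the engine-resolution rule described above."""
    # An explicitly requested engine always wins.
    if engine != 'auto':
        return engine
    # Otherwise defer to the pd.options.io.parquet.engine setting.
    if option != 'auto':
        return option
    # Both are 'auto': try pyarrow first, then fall back to fastparquet.
    for candidate in ('pyarrow', 'fastparquet'):
        try:
            __import__(candidate)
            return candidate
        except ImportError:
            continue
    raise ImportError("unable to find a usable parquet engine; "
                      "install pyarrow or fastparquet")
```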

See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__.

.. note::

   These engines are very similar and should read/write nearly identical parquet format files.
   The libraries differ in their underlying dependencies (``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library).

.. ipython:: python

   df = pd.DataFrame({'a': list('abc'),
                      'b': list(range(1, 4)),
                      'c': np.arange(3, 6).astype('u1'),
                      'd': np.arange(4.0, 7.0, dtype='float64'),
                      'e': [True, False, True],
                      'f': pd.date_range('20130101', periods=3),
                      'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
                      'h': pd.date_range('20130101', periods=3, freq='ns')})
   df
   df.dtypes

Write to a parquet file.

.. ipython:: python

   df.to_parquet('example_pa.parquet', engine='pyarrow')
   df.to_parquet('example_fp.parquet', engine='fastparquet')

Read from a parquet file.

.. ipython:: python

   result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
   result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
   result.dtypes

.. ipython:: python
   :suppress:

   import os
   os.remove('example_pa.parquet')
   os.remove('example_fp.parquet')
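A round trip with an explicit ``compression`` choice can be sketched as follows (guarded, since the example assumes a ``pyarrow`` install):

```python
import os
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [1.0, 2.0, 3.0]})
try:
    # compression may be 'snappy' (the default), 'gzip' or 'brotli'
    df.to_parquet('example_gzip.parquet', engine='pyarrow',
                  compression='gzip')
    result = pd.read_parquet('example_gzip.parquet', engine='pyarrow')
    os.remove('example_gzip.parquet')
except ImportError:
    result = df  # pyarrow not installed; nothing was written
```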
.. _io.sql:

SQL Queries
3 changes: 3 additions & 0 deletions doc/source/options.rst
@@ -414,6 +414,9 @@ io.hdf.default_format None default format writing format,
'table'
io.hdf.dropna_table True drop ALL nan rows when appending
to a table
io.parquet.engine None The engine to use as a default for
parquet reading and writing. If None
then try 'pyarrow' and 'fastparquet'
mode.chained_assignment warn Raise an exception, warn, or no
action if trying to use chained
assignment, The default is warn
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
@@ -78,6 +78,7 @@ Other Enhancements
- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
- Integration with Apache Parquet, including a new top-level ``pd.read_parquet()`` function and a ``DataFrame.to_parquet()`` method, see :ref:`here <io.parquet>`.

.. _whatsnew_0210.api_breaking:

12 changes: 12 additions & 0 deletions pandas/core/config_init.py
@@ -465,3 +465,15 @@ def _register_xlsx(engine, other):
except ImportError:
# fallback
_register_xlsx('openpyxl', 'xlsxwriter')

# Set up the io.parquet specific configuration.
parquet_engine_doc = """
: string
The default parquet reader/writer engine. Available options:
'auto', 'pyarrow', 'fastparquet', the default is 'auto'
"""

with cf.config_prefix('io.parquet'):
cf.register_option(
'engine', 'auto', parquet_engine_doc,
validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet']))
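Once registered, the option behaves like any other pandas option, with the ``is_one_of_factory`` validator rejecting unknown engines; a short sketch:

```python
import pandas as pd

pd.set_option('io.parquet.engine', 'fastparquet')
engine = pd.get_option('io.parquet.engine')

# The validator rejects anything outside the allowed set.
try:
    pd.set_option('io.parquet.engine', 'not-an-engine')
    rejected = False
except ValueError:
    rejected = True

pd.set_option('io.parquet.engine', 'auto')  # restore the default
```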
24 changes: 24 additions & 0 deletions pandas/core/frame.py
@@ -1598,6 +1598,30 @@ def to_feather(self, fname):
from pandas.io.feather_format import to_feather
to_feather(self, fname)

    def to_parquet(self, fname, engine='auto', compression='snappy',
                   **kwargs):
        """
        Write a DataFrame to the binary parquet format.

        .. versionadded:: 0.21.0

        Parameters
        ----------
        fname : str
            string file path
        engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
            Parquet library to use. If 'auto', the
            ``io.parquet.engine`` option is consulted; if that is also
            'auto', the first of 'pyarrow' and 'fastparquet' to be
            installed is used.
        compression : str, optional, default 'snappy'
            compression method, one of {'gzip', 'snappy', 'brotli'}
        kwargs
            Additional keyword arguments passed to the engine
        """
        from pandas.io.parquet import to_parquet
        to_parquet(self, fname, engine,
                   compression=compression, **kwargs)

@Substitution(header='Write out column names. If a list of string is given, \
it is assumed to be aliases for the column names')
@Appender(fmt.docstring_to_string, indents=1)
1 change: 1 addition & 0 deletions pandas/io/api.py
@@ -13,6 +13,7 @@
from pandas.io.sql import read_sql, read_sql_table, read_sql_query
from pandas.io.sas import read_sas
from pandas.io.feather_format import read_feather
from pandas.io.parquet import read_parquet
from pandas.io.stata import read_stata
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.packers import read_msgpack, to_msgpack
4 changes: 2 additions & 2 deletions pandas/io/feather_format.py
@@ -19,7 +19,7 @@ def _try_import():
"you can install via conda\n"
"conda install feather-format -c conda-forge\n"
"or via pip\n"
"pip install feather-format\n")
"pip install -U feather-format\n")

try:
feather.__version__ >= LooseVersion('0.3.1')
@@ -29,7 +29,7 @@ def _try_import():
"you can install via conda\n"
"conda install feather-format -c conda-forge"
"or via pip\n"
"pip install feather-format\n")
"pip install -U feather-format\n")

return feather

