Updated pyarrow dep to 0.7.0
Addressed review comments
dhirschfeld committed Dec 6, 2017
1 parent 9f16982 commit 1345847
Showing 6 changed files with 75 additions and 196 deletions.
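For context, a minimal sketch (not part of this commit) of the behavior the version bump enables, assuming pandas with pyarrow >= 0.7.0 installed; the file name is illustrative:

    import pandas as pd

    # a DataFrame with a non-default index (named, not the usual RangeIndex)
    df = pd.DataFrame({"a": [1, 2, 3]},
                      index=pd.Index([10, 20, 30], name="key"))

    # previously the pyarrow engine rejected any index other than the
    # default RangeIndex; with pyarrow >= 0.7.0 the index round-trips
    df.to_parquet("example.parquet", engine="pyarrow")
    result = pd.read_parquet("example.parquet", engine="pyarrow")
    assert result.index.name == "key"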
ci/requirements-2.7.sh (1 addition, 1 deletion)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet
ci/requirements-3.5.sh (1 addition, 1 deletion)

@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
doc/source/install.rst (1 addition, 1 deletion)

@@ -233,7 +233,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
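Either parquet engine can use these optional compression libraries; a hedged example (assumes python-snappy/brotlipy are installed, and the file name is illustrative):

    import pandas as pd

    df = pd.DataFrame({"x": list(range(5))})

    # 'snappy' is the default codec; 'gzip' and 'brotli' are also accepted
    # when the corresponding compression library is available
    df.to_parquet("compressed.parquet", engine="pyarrow", compression="brotli")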
doc/source/whatsnew/v0.22.0.txt (1 addition, 1 deletion)

@@ -77,7 +77,7 @@ Other Enhancements
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
 - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
-- Enabled the use of non-default indexes in ``to_parquet`` with pyarrow>=0.7.0 (:issue:`18581`)
+- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
 
 .. _whatsnew_0220.api_breaking:
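The pandas/io/parquet.py changes below all hang off a single minimum-version guard at import time; condensed, the pattern used in the diff is:

    from distutils.version import LooseVersion

    import pyarrow

    # fail fast, with an actionable message, when the installed pyarrow
    # predates the features pandas now relies on
    if LooseVersion(pyarrow.__version__) < '0.7.0':
        raise ImportError("pyarrow >= 0.7.0 is required for parquet support")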
pandas/io/parquet.py (41 additions, 77 deletions)

@@ -4,6 +4,7 @@
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
 from pandas.compat import range
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -39,97 +40,59 @@ class BaseImpl(object):
     api = None  # module
 
     @staticmethod
-    def _validate_index(df):
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "parquet does not support serializing {} for the index;"
-                "you can .reset_index() to make the index into column(s)"
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "parquet does not support serializing a non-default index "
-                "for the index; you can .reset_index() to make the index "
-                "into column(s)"
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "parquet does not serialize index meta-data "
-                "on a default index"
-            )
-
-    @staticmethod
-    def _validate_columns(df):
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only support IO with DataFrames")
         # must have value column names (strings only)
         if df.columns.inferred_type not in {'string', 'unicode'}:
             raise ValueError("parquet must have string column names")
 
-    def validate_dataframe(self, df):
-        if not isinstance(df, DataFrame):
-            raise ValueError("to_parquet only support IO with DataFrames")
-        self._validate_columns(df)
-        self._validate_index(df)
-
     def write(self, df, path, compression, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)
 
     def read(self, path, columns=None, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)
 
 
 class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
 
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
-        self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
-        self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
+            raise ImportError(
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
         self.api = pyarrow
 
-    def _validate_index(self, df):
-        # pyarrow >= 0.7.0 supports multi-indexes so no need to validate
-        if self._pyarrow_lt_070:
-            super(PyArrowImpl, self)._validate_index(df)
-
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
         self.validate_dataframe(df)
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
+        return self.api.parquet.read_table(
+            path, columns=columns, **kwargs).to_pandas()
 
 
 class FastParquetImpl(BaseImpl):
@@ -140,20 +103,21 @@ def __init__(self):
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         if LooseVersion(fastparquet.__version__) < '0.1.0':
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet")
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
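The refactor also swaps bare NotImplementedError for pandas' internal AbstractMethodError helper, whose message names the concrete class that failed to override the method. A sketch of the pattern (BrokenImpl is a hypothetical subclass, not part of the commit):

    from pandas.core.common import AbstractMethodError

    class BaseImpl(object):
        def write(self, df, path, compression, **kwargs):
            # passing `self` lets the error message name the concrete
            # subclass, unlike a bare NotImplementedError
            raise AbstractMethodError(self)

    class BrokenImpl(BaseImpl):
        pass  # write() deliberately not overridden

    try:
        BrokenImpl().write(None, "out.parquet", None)
    except Exception as exc:
        print(type(exc).__name__, exc)  # the message names BrokenImpl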
