Updated pyarrow dep to 0.7.0
Addressed review comments
dhirschfeld committed Dec 6, 2017
1 parent 9f16982 commit 1345847
Showing 6 changed files with 75 additions and 196 deletions.
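For context, a minimal sketch (not part of this commit) of the behavior the version bump enables, assuming pandas with pyarrow >= 0.7.0 installed; the file name is illustrative:

    import pandas as pd

    # a DataFrame with a non-default index (named, not the usual RangeIndex)
    df = pd.DataFrame({"a": [1, 2, 3]},
                      index=pd.Index([10, 20, 30], name="key"))

    # previously the pyarrow engine rejected any index other than the
    # default RangeIndex; with pyarrow >= 0.7.0 the index round-trips
    df.to_parquet("example.parquet", engine="pyarrow")
    result = pd.read_parquet("example.parquet", engine="pyarrow")
    assert result.index.name == "key"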
ci/requirements-2.7.sh (1 addition, 1 deletion)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet
ci/requirements-3.5.sh (1 addition, 1 deletion)

@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
doc/source/install.rst (1 addition, 1 deletion)

@@ -233,7 +233,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
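Either parquet engine can use these optional compression libraries; a hedged example (assumes python-snappy/brotlipy are installed, and the file name is illustrative):

    import pandas as pd

    df = pd.DataFrame({"x": list(range(5))})

    # 'snappy' is the default codec; 'gzip' and 'brotli' are also accepted
    # when the corresponding compression library is available
    df.to_parquet("compressed.parquet", engine="pyarrow", compression="brotli")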
doc/source/whatsnew/v0.22.0.txt (1 addition, 1 deletion)

@@ -77,7 +77,7 @@ Other Enhancements
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
 - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
-- Enabled the use of non-default indexes in ``to_parquet`` with pyarrow>=0.7.0 (:issue:`18581`)
+- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
 
 .. _whatsnew_0220.api_breaking:
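The pandas/io/parquet.py changes below all hang off a single minimum-version guard at import time; condensed, the pattern used in the diff is:

    from distutils.version import LooseVersion

    import pyarrow

    # fail fast, with an actionable message, when the installed pyarrow
    # predates the features pandas now relies on
    if LooseVersion(pyarrow.__version__) < '0.7.0':
        raise ImportError("pyarrow >= 0.7.0 is required for parquet support")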
pandas/io/parquet.py (41 additions, 77 deletions)

@@ -4,6 +4,7 @@
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
 from pandas.compat import range
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -39,97 +40,59 @@ class BaseImpl(object):
     api = None  # module
 
     @staticmethod
-    def _validate_index(df):
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "parquet does not support serializing {} for the index;"
-                "you can .reset_index() to make the index into column(s)"
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "parquet does not support serializing a non-default index "
-                "for the index; you can .reset_index() to make the index "
-                "into column(s)"
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "parquet does not serialize index meta-data "
-                "on a default index"
-            )
-
-    @staticmethod
-    def _validate_columns(df):
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only support IO with DataFrames")
         # must have value column names (strings only)
         if df.columns.inferred_type not in {'string', 'unicode'}:
             raise ValueError("parquet must have string column names")
 
-    def validate_dataframe(self, df):
-        if not isinstance(df, DataFrame):
-            raise ValueError("to_parquet only support IO with DataFrames")
-        self._validate_columns(df)
-        self._validate_index(df)
-
     def write(self, df, path, compression, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)
 
     def read(self, path, columns=None, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)
 
 
 class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
 
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
-        self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
-        self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
+            raise ImportError(
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
         self.api = pyarrow
 
-    def _validate_index(self, df):
-        # pyarrow >= 0.7.0 supports multi-indexes so no need to validate
-        if self._pyarrow_lt_070:
-            super(PyArrowImpl, self)._validate_index(df)
-
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
         self.validate_dataframe(df)
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
+        return self.api.parquet.read_table(
+            path, columns=columns, **kwargs).to_pandas()
 
 
 class FastParquetImpl(BaseImpl):
@@ -140,20 +103,21 @@ def __init__(self):
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         if LooseVersion(fastparquet.__version__) < '0.1.0':
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet")
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
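The refactor also swaps bare NotImplementedError for pandas' internal AbstractMethodError helper, whose message names the concrete class that failed to override the method. A sketch of the pattern (BrokenImpl is a hypothetical subclass, not part of the commit):

    from pandas.core.common import AbstractMethodError

    class BaseImpl(object):
        def write(self, df, path, compression, **kwargs):
            # passing `self` lets the error message name the concrete
            # subclass, unlike a bare NotImplementedError
            raise AbstractMethodError(self)

    class BrokenImpl(BaseImpl):
        pass  # write() deliberately not overridden

    try:
        BrokenImpl().write(None, "out.parquet", None)
    except Exception as exc:
        print(type(exc).__name__, exc)  # the message names BrokenImpl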
