Commit 44b0fe8

Updated pyarrow dep to 0.7.0

Addressed review comments

dhirschfeld committed Dec 6, 2017
1 parent b4d0bfe commit 44b0fe8

Showing 6 changed files with 86 additions and 150 deletions.
ci/requirements-2.7.sh (2 changes: 1 addition & 1 deletion)

@@ -4,4 +4,4 @@ source activate pandas

echo "install 27"

conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet
ci/requirements-3.5.sh (2 changes: 1 addition & 1 deletion)

@@ -8,4 +8,4 @@ echo "install 35"
conda remove -n pandas python-dateutil --force
pip install python-dateutil

conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
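
Both CI scripts now pin pyarrow at 0.7.0, matching the import-time guard added to pandas/io/parquet.py below. A minimal sketch of that guard pattern, assuming only that pyarrow is importable; everything outside the pinned name and version is illustrative:

from distutils.version import LooseVersion

import pyarrow

if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
    # same message style as the check added in pandas/io/parquet.py
    raise ImportError("pyarrow >= 0.7.0 is required for parquet support")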
doc/source/install.rst (2 changes: 1 addition & 1 deletion)

@@ -233,7 +233,7 @@ Optional Dependencies
* `xarray <http://xarray.pydata.org>`__: pandas-like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
* `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
* `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

* `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
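
The parquet entry above names two interchangeable engines plus optional compression libraries. A minimal usage sketch under those dependencies; 'example.parquet' is a placeholder path:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# either engine works if installed (pyarrow >= 0.7.0 or fastparquet >= 0.1.0);
# snappy is only needed when requesting snappy compression
df.to_parquet('example.parquet', engine='pyarrow', compression='snappy')
result = pd.read_parquet('example.parquet', engine='pyarrow')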
doc/source/whatsnew/v0.22.0.txt (2 changes: 1 addition & 1 deletion)

@@ -163,7 +163,7 @@ Other Enhancements
- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
- :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`).

- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
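
As a hedged sketch of what this enhancement permits (assuming pyarrow >= 0.7.0; 'indexed.parquet' is a placeholder path):

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]},
                  index=pd.date_range('20130101', periods=3))

# previously a non-default index raised ValueError on write; with
# pyarrow >= 0.7.0 the index round-trips via the parquet metadata
df.to_parquet('indexed.parquet', engine='pyarrow')
roundtripped = pd.read_parquet('indexed.parquet', engine='pyarrow')
assert roundtripped.index.equals(df.index)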

.. _whatsnew_0220.api_breaking:

pandas/io/parquet.py (95 changes: 48 additions & 47 deletions)

@@ -4,6 +4,7 @@
from distutils.version import LooseVersion
from pandas import DataFrame, RangeIndex, Int64Index, get_option
from pandas.compat import range
from pandas.core.common import AbstractMethodError
from pandas.io.common import get_filepath_or_buffer


@@ -39,54 +40,30 @@ class BaseImpl(object):
api = None # module

@staticmethod
def _validate_index(df):
if not isinstance(df.index, Int64Index):
msg = (
"parquet does not support serializing {} for the index;"
"you can .reset_index() to make the index into column(s)"
)
raise ValueError(msg.format(type(df.index)))
if not df.index.equals(RangeIndex(len(df))):
raise ValueError(
"parquet does not support serializing a non-default index "
"for the index; you can .reset_index() to make the index "
"into column(s)"
)
if df.index.name is not None:
raise ValueError(
"parquet does not serialize index meta-data "
"on a default index"
)

@staticmethod
def _validate_columns(df):
def validate_dataframe(df):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only support IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {'string', 'unicode'}:
raise ValueError("parquet must have string column names")

def validate_dataframe(self, df):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only support IO with DataFrames")
self._validate_columns(df)
self._validate_index(df)

def write(self, df, path, compression, **kwargs):
raise NotImplementedError()
raise AbstractMethodError(self)

def read(self, path, columns=None, **kwargs):
raise NotImplementedError()
raise AbstractMethodError(self)
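
BaseImpl now raises pandas' AbstractMethodError, which names the concrete subclass in its message, instead of a bare NotImplementedError. A short self-contained sketch of the pattern; DemoBase and DemoImpl are hypothetical names for illustration:

from pandas.core.common import AbstractMethodError

class DemoBase(object):
    def write(self, df, path, compression, **kwargs):
        # subclasses must override this method
        raise AbstractMethodError(self)

class DemoImpl(DemoBase):
    pass

try:
    DemoImpl().write(None, None, None)
except AbstractMethodError as err:
    print(err)  # the message names DemoImpl as the offending class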


class PyArrowImpl(BaseImpl):

def __init__(self):
# since pandas is a dependency of pyarrow
# we need to import on first use

try:
import pyarrow
import pyarrow.parquet
except ImportError:
<<<<<<< HEAD
raise ImportError("pyarrow is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
Expand All @@ -104,32 +81,38 @@ def __init__(self):
self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
=======
raise ImportError(
"pyarrow is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
if LooseVersion(pyarrow.__version__) < '0.7.0':
raise ImportError(
"pyarrow >= 0.4.1 is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
>>>>>>> Updated pyarrow dep to 0.7.0
self.api = pyarrow

def _validate_index(self, df):
# pyarrow >= 0.7.0 supports multi-indexes so no need to validate
if self._pyarrow_lt_070:
super(PyArrowImpl, self)._validate_index(df)

def write(self, df, path, compression='snappy',
coerce_timestamps='ms', **kwargs):
self.validate_dataframe(df)
path, _, _ = get_filepath_or_buffer(path)
if self._pyarrow_lt_060:
table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
self.api.parquet.write_table(
table, path, compression=compression, **kwargs)

else:
table = self.api.Table.from_pandas(df)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
table = self.api.Table.from_pandas(df)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)

def read(self, path, columns=None, **kwargs):
path, _, _ = get_filepath_or_buffer(path)
return self.api.parquet.read_table(path, columns=columns,
**kwargs).to_pandas()
return self.api.parquet.read_table(
path, columns=columns, **kwargs).to_pandas()
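
With the minimum raised to 0.7.0, the version-gated branches in write() collapse into one unconditional path. A minimal sketch of the equivalent calls against pyarrow's public API, assuming pyarrow >= 0.7.0; 'example.parquet' is a placeholder path:

import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({'a': [1, 2, 3]})

# convert and write in a single path, as in the simplified write() above
table = pyarrow.Table.from_pandas(df)
pyarrow.parquet.write_table(table, 'example.parquet',
                            compression='snappy', coerce_timestamps='ms')
df2 = pyarrow.parquet.read_table('example.parquet').to_pandas()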


class FastParquetImpl(BaseImpl):
@@ -140,6 +123,7 @@ def __init__(self):
try:
import fastparquet
except ImportError:
<<<<<<< HEAD
raise ImportError("fastparquet is required for parquet support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
Expand All @@ -154,6 +138,23 @@ def __init__(self):
"\nor via pip\n"
"pip install -U fastparquet")

=======
raise ImportError(
"fastparquet is required for parquet support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet"
)
if LooseVersion(fastparquet.__version__) < '0.1.0':
raise ImportError(
"fastparquet >= 0.1.0 is required for parquet "
"support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet")
>>>>>>> Updated pyarrow dep to 0.7.0
self.api = fastparquet
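
FastParquetImpl gets the same treatment with a 0.1.0 floor. For reference, a hedged sketch of a direct round trip through fastparquet's own API, which the pandas wrapper delegates to; 'example.parq' is a placeholder path and SNAPPY compression additionally requires python-snappy:

import pandas as pd
import fastparquet

df = pd.DataFrame({'a': [1, 2, 3]})

# fastparquet.write / ParquetFile are the calls the wrapper builds on
fastparquet.write('example.parq', df, compression='SNAPPY')
df2 = fastparquet.ParquetFile('example.parq').to_pandas()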

def write(self, df, path, compression='snappy', **kwargs):
pandas/tests/io/test_parquet.py (133 changes: 34 additions & 99 deletions)

@@ -43,6 +43,7 @@ def engine(request):
def pa():
if not _HAVE_PYARROW:
pytest.skip("pyarrow is not installed")
<<<<<<< HEAD
return 'pyarrow'


Expand All @@ -60,6 +61,9 @@ def pa_ge_070():
if not _HAVE_PYARROW:
pytest.skip("pyarrow is not installed")
if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
=======
if LooseVersion(pyarrow.__version__) < '0.7.0':
>>>>>>> Updated pyarrow dep to 0.7.0
pytest.skip("pyarrow is < 0.7.0")
return 'pyarrow'

@@ -68,6 +72,8 @@ def fp():
def fp():
if not _HAVE_FASTPARQUET:
pytest.skip("fastparquet is not installed")
if LooseVersion(fastparquet.__version__) < '0.1.0':
pytest.skip("fastparquet is < 0.1.0")
return 'fastparquet'


@@ -181,9 +187,7 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
class Base(object):

def check_error_on_write(self, df, engine, exc):
# check that we are raising the exception
# on writing

# check that we are raising the exception on writing
with pytest.raises(exc):
with tm.ensure_clean() as path:
to_parquet(df, path, engine, compression=None)
@@ -270,6 +274,32 @@ def test_read_columns(self, engine):
write_kwargs={'compression': None},
read_kwargs={'columns': ['string']})

def test_write_with_index(self, engine):
df = pd.DataFrame({'A': [1, 2, 3]})
self.check_round_trip(df, engine, write_kwargs={'compression': None})

# non-default index
for index in [[2, 3, 4],
pd.date_range('20130101', periods=3),
list('abc'),
[1, 3, 4],
pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
('b', 1)]),
]:

df.index = index
self.check_round_trip(df, engine)

# index with meta-data
df.index = [0, 1, 2]
df.index.name = 'foo'
self.check_round_trip(df, engine)

# column multi-index
df.index = [0, 1, 2]
df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
self.check_error_on_write(df, engine, ValueError)


class TestParquetPyArrow(Base):

@@ -295,14 +325,12 @@ def test_basic(self, pa):
self.check_round_trip(df, pa)

def test_duplicate_columns(self, pa):

# not currently able to handle duplicate columns
df = pd.DataFrame(np.arange(12).reshape(4, 3),
columns=list('aaa')).copy()
self.check_error_on_write(df, pa, ValueError)

def test_unsupported(self, pa):

# period
df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
self.check_error_on_write(df, pa, ValueError)
@@ -316,78 +344,13 @@ def test_unsupported(self, pa):
df = pd.DataFrame({'a': ['a', 1, 2.0]})
self.check_error_on_write(df, pa, ValueError)

def test_categorical(self, pa_ge_070):
pa = pa_ge_070

# supported in >= 0.7.0
def test_categorical(self, pa):
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})

# de-serialized as object
expected = df.assign(a=df.a.astype(object))
self.check_round_trip(df, pa, expected)

def test_categorical_unsupported(self, pa_lt_070):
pa = pa_lt_070

# supported in >= 0.7.0
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
self.check_error_on_write(df, pa, NotImplementedError)

def test_write_with_index(self, pa_lt_070):
engine = pa_lt_070
df = pd.DataFrame({'A': [1, 2, 3]})
self.check_round_trip(df, engine, write_kwargs={'compression': None})

# non-default index
for index in [[2, 3, 4],
pd.date_range('20130101', periods=3),
list('abc'),
[1, 3, 4],
pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
('b', 1)]),
]:

df.index = index
self.check_error_on_write(df, engine, ValueError)

# index with meta-data
df.index = [0, 1, 2]
df.index.name = 'foo'
self.check_error_on_write(df, engine, ValueError)

# column multi-index
df.index = [0, 1, 2]
df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
self.check_error_on_write(df, engine, ValueError)

def test_write_with_non_default_index(self, pa_ge_070):
engine = pa_ge_070

df = pd.DataFrame({'A': [1, 2, 3]})
self.check_round_trip(df, engine, write_kwargs={'compression': None})

# non-default index
for index in [[2, 3, 4],
pd.date_range('20130101', periods=3),
list('abc'),
[1, 3, 4],
pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
('b', 1)]),
]:

df.index = index
self.check_round_trip(df, engine)

# index with meta-data
df.index = [0, 1, 2]
df.index.name = 'foo'
self.check_round_trip(df, engine)

# column multi-index
df.index = [0, 1, 2]
df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
self.check_error_on_write(df, engine, ValueError)


class TestParquetFastParquet(Base):

@@ -413,34 +376,6 @@ def test_basic(self, fp):

self.check_round_trip(df, fp, write_kwargs={'compression': None})

def test_write_with_index(self, fp):
engine = fp

df = pd.DataFrame({'A': [1, 2, 3]})
self.check_round_trip(df, engine, write_kwargs={'compression': None})

# non-default index
for index in [[2, 3, 4],
pd.date_range('20130101', periods=3),
list('abc'),
[1, 3, 4],
pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
('b', 1)]),
]:

df.index = index
self.check_error_on_write(df, engine, ValueError)

# index with meta-data
df.index = [0, 1, 2]
df.index.name = 'foo'
self.check_error_on_write(df, engine, ValueError)

# column multi-index
df.index = [0, 1, 2]
df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
self.check_error_on_write(df, engine, ValueError)

@pytest.mark.skip(reason="not supported")
def test_duplicate_columns(self, fp):

