From 4bf7f564b19a5287848c44ba882be17b821f0ccd Mon Sep 17 00:00:00 2001
From: dhirschfeld
Date: Sat, 9 Dec 2017 17:33:12 +1000
Subject: [PATCH 01/12] Allow non-default indexes in to_parquet.

...when supported by the underlying engine.

Fixes #18581
---
 ci/requirements-2.7.sh          |   2 +-
 ci/requirements-3.5.sh          |   2 +-
 doc/source/install.rst          |   2 +-
 doc/source/io.rst               |   5 +-
 doc/source/whatsnew/v0.22.0.txt |   2 +-
 pandas/io/parquet.py            | 212 +++++++++++++++++++-------------
 pandas/tests/io/test_parquet.py | 110 ++++++++---------
 7 files changed, 181 insertions(+), 154 deletions(-)

diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh
index e3bd5e46026c5..9e59f3e5eeb55 100644
--- a/ci/requirements-2.7.sh
+++ b/ci/requirements-2.7.sh
@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet

diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh
index d694ad3679ac1..a1ed467b36ff9 100644
--- a/ci/requirements-3.5.sh
+++ b/ci/requirements-3.5.sh
@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0

diff --git a/doc/source/install.rst b/doc/source/install.rst
index 979d5afd0a04f..641a07ed22810 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -229,7 +229,7 @@ Optional Dependencies
 * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support.
+* `Apache Parquet `__, either `pyarrow `__ (>= 0.7.0) or `fastparquet `__ (>= 0.1.0) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support.
 * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are:
 
   * `psycopg2 `__: for PostgreSQL

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 65205c57a1ab6..5568f4b6235f9 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4504,11 +4504,8 @@ dtypes, including extension dtypes such as datetime with tz.
 
 Several caveats.
 
-- The format will NOT write an ``Index``, or ``MultiIndex`` for the
-  ``DataFrame`` and will raise an error if a non-default one is provided. You
-  can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to
-  ignore it.
 - Duplicate column names and non-string column names are not supported
+- Index level names, if specified, must be strings
 - Categorical dtypes can be serialized to parquet, but will de-serialize as
   ``object`` dtype.
 - Unsupported types include ``Period`` and actual Python object types. These will
   raise a helpful error message on an attempt at serialization.
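For illustration, a minimal sketch of the round trip this patch enables
(assumes pyarrow >= 0.7.0 is installed; the file name and the index name
'ts' are arbitrary examples, not part of the patch):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]},
                      index=pd.date_range('20130101', periods=3, name='ts'))
    df.to_parquet('example.parquet', engine='pyarrow')
    result = pd.read_parquet('example.parquet', engine='pyarrow')
    # the non-default index and its name survive the round trip
    pd.testing.assert_frame_equal(result, df)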
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index c3a0e3599a0f9..baa5bd3be862a 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -326,4 +326,4 @@ Other
 ^^^^^
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
--
+-
\ No newline at end of file

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 7827c3ae04d4d..a2733c53eb9cc 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -3,7 +3,8 @@
 from warnings import catch_warnings
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
-from pandas.compat import range
+from pandas.compat import range, string_types
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -34,82 +35,152 @@ def get_engine(engine):
     return FastParquetImpl()
 
 
-class PyArrowImpl(object):
+class BaseImpl(object):
+
+    api = None  # module
+
+    @staticmethod
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
+        # must have value column names (strings only)
+        if df.columns.inferred_type not in {'string', 'unicode'}:
+            raise ValueError("parquet must have string column names")
+        # index level names must be strings
+        valid_names = all(
+            isinstance(name, string_types)
+            for name in df.index.names
+            if name is not None
+        )
+        if not valid_names:
+            raise ValueError("Index level names must be strings")
+
+    def write(self, df, path, compression, **kwargs):
+        raise AbstractMethodError(self)
+
+    def read(self, path, columns=None, **kwargs):
+        raise AbstractMethodError(self)
+
+
+class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
-
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'):
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.5.0'))
-        self._pyarrow_lt_060 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.6.0'))
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.4.1':
+            raise ImportError(
+                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        self._pyarrow_lt_070 = (
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')
+        )
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
+        self.validate_dataframe(df)
+        if self._pyarrow_lt_070:
+            self._validate_write_lt_070(
+                df, path, compression, coerce_timestamps, **kwargs
+            )
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
-
-
-class FastParquetImpl(object):
+        parquet_file = self.api.parquet.ParquetFile(path)
+        if self._pyarrow_lt_070:
+            parquet_file.path = path
+            return self._read_lt_070(parquet_file, columns, **kwargs)
+        kwargs['use_pandas_metadata'] = True
+        return parquet_file.read(columns=columns, **kwargs).to_pandas()
+
+
+    def _validate_write_lt_070(self, df, path, compression='snappy',
+                              coerce_timestamps='ms', **kwargs):
+        # Compatibility shim for pyarrow < 0.7.0
+        # TODO: Remove in pandas 0.22.0
+        from pandas.core.indexes.multi import MultiIndex
+        if isinstance(df.index, MultiIndex):
+            msg = "Multi-index DataFrames are only supported with pyarrow >= 0.7.0"
+            raise ValueError(msg)
+        # Validate index
+        if not isinstance(df.index, Int64Index):
+            msg = (
+                "parquet does not support serializing {} for the index; "
+                "you can .reset_index() to make the index into column(s)"
+            )
+            raise ValueError(msg.format(type(df.index)))
+        if not df.index.equals(RangeIndex(len(df))):
+            raise ValueError(
+                "parquet does not support serializing a non-default index "
+                "for the index; you can .reset_index() to make the index "
+                "into column(s)"
+            )
+        if df.index.name is not None:
+            raise ValueError(
+                "parquet does not serialize index meta-data "
+                "on a default index"
+            )
+
+    def _read_lt_070(self, parquet_file, columns, **kwargs):
+        # Compatibility shim for pyarrow < 0.7.0
+        # TODO: Remove in pandas 0.22.0
+        from itertools import chain
+        import json
+        if columns is not None:
+            metadata = json.loads(parquet_file.metadata.metadata[b'pandas'])
+            columns = set(chain(columns, metadata['index_columns']))
+        kwargs['columns'] = columns
+        return self.api.parquet.read_table(parquet_file.path, **kwargs).to_pandas()
+
+
+class FastParquetImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of fastparquet
         # we need to import on first use
-
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
-        if LooseVersion(fastparquet.__version__) < LooseVersion('0.1.0'):
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
+        if LooseVersion(fastparquet.__version__) < '0.1.0':
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
+        self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
@@ -120,7 +191,8 @@ def write(self, df, path, compression='snappy', **kwargs):
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs)
+        parquet_file = self.api.ParquetFile(path)
+        return parquet_file.to_pandas(columns=columns, **kwargs)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -141,43 +213,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     kwargs
         Additional keyword arguments passed to the engine
     """
     impl = get_engine(engine)
-
-    if not isinstance(df, DataFrame):
-        raise ValueError("to_parquet only support IO with DataFrames")
-
-    valid_types = {'string', 'unicode'}
-
-    # validate index
-    # --------------
-
-    # validate that we have only a default index
-    # raise on anything else as we don't serialize the index
-
-    if not isinstance(df.index, Int64Index):
-        raise ValueError("parquet does not support serializing {} "
-                         "for the index; you can .reset_index()"
-                         "to make the index into column(s)".format(
-                             type(df.index)))
-
-    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
-        raise ValueError("parquet does not support serializing a "
-                         "non-default index for the index; you "
-                         "can .reset_index() to make the index "
-                         "into column(s)")
-
-    if df.index.name is not None:
-        raise ValueError("parquet does not serialize index meta-data on a "
-                         "default index")
-
-    # validate columns
-    # ----------------
-
-    # must have value column names (strings only)
-    if df.columns.inferred_type not in valid_types:
-        raise ValueError("parquet must have string column names")
-
     return impl.write(df, path, compression=compression, **kwargs)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 3243e7a6017c2..ff16b7f1d074d 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -6,6 +6,7 @@
 from warnings import catch_warnings
 
 import numpy as np
+from numpy.random import randn
 import pandas as pd
 from pandas.compat import PY3
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
@@ -43,23 +44,7 @@ def engine(request):
 def pa():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_lt_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
-        pytest.skip("pyarrow is >= 0.7.0")
-    return 'pyarrow'
-
-
-@pytest.fixture
-def pa_ge_070():
-    if not _HAVE_PYARROW:
-        pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+    if LooseVersion(pyarrow.__version__) < '0.7.0':
         pytest.skip("pyarrow is < 0.7.0")
     return 'pyarrow'
 
@@ -68,6 +53,8 @@ def pa_ge_070():
 def fp():
     if not _HAVE_FASTPARQUET:
         pytest.skip("fastparquet is not installed")
+    if LooseVersion(fastparquet.__version__) < '0.1.0':
+        pytest.skip("fastparquet is < 0.1.0")
     return 'fastparquet'
 
@@ -181,9 +168,7 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
 class Base(object):
 
     def check_error_on_write(self, df, engine, exc):
-        # check that we are raising the exception
-        # on writing
-
+        # check that we are raising the exception on writing
         with pytest.raises(exc):
             with tm.ensure_clean() as path:
                 to_parquet(df, path, engine, compression=None)
@@ -247,8 +232,30 @@ def test_columns_dtypes_invalid(self, engine):
                       datetime.datetime(2011, 1, 1, 1, 1)]
         self.check_error_on_write(df, engine, ValueError)
 
-    def test_write_with_index(self, engine):
+    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
+    def test_compression(self, engine, compression):
 
+        if compression == 'snappy':
+            pytest.importorskip('snappy')
+
+        elif compression == 'brotli':
+            pytest.importorskip('brotli')
 
         df = pd.DataFrame({'A': [1, 2, 3]})
+        self.check_round_trip(df, engine,
+                              write_kwargs={'compression': compression})
+
+    def test_read_columns(self, engine):
+        # GH18154
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected,
+                              write_kwargs={'compression': None},
+                              read_kwargs={'columns': ['string']})
+
+    def test_write_with_index(self, engine):
+        df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
@@ -262,40 +269,39 @@ def test_write_with_index(self, engine):
         ]:
 
             df.index = index
-            self.check_error_on_write(df, engine, ValueError)
+            self.check_round_trip(df, engine)
 
         # index with meta-data
         df.index = [0, 1, 2]
         df.index.name = 'foo'
-        self.check_error_on_write(df, engine, ValueError)
+        self.check_round_trip(df, engine)
 
         # column multi-index
         df.index = [0, 1, 2]
         df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
         self.check_error_on_write(df, engine, ValueError)
 
-    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
-    def test_compression(self, engine, compression):
-
-        if compression == 'snappy':
-            pytest.importorskip('snappy')
-
-        elif compression == 'brotli':
-            pytest.importorskip('brotli')
-
-        df = pd.DataFrame({'A': [1, 2, 3]})
-        self.check_round_trip(df, engine,
-                              write_kwargs={'compression': compression})
-
-    def test_read_columns(self, engine):
-        # GH18154
-        df = pd.DataFrame({'string': list('abc'),
-                           'int': list(range(1, 4))})
-
-        expected = pd.DataFrame({'string': list('abc')})
-        self.check_round_trip(df, engine, expected=expected,
-                              write_kwargs={'compression': None},
-                              read_kwargs={'columns': ['string']})
+    def test_multiindex_with_columns(self, engine):
+        if engine == 'fastparquet':
+            pytest.xfail("fastparquet doesn't support multi-indexes as of 0.1.3")
+
+        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
+        df = pd.DataFrame(randn(2*len(dates), 3), columns=list('ABC'))
+        index1 = pd.MultiIndex.from_product(
+            [['Level1', 'Level2'], dates],
+            names=['level', 'date']
+        )
+        index2 = index1.copy(names=None)
+        for index in [index1, index2]:
+            df.index = index
+            with tm.ensure_clean() as path:
+                df.to_parquet(path, engine)
+                result = read_parquet(path, engine)
+                expected = df
+                tm.assert_frame_equal(result, expected)
+                result = read_parquet(path, engine, columns=['A', 'B'])
+                expected = df[['A', 'B']]
+                tm.assert_frame_equal(result, expected)
 
 
 class TestParquetPyArrow(Base):
@@ -322,14 +328,12 @@ def test_basic(self, pa):
         self.check_round_trip(df, pa)
 
     def test_duplicate_columns(self, pa):
-
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3),
                           columns=list('aaa')).copy()
         self.check_error_on_write(df, pa, ValueError)
 
     def test_unsupported(self, pa):
-
         # period
         df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
         self.check_error_on_write(df, pa, ValueError)
@@ -343,23 +347,13 @@ def test_unsupported(self, pa):
         df = pd.DataFrame({'a': ['a', 1, 2.0]})
         self.check_error_on_write(df, pa, ValueError)
 
-    def test_categorical(self, pa_ge_070):
-        pa = pa_ge_070
-
-        # supported in >= 0.7.0
+    def test_categorical(self, pa):
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
 
         # de-serialized as object
         expected = df.assign(a=df.a.astype(object))
         self.check_round_trip(df, pa, expected)
 
-    def test_categorical_unsupported(self, pa_lt_070):
-        pa = pa_lt_070
-
-        # supported in >= 0.7.0
-        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
-        self.check_error_on_write(df, pa, NotImplementedError)
-
 
 class TestParquetFastParquet(Base):
 

From 7a6d9a99e41f06587cb2626916dd464c650c98f2 Mon Sep 17 00:00:00 2001
From: dhirschfeld
Date: Sat, 9 Dec 2017 18:02:33 +1000
Subject: [PATCH 02/12] Fixed test_write_with_index on Windows

snappy compression isn't available on Windows
---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ff16b7f1d074d..115cde9211c30 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -269,12 +269,12 @@ def test_write_with_index(self, engine):
         ]:
 
             df.index = index
-            self.check_round_trip(df, engine)
+            self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
         # index with meta-data
         df.index = [0, 1, 2]
         df.index.name = 'foo'
-        self.check_round_trip(df, engine)
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
         # column multi-index
         df.index = [0, 1, 2]

From 1496ef6da086ac7d1fe59d795b3b4c28c9887b08 Mon Sep 17 00:00:00 2001
From: dhirschfeld
Date: Sat, 9 Dec 2017 19:33:51 +1000
Subject: [PATCH 03/12] Fix test_write_with_index to work with fastparquet

fastparquet automatically names an index 'index' if it doesn't already
have a name
---
 pandas/tests/io/test_parquet.py | 44 ++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 115cde9211c30..c98ac124b7f4f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -174,7 +174,8 @@ def check_error_on_write(self, df, engine, exc):
                 to_parquet(df, path, engine, compression=None)
 
     def check_round_trip(self, df, engine, expected=None,
-                         write_kwargs=None, read_kwargs=None):
+                         write_kwargs=None, read_kwargs=None,
+                         check_names=True):
         if write_kwargs is None:
             write_kwargs = {}
         if read_kwargs is None:
@@ -185,7 +186,7 @@ def check_round_trip(self, df, engine, expected=None,
 
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_names=check_names)
 
         # repeat
         to_parquet(df, path, engine, **write_kwargs)
@@ -193,7 +194,7 @@ def check_round_trip(self, df, engine, expected=None,
 
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_names=check_names)
 
 
 class TestBasic(Base):
@@ -256,25 +257,40 @@ def test_read_columns(self, engine):
                               read_kwargs={'columns': ['string']})
 
     def test_write_with_index(self, engine):
+        check_names = engine != 'fastparquet'
+
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
+        indexes = [
+            [2, 3, 4],
+            pd.date_range('20130101', periods=3),
+            list('abc'),
+            [1, 3, 4],
+        ]
         # non-default index
-        for index in [[2, 3, 4],
-                      pd.date_range('20130101', periods=3),
-                      list('abc'),
-                      [1, 3, 4],
-                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
-                                                 ('b', 1)]),
-                      ]:
-
+        for index in indexes:
             df.index = index
-            self.check_round_trip(df, engine, write_kwargs={'compression': None})
-
+            self.check_round_trip(
+                df, engine,
+                write_kwargs={'compression': None},
+                check_names=check_names,
+            )
+        if engine != 'fastparquet':
+            # Not supported in fastparquet as of 0.1.3
+            index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+            df.index = index
+            self.check_round_trip(
+                df, engine,
+                write_kwargs={'compression': None},
+            )
         # index with meta-data
         df.index = [0, 1, 2]
         df.index.name = 'foo'
-        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+        self.check_round_trip(
+            df, engine,
+            write_kwargs={'compression': None}
+        )
 
         # column multi-index
         df.index = [0, 1, 2]

From 9c8cfb7e73f8f25b677622ce9387a399dabf4ed5 Mon Sep 17 00:00:00 2001
From: dhirschfeld
Date: Sun, 10 Dec 2017 12:08:50 +1000
Subject: [PATCH 04/12] Fix linting issues
---
 pandas/io/parquet.py            | 15 +++++++++------
 pandas/tests/io/test_parquet.py |  9 +++++----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a2733c53eb9cc..8f07b14fc4a3d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -3,7 +3,7 @@
 from warnings import catch_warnings
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
-from pandas.compat import range, string_types
+from pandas.compat import string_types
 from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
@@ -87,7 +87,7 @@ def __init__(self):
                 "pip install -U pyarrow\n"
             )
         self._pyarrow_lt_070 = (
-           LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')
         )
         self.api = pyarrow
 
@@ -113,14 +113,16 @@ def read(self, path, columns=None, **kwargs):
         kwargs['use_pandas_metadata'] = True
         return parquet_file.read(columns=columns, **kwargs).to_pandas()
 
-
     def _validate_write_lt_070(self, df, path, compression='snappy',
-                              coerce_timestamps='ms', **kwargs):
+                               coerce_timestamps='ms', **kwargs):
         # Compatibility shim for pyarrow < 0.7.0
         # TODO: Remove in pandas 0.22.0
         from pandas.core.indexes.multi import MultiIndex
         if isinstance(df.index, MultiIndex):
-            msg = "Multi-index DataFrames are only supported with pyarrow >= 0.7.0"
+            msg = (
+                "Multi-index DataFrames are only supported "
+                "with pyarrow >= 0.7.0"
+            )
             raise ValueError(msg)
         # Validate index
         if not isinstance(df.index, Int64Index):
@@ -150,7 +152,8 @@ def _read_lt_070(self, parquet_file, columns, **kwargs):
             metadata = json.loads(parquet_file.metadata.metadata[b'pandas'])
             columns = set(chain(columns, metadata['index_columns']))
         kwargs['columns'] = columns
-        return self.api.parquet.read_table(parquet_file.path, **kwargs).to_pandas()
+        kwargs['path'] = parquet_file.path
+        return self.api.parquet.read_table(**kwargs).to_pandas()

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index c98ac124b7f4f..057f5e45b930c 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -258,7 +258,7 @@ def test_read_columns(self, engine):
 
     def test_write_with_index(self, engine):
         check_names = engine != 'fastparquet'
-
+
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, write_kwargs={'compression': None})
@@ -278,7 +278,7 @@ def test_write_with_index(self, engine):
         if engine != 'fastparquet':
             # Not supported in fastparquet as of 0.1.3
-            index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+            index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
             df.index = index
             self.check_round_trip(
                 df, engine,
@@ -299,10 +299,11 @@ def test_write_with_index(self, engine):
 
     def test_multiindex_with_columns(self, engine):
         if engine == 'fastparquet':
-            pytest.xfail("fastparquet doesn't support multi-indexes as of 0.1.3")
+            msg = "fastparquet doesn't support multi-indexes as of 0.1.3"
+            pytest.xfail(msg)
 
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
-        df = pd.DataFrame(randn(2*len(dates), 3), columns=list('ABC'))
+        df = pd.DataFrame(randn(2 * len(dates), 3), columns=list('ABC'))
         index1 = pd.MultiIndex.from_product(
             [['Level1', 'Level2'], dates],
             names=['level', 'date']

From 60124ac32974865e21bb536e249df623129ca1c3 Mon Sep 17 00:00:00 2001
From: dhirschfeld
Date: Sun, 10 Dec 2017 12:11:16 +1000
Subject: [PATCH 05/12] Added whatsnew entry for 0.21.1
---
 doc/source/whatsnew/v0.21.1.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index daf060f50b060..e4dacb628353c 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -53,6 +53,7 @@ Other Enhancements
 
 - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`)
 - :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`).
+- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
 -
 
 .. _whatsnew_0211.deprecations:

From d8a27e16d8bc6f87047e9de00501ceadc4143b0d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 16:58:46 +0100
Subject: [PATCH 06/12] move whatsnew note to parquet section
---
 doc/source/whatsnew/v0.21.1.txt | 4 +++-
 doc/source/whatsnew/v0.22.0.txt | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 46ee02e4482bd..1b709cfd4aa90 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -61,6 +61,9 @@ New features
 Improvements to the Parquet IO functionality
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- :func:`DataFrame.to_parquet` will now write non-default indexes when the
+  underlying engine supports it. The indexes will be preserved when reading
+  back in with :func:`read_parquet` (:issue:`18581`).
 - :func:`read_parquet` now allows to specify the columns to read from a
   parquet file (:issue:`18154`)
 - :func:`read_parquet` now allows to specify kwargs which are passed to the
   respective engine (:issue:`18216`)
@@ -71,7 +74,6 @@ Other Enhancements
 
 - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`)
 - :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`).
-- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
 
 .. _whatsnew_0211.deprecations:

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index cca4bd77075a2..650832beadae3 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -329,4 +329,4 @@ Other
 ^^^^^
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
--
\ No newline at end of file
+-
\ No newline at end of file

From cd2611025ab0955273a066d48eda7abee18ed9cd Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 17:11:49 +0100
Subject: [PATCH 07/12] undo version changes in ci and install docs
---
 ci/requirements-2.7.sh          | 2 +-
 ci/requirements-3.5.sh          | 2 +-
 doc/source/install.rst          | 2 +-
 doc/source/whatsnew/v0.22.0.txt | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh
index 9e59f3e5eeb55..e3bd5e46026c5 100644
--- a/ci/requirements-2.7.sh
+++ b/ci/requirements-2.7.sh
@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet

diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh
index a1ed467b36ff9..d694ad3679ac1 100644
--- a/ci/requirements-3.5.sh
+++ b/ci/requirements-3.5.sh
@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0

diff --git a/doc/source/install.rst b/doc/source/install.rst
index 641a07ed22810..979d5afd0a04f 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -229,7 +229,7 @@ Optional Dependencies
 * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet `__, either `pyarrow `__ (>= 0.7.0) or `fastparquet `__ (>= 0.1.0) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support.
+* `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support.
 * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are:
   * `psycopg2 `__: for PostgreSQL

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 650832beadae3..02cb5aa870c9b 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -329,4 +329,4 @@ Other
 ^^^^^
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
--
\ No newline at end of file
+-

From 1acb00c021b070a602244d137973131bef697f20 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 18:41:30 +0100
Subject: [PATCH 08/12] undo testing changes for old pyarrow versions
---
 pandas/tests/io/test_parquet.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index b06398412900b..198143b40cb0f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -44,7 +44,23 @@ def engine(request):
 def pa():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
-    if LooseVersion(pyarrow.__version__) < '0.7.0':
+    return 'pyarrow'
+
+
+@pytest.fixture
+def pa_lt_070():
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
+        pytest.skip("pyarrow is >= 0.7.0")
+    return 'pyarrow'
+
+
+@pytest.fixture
+def pa_ge_070():
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
         pytest.skip("pyarrow is < 0.7.0")
     return 'pyarrow'
 
@@ -53,8 +69,6 @@ def pa():
 def fp():
     if not _HAVE_FASTPARQUET:
         pytest.skip("fastparquet is not installed")
-    if LooseVersion(fastparquet.__version__) < '0.1.0':
-        pytest.skip("fastparquet is < 0.1.0")
     return 'fastparquet'
 
@@ -394,13 +408,23 @@ def test_unsupported(self, pa):
         df = pd.DataFrame({'a': ['a', 1, 2.0]})
         self.check_error_on_write(df, pa, ValueError)
 
-    def test_categorical(self, pa):
+    def test_categorical(self, pa_ge_070):
+        pa = pa_ge_070
+
+        # supported in >= 0.7.0
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
 
         # de-serialized as object
         expected = df.assign(a=df.a.astype(object))
         self.check_round_trip(df, pa, expected)
 
+    def test_categorical_unsupported(self, pa_lt_070):
+        pa = pa_lt_070
+
+        # supported in >= 0.7.0
+        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+        self.check_error_on_write(df, pa, NotImplementedError)
+
 
 class TestParquetFastParquet(Base):

From 0f7b5269a0276094dd645e601a69cd3d90846daa Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 19:26:07 +0100
Subject: [PATCH 09/12] fix test for older versions of pyarrow
---
 pandas/io/parquet.py            | 33 ++++++++++++++++---------
 pandas/tests/io/test_parquet.py | 44 +++++++++++++++++----------------
 2 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 8f07b14fc4a3d..0a344487780c6 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -86,9 +86,12 @@ def __init__(self):
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
+
+        self._pyarrow_lt_060 = (
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
         self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')
-        )
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
+
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
@@ -99,17 +102,23 @@ def write(self, df, path, compression='snappy',
                 df, path, compression, coerce_timestamps, **kwargs
             )
         path, _, _ = get_filepath_or_buffer(path)
-        table = self.api.Table.from_pandas(df)
-        self.api.parquet.write_table(
-            table, path, compression=compression,
-            coerce_timestamps=coerce_timestamps, **kwargs)
+
+        if self._pyarrow_lt_060:
+            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
+            self.api.parquet.write_table(
+                table, path, compression=compression, **kwargs)
+
+        else:
+            table = self.api.Table.from_pandas(df)
+            self.api.parquet.write_table(
+                table, path, compression=compression,
+                coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
         parquet_file = self.api.parquet.ParquetFile(path)
         if self._pyarrow_lt_070:
-            parquet_file.path = path
-            return self._read_lt_070(parquet_file, columns, **kwargs)
+            return self._read_lt_070(path, parquet_file, columns, **kwargs)
         kwargs['use_pandas_metadata'] = True
         return parquet_file.read(columns=columns, **kwargs).to_pandas()
 
@@ -143,17 +152,17 @@ def _validate_write_lt_070(self, df, path, compression='snappy',
             "on a default index"
         )
 
-    def _read_lt_070(self, parquet_file, columns, **kwargs):
+    def _read_lt_070(self, path, parquet_file, columns, **kwargs):
         # Compatibility shim for pyarrow < 0.7.0
         # TODO: Remove in pandas 0.22.0
         from itertools import chain
         import json
         if columns is not None:
-            metadata = json.loads(parquet_file.metadata.metadata[b'pandas'])
+            metadata = json.loads(
+                parquet_file.metadata.metadata[b'pandas'].decode('utf-8'))
             columns = set(chain(columns, metadata['index_columns']))
         kwargs['columns'] = columns
-        kwargs['path'] = parquet_file.path
-        return self.api.parquet.read_table(**kwargs).to_pandas()
+        return self.api.parquet.read_table(path, **kwargs).to_pandas()

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 198143b40cb0f..33c46da637734 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -296,9 +296,14 @@ def test_read_columns(self, engine):
                               write_kwargs={'compression': None},
                               read_kwargs={'columns': ['string']})
 
-    def test_write_with_index(self, engine):
+    def test_write_index(self, engine):
         check_names = engine != 'fastparquet'
 
+        if engine == 'pyarrow':
+            import pyarrow
+            if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+                pytest.skip("pyarrow is < 0.7.0")
+
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
@@ -314,34 +319,31 @@ def test_write_index(self, engine):
             self.check_round_trip(
                 df, engine,
                 write_kwargs={'compression': None},
-                check_names=check_names,
-            )
-        if engine != 'fastparquet':
-            # Not supported in fastparquet as of 0.1.3
-            index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
-            df.index = index
-            self.check_round_trip(
-                df, engine,
-                write_kwargs={'compression': None},
-            )
+                check_names=check_names)
+
         # index with meta-data
         df.index = [0, 1, 2]
         df.index.name = 'foo'
-        self.check_round_trip(
-            df, engine,
-            write_kwargs={'compression': None}
-        )
+        self.check_round_trip( df, engine, write_kwargs={'compression': None})
+
+    def test_write_multiindex(self, pa_ge_070):
+        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
+        engine = pa_ge_070
+
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+        df.index = index
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+
+    def test_write_column_multiindex(self, engine):
         # column multi-index
-        df.index = [0, 1, 2]
-        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
+        mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
         self.check_error_on_write(df, engine, ValueError)
 
-    def test_multiindex_with_columns(self, engine):
-        if engine == 'fastparquet':
-            msg = "fastparquet doesn't support multi-indexes as of 0.1.3"
-            pytest.xfail(msg)
+    def test_multiindex_with_columns(self, pa_ge_070):
+        engine = pa_ge_070
 
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
         df = pd.DataFrame(randn(2 * len(dates), 3), columns=list('ABC'))
         index1 = pd.MultiIndex.from_product(
             [['Level1', 'Level2'], dates],
             names=['level', 'date']

From 67dffdec2c23928717ac237e4f84a15b482c51a8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 23:32:31 +0100
Subject: [PATCH 10/12] use read_pandas for pyarrow < 0.7
---
 pandas/io/parquet.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 0a344487780c6..32b7f528d85c0 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -155,14 +155,8 @@ def _validate_write_lt_070(self, df, path, compression='snappy',
     def _read_lt_070(self, path, parquet_file, columns, **kwargs):
         # Compatibility shim for pyarrow < 0.7.0
         # TODO: Remove in pandas 0.22.0
-        from itertools import chain
-        import json
-        if columns is not None:
-            metadata = json.loads(
-                parquet_file.metadata.metadata[b'pandas'].decode('utf-8'))
-            columns = set(chain(columns, metadata['index_columns']))
         kwargs['columns'] = columns
-        return self.api.parquet.read_table(path, **kwargs).to_pandas()
+        return self.api.parquet.read_pandas(path, **kwargs).to_pandas()

From 79de86b989dfcacf9f1f0ffd4b30833843bc2bc7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 11 Dec 2017 00:35:21 +0100
Subject: [PATCH 11/12] further clean-up
---
 pandas/io/parquet.py            | 40 +++++++++++++++------------------
 pandas/tests/io/test_parquet.py |  9 ++++----
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 32b7f528d85c0..96735d431d369 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -98,9 +98,7 @@ def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
         self.validate_dataframe(df)
         if self._pyarrow_lt_070:
-            self._validate_write_lt_070(
-                df, path, compression, coerce_timestamps, **kwargs
-            )
+            self._validate_write_lt_070(df)
         path, _, _ = get_filepath_or_buffer(path)
 
         if self._pyarrow_lt_060:
@@ -116,14 +114,16 @@ def write(self, df, path, compression='snappy',
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        parquet_file = self.api.parquet.ParquetFile(path)
         if self._pyarrow_lt_070:
-            return self._read_lt_070(path, parquet_file, columns, **kwargs)
+            return self.api.parquet.read_pandas(path, columns=columns,
+                                                **kwargs).to_pandas()
         kwargs['use_pandas_metadata'] = True
-        return parquet_file.read(columns=columns, **kwargs).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns,
+                                           **kwargs).to_pandas()
 
-    def _validate_write_lt_070(self, df, path, compression='snappy',
-                               coerce_timestamps='ms', **kwargs):
+    def _validate_write_lt_070(self, df):
         # Compatibility shim for pyarrow < 0.7.0
         # TODO: Remove in pandas 0.22.0
         from pandas.core.indexes.multi import MultiIndex
         if isinstance(df.index, MultiIndex):
             msg = (
                 "Multi-index DataFrames are only supported "
                 "with pyarrow >= 0.7.0"
             )
             raise ValueError(msg)
         # Validate index
         if not isinstance(df.index, Int64Index):
             msg = (
                 "pyarrow < 0.7.0 does not support serializing {} for the "
                 "index; you can .reset_index() to make the index into "
                 "column(s), or install the latest version of pyarrow or "
                 "fastparquet."
             )
             raise ValueError(msg.format(type(df.index)))
         if not df.index.equals(RangeIndex(len(df))):
             raise ValueError(
                 "pyarrow < 0.7.0 does not support serializing a non-default "
                 "index; you can .reset_index() to make the index into "
                 "column(s), or install the latest version of pyarrow or "
                 "fastparquet."
             )
         if df.index.name is not None:
             raise ValueError(
                 "pyarrow < 0.7.0 does not serialize indexes with a name; you "
                 "can set the index.name to None or install the latest version "
                 "of pyarrow or fastparquet."
             )
 
-    def _read_lt_070(self, path, parquet_file, columns, **kwargs):
-        # Compatibility shim for pyarrow < 0.7.0
-        # TODO: Remove in pandas 0.22.0
-        kwargs['columns'] = columns
-        return self.api.parquet.read_pandas(path, **kwargs).to_pandas()
-
 
 class FastParquetImpl(BaseImpl):

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 33c46da637734..c59acbd946f91 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -6,7 +6,6 @@
 from warnings import catch_warnings
 
 import numpy as np
-from numpy.random import randn
 import pandas as pd
 from pandas.compat import PY3, is_platform_windows
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
@@ -324,7 +323,7 @@ def test_write_index(self, engine):
         # index with meta-data
         df.index = [0, 1, 2]
         df.index.name = 'foo'
-        self.check_round_trip( df, engine, write_kwargs={'compression': None})
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
 
     def test_write_multiindex(self, pa_ge_070):
         # Not supported in fastparquet as of 0.1.3 or older pyarrow version
@@ -345,11 +344,11 @@ def test_multiindex_with_columns(self, pa_ge_070):
         engine = pa_ge_070
 
         dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
-        df = pd.DataFrame(randn(2 * len(dates), 3), columns=list('ABC'))
+        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
+                          columns=list('ABC'))
         index1 = pd.MultiIndex.from_product(
             [['Level1', 'Level2'], dates],
-            names=['level', 'date']
-        )
+            names=['level', 'date'])
         index2 = index1.copy(names=None)
         for index in [index1, index2]:
             df.index = index

From 0fbdc14ac336f3a29fe531fffd7b4b5b7c766691 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 11 Dec 2017 18:00:03 +0100
Subject: [PATCH 12/12] style update
---
 pandas/io/parquet.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 96735d431d369..aa5dd821f5980 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -41,11 +41,14 @@ class BaseImpl(object):
 
     @staticmethod
     def validate_dataframe(df):
+
         if not isinstance(df, DataFrame):
             raise ValueError("to_parquet only supports IO with DataFrames")
+
         # must have value column names (strings only)
         if df.columns.inferred_type not in {'string', 'unicode'}:
             raise ValueError("parquet must have string column names")
+
         # index level names must be strings
         valid_names = all(
             isinstance(name, string_types)
             for name in df.index.names
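For illustration, a minimal sketch of the shared validation that BaseImpl
provides to both engines after this series (assumes at least one engine is
installed; the DataFrames below are arbitrary examples, not part of the
patch):

    import pandas as pd
    from pandas.io.parquet import get_engine

    impl = get_engine('auto')          # PyArrowImpl or FastParquetImpl
    impl.validate_dataframe(pd.DataFrame({'a': [1, 2]}))  # passes silently

    bad = pd.DataFrame({0: [1, 2]})    # non-string column name
    impl.validate_dataframe(bad)       # raises ValueError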