From 7703867f4bdc77aa564da21e1ac2646f040058bb Mon Sep 17 00:00:00 2001
From: Dobatymo
Date: Sun, 8 Jul 2018 21:05:01 +0800
Subject: [PATCH] ENH: support 'infer' compression in _get_handle() (#17900)

xref gh-15008
xref gh-17262
---
 doc/source/whatsnew/v0.24.0.txt          |  1 +
 pandas/core/frame.py                     |  8 ++---
 pandas/core/generic.py                   |  2 +-
 pandas/io/common.py                      | 17 +++++++----
 pandas/io/pickle.py                      |  8 ++---
 pandas/tests/io/formats/test_to_csv.py   | 37 +++++++++++++++++++++++-
 pandas/tests/io/json/test_compression.py | 32 ++++++++++++++++++++
 7 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 812ea366b704a6..29c1a68053aef8 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -83,6 +83,7 @@ Other Enhancements
 - :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
+- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
 -

 .. _whatsnew_0240.api_breaking:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7e8fef9644d8ce..66f51cd0dae456 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1695,10 +1695,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         encoding : string, optional
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
-        compression : string, optional
-            A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
-            used when the first argument is a filename.
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
+            If 'infer' and `path_or_buf` is path-like, then detect compression
+            from the following extensions: '.gz', '.bz2' or '.xz'
+            (otherwise no compression).
         line_terminator : string, default ``'\n'``
             The newline character or character sequence to use in the output
             file
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index a75e3960cda165..bda96133c6b97d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1906,7 +1906,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

             .. versionadded:: 0.19.0

-        compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
             A string representing the compression to use in the output file,
             only used when the first argument is a filename.
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6d579fc8a8a099..17dda903cdadb3 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -267,10 +267,12 @@ def _infer_compression(filepath_or_buffer, compression):

     Parameters
     ----------
-    filepath_or_buf :
+    filepath_or_buffer :
         a path (str) or buffer
-    compression : str or None
-        the compression method including None for no compression and 'infer'
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+        If 'infer' and `filepath_or_buffer` is path-like, then detect
+        compression from the following extensions: '.gz', '.bz2', '.zip',
+        or '.xz' (otherwise no compression).

     Returns
     -------
@@ -322,8 +324,10 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     mode : str
         mode to open path_or_buf with
     encoding : str or None
-    compression : str or None
-        Supported compression protocols are gzip, bz2, zip, and xz
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+        If 'infer' and `path_or_buf` is path-like, then detect compression
+        from the following extensions: '.gz', '.bz2', '.zip', or '.xz'
+        (otherwise no compression).
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -350,6 +354,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     path_or_buf = _stringify_path(path_or_buf)
     is_path = isinstance(path_or_buf, compat.string_types)

+    if is_path:
+        compression = _infer_compression(path_or_buf, compression)
+
     if compression:

         if compat.PY2 and not is_path and encoding:
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index d347d76c33e0f7..6738daec9397cc 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -5,7 +5,7 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
-from pandas.io.common import _get_handle, _infer_compression, _stringify_path
+from pandas.io.common import _get_handle, _stringify_path


 def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
@@ -67,9 +67,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)
     f, fh = _get_handle(path, 'wb',
-                        compression=inferred_compression,
+                        compression=compression,
                         is_text=False)
     if protocol < 0:
         protocol = pkl.HIGHEST_PROTOCOL
@@ -138,12 +137,11 @@ def read_pickle(path, compression='infer'):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)

     def read_wrapper(func):
         # wrapper file handle open/close operation
         f, fh = _get_handle(path, 'rb',
-                            compression=inferred_compression,
+                            compression=compression,
                             is_text=False)
         try:
             return func(f)
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 20dfeb10daf46d..5fb356e48289ff 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 import sys
+
+import pytest
+
 import numpy as np
 import pandas as pd
-import pytest
+
 from pandas import DataFrame
 from pandas.util import testing as tm
@@ -316,3 +319,35 @@ def test_to_csv_write_to_open_file(self):
             df.to_csv(f, header=None, index=None)
         with open(path, 'r') as f:
             assert f.read() == expected
+
@pytest.mark.parametrize("to_infer", [True, False]) + @pytest.mark.parametrize("read_infer", [True, False]) + def test_to_csv_compression(self, compression_only, + read_infer, to_infer): + # see gh-15008 + compression = compression_only + + if compression == "zip": + pytest.skip("{compression} is not supported " + "for to_csv".format(compression=compression)) + + # We'll complete file extension subsequently. + filename = "test." + + if compression == "gzip": + filename += "gz" + else: + # xz --> .xz + # bz2 --> .bz2 + filename += compression + + df = DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression=to_compression) + result = pd.read_csv(path, index_col=0, + compression=read_compression) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 05ceace20f5a40..f2e72e5fe00e1b 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -88,3 +88,35 @@ def test_read_unsupported_compression_type(): msg = "Unrecognized compression type: unsupported" assert_raises_regex(ValueError, msg, pd.read_json, path, compression="unsupported") + + +@pytest.mark.parametrize("to_infer", [True, False]) +@pytest.mark.parametrize("read_infer", [True, False]) +def test_to_json_compression(compression_only, + read_infer, to_infer): + # see gh-15008 + compression = compression_only + + if compression == "zip": + pytest.skip("{compression} is not supported " + "for to_csv".format(compression=compression)) + + # We'll complete file extension subsequently. + filename = "test." + + if compression == "gzip": + filename += "gz" + else: + # xz --> .xz + # bz2 --> .bz2 + filename += compression + + df = pd.DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_json(path, compression=to_compression) + result = pd.read_json(path, compression=read_compression) + tm.assert_frame_equal(result, df)