From 93f154cd5cca4cbd0ad0725c83700ffa61c6527c Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 1 Aug 2018 17:23:34 -0400 Subject: [PATCH] API: Default to_* methods to compression='infer' (#22011) Closes gh-22004. --- doc/source/io.rst | 2 +- doc/source/whatsnew/v0.24.0.txt | 3 +- pandas/core/frame.py | 8 ++- pandas/core/generic.py | 9 +-- pandas/core/series.py | 11 ++-- pandas/io/formats/csvs.py | 41 ++++++------ pandas/io/json/json.py | 2 +- pandas/tests/io/test_common.py | 61 +++++++++--------- pandas/tests/io/test_compression.py | 99 +++++++++++++++++++++++++++++ pandas/tests/test_common.py | 69 ++------------------ 10 files changed, 180 insertions(+), 125 deletions(-) create mode 100644 pandas/tests/io/test_compression.py diff --git a/doc/source/io.rst b/doc/source/io.rst index 9fe578524c8e0..c2c8c1c17700f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -298,7 +298,7 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None`` Set to ``None`` for no decompression. .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. - + .. versionchanged:: 0.24.0 'infer' option added and set to default. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 8b89618cd0d88..2e0d9ed2bf3f0 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -177,7 +177,8 @@ Other Enhancements - :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). - :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 834cc3d188b39..ebd35cb1a6a1a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1715,7 +1715,7 @@ def to_panel(self): def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression=None, quoting=None, + mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -1750,10 +1750,14 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, encoding : string, optional A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, + default 'infer' If 'infer' and `path_or_buf` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip' or '.xz' (otherwise no compression). + + .. versionchanged:: 0.24.0 + 'infer' option added and set to default line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7a12ce0e1385e..f62605c342702 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1933,7 +1933,7 @@ def _repr_latex_(self): def to_json(self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression=None, + default_handler=None, lines=False, compression='infer', index=True): """ Convert the object to a JSON string. @@ -1999,13 +1999,14 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, like. .. versionadded:: 0.19.0 - - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, + default 'infer' A string representing the compression to use in the output file, only used when the first argument is a filename. .. versionadded:: 0.21.0 - + .. versionchanged:: 0.24.0 + 'infer' option added and set to default index : boolean, default True Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when diff --git a/pandas/core/series.py b/pandas/core/series.py index 8f9fe5ee516e6..21dea15772cc0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3767,7 +3767,7 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, def to_csv(self, path=None, index=True, sep=",", na_rep='', float_format=None, header=False, index_label=None, - mode='w', encoding=None, compression=None, date_format=None, + mode='w', encoding=None, compression='infer', date_format=None, decimal='.'): """ Write Series to a comma-separated values (csv) file @@ -3795,10 +3795,13 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', encoding : string, optional a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 - compression : string, optional + compression : None or string, default 'infer' A string representing the compression to use in the output file. - Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only - used when the first argument is a filename. + Allowed values are None, 'gzip', 'bz2', 'zip', 'xz', and 'infer'. + This input is only used when the first argument is a filename. + + .. versionchanged:: 0.24.0 + 'infer' option added and set to default date_format: string, default None Format string for datetime objects. decimal: string, default '.' diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0796888554a46..6fabd2573a7b4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -21,8 +21,13 @@ from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCPeriodIndex, ABCDatetimeIndex, ABCIndexClass) -from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user, - _stringify_path) +from pandas.io.common import ( + _expand_user, + _get_handle, + _infer_compression, + _stringify_path, + UnicodeWriter, +) class CSVFormatter(object): @@ -30,7 +35,7 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, - compression=None, quoting=None, line_terminator='\n', + compression='infer', quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -50,8 +55,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.index = index self.index_label = index_label self.mode = mode + if encoding is None: + encoding = 'ascii' if compat.PY2 else 'utf-8' self.encoding = encoding - self.compression = compression + self.compression = _infer_compression(self.path_or_buf, compression) if quoting is None: quoting = csvlib.QUOTE_MINIMAL @@ -124,16 +131,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.nlevels = 0 def save(self): - # create the writer & save - if self.encoding is None: - if compat.PY2: - encoding = 'ascii' - else: - encoding = 'utf-8' - else: - encoding = self.encoding - - # GH 21227 internal compression is not used when file-like passed. + """ + Create the writer & save + """ + # GH21227 internal compression is not used when file-like passed. if self.compression and hasattr(self.path_or_buf, 'write'): msg = ("compression has no effect when passing file-like " "object as input.") @@ -147,7 +148,7 @@ def save(self): if is_zip: # zipfile doesn't support writing string to archive. uses string # buffer to receive csv writing and dump into zip compression - # file handle. GH 21241, 21118 + # file handle. GH21241, GH21118 f = StringIO() close = False elif hasattr(self.path_or_buf, 'write'): @@ -155,7 +156,7 @@ def save(self): close = False else: f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, + encoding=self.encoding, compression=self.compression) close = True @@ -165,23 +166,23 @@ def save(self): doublequote=self.doublequote, escapechar=self.escapechar, quotechar=self.quotechar) - if encoding == 'ascii': + if self.encoding == 'ascii': self.writer = csvlib.writer(f, **writer_kwargs) else: - writer_kwargs['encoding'] = encoding + writer_kwargs['encoding'] = self.encoding self.writer = UnicodeWriter(f, **writer_kwargs) self._save() finally: if is_zip: - # GH 17778 handles zip compression separately. + # GH17778 handles zip compression separately. buf = f.getvalue() if hasattr(self.path_or_buf, 'write'): self.path_or_buf.write(buf) else: f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, + encoding=self.encoding, compression=self.compression) f.write(buf) close = True diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 629e00ebfa7d0..c5f8872f93d94 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -28,7 +28,7 @@ # interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression=None, + default_handler=None, lines=False, compression='infer', index=True): if not index and orient not in ['split', 'table']: diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 5c9739be73393..ceaac9818354a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -1,19 +1,20 @@ """ - Tests for the pandas.io.common functionalities +Tests for the pandas.io.common functionalities """ import mmap -import pytest import os -from os.path import isabs + +import pytest import pandas as pd -import pandas.util.testing as tm +import pandas.io.common as icom import pandas.util._test_decorators as td - -from pandas.io import common -from pandas.compat import is_platform_windows, StringIO, FileNotFoundError - -from pandas import read_csv, concat +import pandas.util.testing as tm +from pandas.compat import ( + is_platform_windows, + StringIO, + FileNotFoundError, +) class CustomFSPath(object): @@ -55,24 +56,24 @@ class TestCommonIOCapabilities(object): def test_expand_user(self): filename = '~/sometest' - expanded_name = common._expand_user(filename) + expanded_name = icom._expand_user(filename) assert expanded_name != filename - assert isabs(expanded_name) + assert os.path.isabs(expanded_name) assert os.path.expanduser(filename) == expanded_name def test_expand_user_normal_path(self): filename = '/somefolder/sometest' - expanded_name = common._expand_user(filename) + expanded_name = icom._expand_user(filename) assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name @td.skip_if_no('pathlib') def test_stringify_path_pathlib(self): - rel_path = common._stringify_path(Path('.')) + rel_path = icom._stringify_path(Path('.')) assert rel_path == '.' - redundant_path = common._stringify_path(Path('foo//bar')) + redundant_path = icom._stringify_path(Path('foo//bar')) assert redundant_path == os.path.join('foo', 'bar') @td.skip_if_no('py.path') @@ -80,11 +81,11 @@ def test_stringify_path_localpath(self): path = os.path.join('foo', 'bar') abs_path = os.path.abspath(path) lpath = LocalPath(path) - assert common._stringify_path(lpath) == abs_path + assert icom._stringify_path(lpath) == abs_path def test_stringify_path_fspath(self): p = CustomFSPath('foo/bar.csv') - result = common._stringify_path(p) + result = icom._stringify_path(p) assert result == 'foo/bar.csv' @pytest.mark.parametrize('extension,expected', [ @@ -97,36 +98,36 @@ def test_stringify_path_fspath(self): @pytest.mark.parametrize('path_type', path_types) def test_infer_compression_from_path(self, extension, expected, path_type): path = path_type('foo/bar.csv' + extension) - compression = common._infer_compression(path, compression='infer') + compression = icom._infer_compression(path, compression='infer') assert compression == expected def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' - filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( filename) assert filepath_or_buffer != filename - assert isabs(filepath_or_buffer) + assert os.path.isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer assert not should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( input_buffer) assert filepath_or_buffer == input_buffer assert not should_close def test_iterator(self): - reader = read_csv(StringIO(self.data1), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(self.data1)) + reader = pd.read_csv(StringIO(self.data1), chunksize=1) + result = pd.concat(reader, ignore_index=True) + expected = pd.read_csv(StringIO(self.data1)) tm.assert_frame_equal(result, expected) # GH12153 - it = read_csv(StringIO(self.data1), chunksize=1) + it = pd.read_csv(StringIO(self.data1), chunksize=1) first = next(it) tm.assert_frame_equal(first, expected.iloc[[0]]) - tm.assert_frame_equal(concat(it), expected.iloc[1:]) + tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ (pd.read_csv, 'os', FileNotFoundError, 'csv'), @@ -246,18 +247,18 @@ def test_constructor_bad_file(self, mmap_file): msg = "[Errno 22]" err = mmap.error - tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) + tm.assert_raises_regex(err, msg, icom.MMapWrapper, non_file) target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( - ValueError, msg, common.MMapWrapper, target) + ValueError, msg, icom.MMapWrapper, target) def test_get_attr(self, mmap_file): with open(mmap_file, 'r') as target: - wrapper = common.MMapWrapper(target) + wrapper = icom.MMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs @@ -271,7 +272,7 @@ def test_get_attr(self, mmap_file): def test_next(self, mmap_file): with open(mmap_file, 'r') as target: - wrapper = common.MMapWrapper(target) + wrapper = icom.MMapWrapper(target) lines = target.readlines() for line in lines: @@ -285,4 +286,4 @@ def test_unknown_engine(self): df = tm.makeDataFrame() df.to_csv(path) with tm.assert_raises_regex(ValueError, 'Unknown engine'): - read_csv(path, engine='pyt') + pd.read_csv(path, engine='pyt') diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py new file mode 100644 index 0000000000000..76788ced44e84 --- /dev/null +++ b/pandas/tests/io/test_compression.py @@ -0,0 +1,99 @@ +import os + +import pytest + +import pandas as pd +import pandas.io.common as icom +import pandas.util.testing as tm + + +@pytest.mark.parametrize('obj', [ + pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +def test_compression_size(obj, method, compression_only): + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression=compression_only) + compressed_size = os.path.getsize(path) + getattr(obj, method)(path, compression=None) + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size + + +@pytest.mark.parametrize('obj', [ + pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_csv', 'to_json']) +def test_compression_size_fh(obj, method, compression_only): + with tm.ensure_clean() as path: + f, handles = icom._get_handle(path, 'w', compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + compressed_size = os.path.getsize(path) + with tm.ensure_clean() as path: + f, handles = icom._get_handle(path, 'w', compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size + + +@pytest.mark.parametrize('write_method, write_kwargs, read_method', [ + ('to_csv', {'index': False}, pd.read_csv), + ('to_json', {}, pd.read_json), + ('to_pickle', {}, pd.read_pickle), +]) +def test_dataframe_compression_defaults_to_infer( + write_method, write_kwargs, read_method, compression_only): + # GH22004 + input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=['X', 'Y', 'Z']) + extension = icom._compression_to_extension[compression_only] + with tm.ensure_clean('compressed' + extension) as path: + getattr(input, write_method)(path, **write_kwargs) + output = read_method(path, compression=compression_only) + tm.assert_frame_equal(output, input) + + +@pytest.mark.parametrize('write_method,write_kwargs,read_method,read_kwargs', [ + ('to_csv', {'index': False, 'header': True}, + pd.read_csv, {'squeeze': True}), + ('to_json', {}, pd.read_json, {'typ': 'series'}), + ('to_pickle', {}, pd.read_pickle, {}), +]) +def test_series_compression_defaults_to_infer( + write_method, write_kwargs, read_method, read_kwargs, + compression_only): + # GH22004 + input = pd.Series([0, 5, -2, 10], name='X') + extension = icom._compression_to_extension[compression_only] + with tm.ensure_clean('compressed' + extension) as path: + getattr(input, write_method)(path, **write_kwargs) + output = read_method(path, compression=compression_only, **read_kwargs) + tm.assert_series_equal(output, input, check_names=False) + + +def test_compression_warning(compression_only): + # Assert that passing a file object to to_csv while explicitly specifying a + # compression protocol triggers a RuntimeWarning, as per GH21227. + # Note that pytest has an issue that causes assert_produces_warning to fail + # in Python 2 if the warning has occurred in previous tests + # (see https://git.io/fNEBm & https://git.io/fNEBC). Hence, should this + # test fail in just Python 2 builds, it likely indicates that other tests + # are producing RuntimeWarnings, thereby triggering the pytest bug. + df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']) + with tm.ensure_clean() as path: + f, handles = icom._get_handle(path, 'w', compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, + check_stacklevel=False): + with f: + df.to_csv(f, compression=compression_only) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e1c9202189972..868525e818b62 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,17 +1,16 @@ # -*- coding: utf-8 -*- -import pytest -import os import collections from functools import partial import numpy as np +import pytest -from pandas import Series, DataFrame, Timestamp -import pandas.core.common as com -from pandas.core import ops -from pandas.io.common import _get_handle -import pandas.util.testing as tm +from pandas import Series, Timestamp +from pandas.core import ( + common as com, + ops, +) def test_get_callable_name(): @@ -20,7 +19,7 @@ def test_get_callable_name(): def fn(x): return x - lambda_ = lambda x: x + lambda_ = lambda x: x # noqa: E731 part1 = partial(fn) part2 = partial(part1) @@ -111,57 +110,3 @@ def test_standardize_mapping(): dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) - - -@pytest.mark.parametrize('obj', [ - DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) -def test_compression_size(obj, method, compression_only): - - with tm.ensure_clean() as filename: - getattr(obj, method)(filename, compression=compression_only) - compressed = os.path.getsize(filename) - getattr(obj, method)(filename, compression=None) - uncompressed = os.path.getsize(filename) - assert uncompressed > compressed - - -@pytest.mark.parametrize('obj', [ - DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_csv', 'to_json']) -def test_compression_size_fh(obj, method, compression_only): - - with tm.ensure_clean() as filename: - f, _handles = _get_handle(filename, 'w', compression=compression_only) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed - compressed = os.path.getsize(filename) - with tm.ensure_clean() as filename: - f, _handles = _get_handle(filename, 'w', compression=None) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed - uncompressed = os.path.getsize(filename) - assert uncompressed > compressed - - -# GH 21227 -def test_compression_warning(compression_only): - df = DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']) - with tm.ensure_clean() as filename: - f, _handles = _get_handle(filename, 'w', compression=compression_only) - with tm.assert_produces_warning(RuntimeWarning, - check_stacklevel=False): - with f: - df.to_csv(f, compression=compression_only)