API: Default to_* methods to compression='infer' (#22011)

Closes gh-22004.
pandas-dev · Aug 1, 2018 · 93f154c · 93f154c
1 parent 9c11866
commit 93f154c
Show file tree

Hide file tree

Showing 10 changed files with 180 additions and 125 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -298,7 +298,7 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``
   Set to ``None`` for no decompression.
 
   .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
-
+  .. versionchanged:: 0.24.0 'infer' option added and set to default.
 thousands : str, default ``None``
   Thousands separator.
 decimal : str, default ``'.'``

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -177,7 +177,8 @@ Other Enhancements
 - :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
-- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
+- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
+  The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
 - :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
 - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1715,7 +1715,7 @@ def to_panel(self):
 
     def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
                columns=None, header=True, index=True, index_label=None,
-               mode='w', encoding=None, compression=None, quoting=None,
+               mode='w', encoding=None, compression='infer', quoting=None,
                quotechar='"', line_terminator='\n', chunksize=None,
                tupleize_cols=None, date_format=None, doublequote=True,
                escapechar=None, decimal='.'):
@@ -1750,10 +1750,14 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         encoding : string, optional
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
-        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
+                      default 'infer'
             If 'infer' and `path_or_buf` is path-like, then detect compression
             from the following extensions: '.gz', '.bz2', '.zip' or '.xz'
             (otherwise no compression).
+
+            .. versionchanged:: 0.24.0
+               'infer' option added and set to default
         line_terminator : string, default ``'\n'``
             The newline character or character sequence to use in the output
             file

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1933,7 +1933,7 @@ def _repr_latex_(self):
 
     def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False, compression=None,
+                default_handler=None, lines=False, compression='infer',
                 index=True):
         """
         Convert the object to a JSON string.
@@ -1999,13 +1999,14 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
             like.
 
             .. versionadded:: 0.19.0
-
-        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
+                       default 'infer'
             A string representing the compression to use in the output file,
             only used when the first argument is a filename.
 
             .. versionadded:: 0.21.0
-
+            .. versionchanged:: 0.24.0
+               'infer' option added and set to default
         index : boolean, default True
             Whether to include the index values in the JSON string. Not
             including the index (``index=False``) is only supported when

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3767,7 +3767,7 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
 
     def to_csv(self, path=None, index=True, sep=",", na_rep='',
                float_format=None, header=False, index_label=None,
-               mode='w', encoding=None, compression=None, date_format=None,
+               mode='w', encoding=None, compression='infer', date_format=None,
                decimal='.'):
         """
         Write Series to a comma-separated values (csv) file
@@ -3795,10 +3795,13 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
         encoding : string, optional
             a string representing the encoding to use if the contents are
             non-ascii, for python versions prior to 3
-        compression : string, optional
+        compression : None or string, default 'infer'
             A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
-            used when the first argument is a filename.
+            Allowed values are None, 'gzip', 'bz2', 'zip', 'xz', and 'infer'.
+            This input is only used when the first argument is a filename.
+
+            .. versionchanged:: 0.24.0
+               'infer' option added and set to default
         date_format: string, default None
             Format string for datetime objects.
         decimal: string, default '.'

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -21,16 +21,21 @@
 from pandas.core.dtypes.generic import (
     ABCMultiIndex, ABCPeriodIndex, ABCDatetimeIndex, ABCIndexClass)
 
-from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
-                              _stringify_path)
+from pandas.io.common import (
+    _expand_user,
+    _get_handle,
+    _infer_compression,
+    _stringify_path,
+    UnicodeWriter,
+)
 
 
 class CSVFormatter(object):
 
     def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                  float_format=None, cols=None, header=True, index=True,
                  index_label=None, mode='w', nanRep=None, encoding=None,
-                 compression=None, quoting=None, line_terminator='\n',
+                 compression='infer', quoting=None, line_terminator='\n',
                  chunksize=None, tupleize_cols=False, quotechar='"',
                  date_format=None, doublequote=True, escapechar=None,
                  decimal='.'):
@@ -50,8 +55,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         self.index = index
         self.index_label = index_label
         self.mode = mode
+        if encoding is None:
+            encoding = 'ascii' if compat.PY2 else 'utf-8'
         self.encoding = encoding
-        self.compression = compression
+        self.compression = _infer_compression(self.path_or_buf, compression)
 
         if quoting is None:
             quoting = csvlib.QUOTE_MINIMAL
@@ -124,16 +131,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             self.nlevels = 0
 
     def save(self):
-        # create the writer & save
-        if self.encoding is None:
-            if compat.PY2:
-                encoding = 'ascii'
-            else:
-                encoding = 'utf-8'
-        else:
-            encoding = self.encoding
-
-        # GH 21227 internal compression is not used when file-like passed.
+        """
+        Create the writer & save
+        """
+        # GH21227 internal compression is not used when file-like passed.
         if self.compression and hasattr(self.path_or_buf, 'write'):
             msg = ("compression has no effect when passing file-like "
                    "object as input.")
@@ -147,15 +148,15 @@ def save(self):
         if is_zip:
             # zipfile doesn't support writing string to archive. uses string
             # buffer to receive csv writing and dump into zip compression
-            # file handle. GH 21241, 21118
+            # file handle. GH21241, GH21118
             f = StringIO()
             close = False
         elif hasattr(self.path_or_buf, 'write'):
             f = self.path_or_buf
             close = False
         else:
             f, handles = _get_handle(self.path_or_buf, self.mode,
-                                     encoding=encoding,
+                                     encoding=self.encoding,
                                      compression=self.compression)
             close = True
 
@@ -165,23 +166,23 @@ def save(self):
                                  doublequote=self.doublequote,
                                  escapechar=self.escapechar,
                                  quotechar=self.quotechar)
-            if encoding == 'ascii':
+            if self.encoding == 'ascii':
                 self.writer = csvlib.writer(f, **writer_kwargs)
             else:
-                writer_kwargs['encoding'] = encoding
+                writer_kwargs['encoding'] = self.encoding
                 self.writer = UnicodeWriter(f, **writer_kwargs)
 
             self._save()
 
         finally:
             if is_zip:
-                # GH 17778 handles zip compression separately.
+                # GH17778 handles zip compression separately.
                 buf = f.getvalue()
                 if hasattr(self.path_or_buf, 'write'):
                     self.path_or_buf.write(buf)
                 else:
                     f, handles = _get_handle(self.path_or_buf, self.mode,
-                                             encoding=encoding,
+                                             encoding=self.encoding,
                                              compression=self.compression)
                     f.write(buf)
                     close = True

diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -28,7 +28,7 @@
 # interface to/from
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms',
-            default_handler=None, lines=False, compression=None,
+            default_handler=None, lines=False, compression='infer',
             index=True):
 
     if not index and orient not in ['split', 'table']:

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -1,19 +1,20 @@
 """
-    Tests for the pandas.io.common functionalities
+Tests for the pandas.io.common functionalities
 """
 import mmap
-import pytest
 import os
-from os.path import isabs
+
+import pytest
 
 import pandas as pd
-import pandas.util.testing as tm
+import pandas.io.common as icom
 import pandas.util._test_decorators as td
-
-from pandas.io import common
-from pandas.compat import is_platform_windows, StringIO, FileNotFoundError
-
-from pandas import read_csv, concat
+import pandas.util.testing as tm
+from pandas.compat import (
+    is_platform_windows,
+    StringIO,
+    FileNotFoundError,
+)
 
 
 class CustomFSPath(object):
@@ -55,36 +56,36 @@ class TestCommonIOCapabilities(object):
 
     def test_expand_user(self):
         filename = '~/sometest'
-        expanded_name = common._expand_user(filename)
+        expanded_name = icom._expand_user(filename)
 
         assert expanded_name != filename
-        assert isabs(expanded_name)
+        assert os.path.isabs(expanded_name)
         assert os.path.expanduser(filename) == expanded_name
 
     def test_expand_user_normal_path(self):
         filename = '/somefolder/sometest'
-        expanded_name = common._expand_user(filename)
+        expanded_name = icom._expand_user(filename)
 
         assert expanded_name == filename
         assert os.path.expanduser(filename) == expanded_name
 
     @td.skip_if_no('pathlib')
     def test_stringify_path_pathlib(self):
-        rel_path = common._stringify_path(Path('.'))
+        rel_path = icom._stringify_path(Path('.'))
         assert rel_path == '.'
-        redundant_path = common._stringify_path(Path('foo//bar'))
+        redundant_path = icom._stringify_path(Path('foo//bar'))
         assert redundant_path == os.path.join('foo', 'bar')
 
     @td.skip_if_no('py.path')
     def test_stringify_path_localpath(self):
         path = os.path.join('foo', 'bar')
         abs_path = os.path.abspath(path)
         lpath = LocalPath(path)
-        assert common._stringify_path(lpath) == abs_path
+        assert icom._stringify_path(lpath) == abs_path
 
     def test_stringify_path_fspath(self):
         p = CustomFSPath('foo/bar.csv')
-        result = common._stringify_path(p)
+        result = icom._stringify_path(p)
         assert result == 'foo/bar.csv'
 
     @pytest.mark.parametrize('extension,expected', [
@@ -97,36 +98,36 @@ def test_stringify_path_fspath(self):
     @pytest.mark.parametrize('path_type', path_types)
     def test_infer_compression_from_path(self, extension, expected, path_type):
         path = path_type('foo/bar.csv' + extension)
-        compression = common._infer_compression(path, compression='infer')
+        compression = icom._infer_compression(path, compression='infer')
         assert compression == expected
 
     def test_get_filepath_or_buffer_with_path(self):
         filename = '~/sometest'
-        filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
+        filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
             filename)
         assert filepath_or_buffer != filename
-        assert isabs(filepath_or_buffer)
+        assert os.path.isabs(filepath_or_buffer)
         assert os.path.expanduser(filename) == filepath_or_buffer
         assert not should_close
 
     def test_get_filepath_or_buffer_with_buffer(self):
         input_buffer = StringIO()
-        filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
+        filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
             input_buffer)
         assert filepath_or_buffer == input_buffer
         assert not should_close
 
     def test_iterator(self):
-        reader = read_csv(StringIO(self.data1), chunksize=1)
-        result = concat(reader, ignore_index=True)
-        expected = read_csv(StringIO(self.data1))
+        reader = pd.read_csv(StringIO(self.data1), chunksize=1)
+        result = pd.concat(reader, ignore_index=True)
+        expected = pd.read_csv(StringIO(self.data1))
         tm.assert_frame_equal(result, expected)
 
         # GH12153
-        it = read_csv(StringIO(self.data1), chunksize=1)
+        it = pd.read_csv(StringIO(self.data1), chunksize=1)
         first = next(it)
         tm.assert_frame_equal(first, expected.iloc[[0]])
-        tm.assert_frame_equal(concat(it), expected.iloc[1:])
+        tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
 
     @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
         (pd.read_csv, 'os', FileNotFoundError, 'csv'),
@@ -246,18 +247,18 @@ def test_constructor_bad_file(self, mmap_file):
             msg = "[Errno 22]"
             err = mmap.error
 
-        tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file)
+        tm.assert_raises_regex(err, msg, icom.MMapWrapper, non_file)
 
         target = open(mmap_file, 'r')
         target.close()
 
         msg = "I/O operation on closed file"
         tm.assert_raises_regex(
-            ValueError, msg, common.MMapWrapper, target)
+            ValueError, msg, icom.MMapWrapper, target)
 
     def test_get_attr(self, mmap_file):
         with open(mmap_file, 'r') as target:
-            wrapper = common.MMapWrapper(target)
+            wrapper = icom.MMapWrapper(target)
 
         attrs = dir(wrapper.mmap)
         attrs = [attr for attr in attrs
@@ -271,7 +272,7 @@ def test_get_attr(self, mmap_file):
 
     def test_next(self, mmap_file):
         with open(mmap_file, 'r') as target:
-            wrapper = common.MMapWrapper(target)
+            wrapper = icom.MMapWrapper(target)
             lines = target.readlines()
 
         for line in lines:
@@ -285,4 +286,4 @@ def test_unknown_engine(self):
             df = tm.makeDataFrame()
             df.to_csv(path)
             with tm.assert_raises_regex(ValueError, 'Unknown engine'):
-                read_csv(path, engine='pyt')
+                pd.read_csv(path, engine='pyt')