From 91451cb7dbaaf6fb3f9bdfca73fe6adc2ee68cce Mon Sep 17 00:00:00 2001
From: Ming Li <14131823+minggli@users.noreply.github.com>
Date: Mon, 18 Jun 2018 23:45:25 +0100
Subject: [PATCH] BUG/REG: file-handle object handled incorrectly in to_csv
 (#21478)

---
 doc/source/whatsnew/v0.23.2.txt   |  2 +-
 pandas/core/frame.py              |  3 +-
 pandas/core/series.py             |  3 +-
 pandas/io/common.py               |  4 +++
 pandas/io/formats/csvs.py         | 59 ++++++++++++++++++++-----------
 pandas/tests/frame/test_to_csv.py | 16 +++++----
 pandas/tests/series/test_io.py    | 18 +++++-----
 pandas/tests/test_common.py       | 34 +++++++++++++-----
 8 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 94669c5b02410..67c7ce150132a 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
 Fixed Regressions
 ~~~~~~~~~~~~~~~~~
 
--
+- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
 -
 
 .. _whatsnew_0232.performance:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 02c86d2f4dcc8..a5dfbcc2a3142 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1690,7 +1690,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
         compression : string, optional
             A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
+            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
+            used when the first argument is a filename.
         line_terminator : string, default ``'\n'``
             The newline character or character sequence to use in the output
             file
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0450f28087f66..23c4bbe082f28 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -3790,7 +3790,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
             non-ascii, for python versions prior to 3
         compression : string, optional
             A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
+            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
+            used when the first argument is a filename.
         date_format: string, default None
             Format string for datetime objects.
         decimal: string, default '.'
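The docstring change above captures the user-facing rule: compression only takes effect when to_csv receives a filename, not an already-open handle. A minimal sketch of that distinction follows; the frame, file names and the gzip choice are illustrative assumptions, not taken from the patch.

# Sketch only: illustrates the documented rule that `compression` is applied
# when to_csv receives a path and ignored when it receives an open handle.
import gzip
import pandas as pd

df = pd.DataFrame({'X': [1, 2], 'Y': [3, 4]})

# Path given: pandas opens the file itself and applies gzip compression.
df.to_csv('data.csv.gz', compression='gzip')

# Handle given: the caller owns the file object, so compress it yourself;
# passing compression= here is ignored (and warns after this patch).
with gzip.open('data_fh.csv.gz', 'wt') as fh:
    df.to_csv(fh)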
diff --git a/pandas/io/common.py b/pandas/io/common.py
index a492b7c0b8e8e..ac9077f2db50e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
     def write(self, data):
         super(BytesZipFile, self).writestr(self.filename, data)
 
+    @property
+    def closed(self):
+        return self.fp is None
+
 
 class MMapWrapper(BaseIterator):
     """
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 7f660e2644fa4..60518f596e9af 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -5,11 +5,13 @@
 
 from __future__ import print_function
 
+import warnings
+
 import csv as csvlib
+from zipfile import ZipFile
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
-from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -128,19 +130,31 @@ def save(self):
         else:
             encoding = self.encoding
 
-        # PR 21300 uses string buffer to receive csv writing and dump into
-        # file-like output with compression as option. GH 21241, 21118
-        f = StringIO()
-        if not is_file_like(self.path_or_buf):
-            # path_or_buf is path
-            path_or_buf = self.path_or_buf
-        elif hasattr(self.path_or_buf, 'name'):
-            # path_or_buf is file handle
-            path_or_buf = self.path_or_buf.name
-        else:
-            # path_or_buf is file-like IO objects.
+        # GH 21227 internal compression is not used when file-like passed.
+        if self.compression and hasattr(self.path_or_buf, 'write'):
+            msg = ("compression has no effect when passing file-like "
+                   "object as input.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+        # when zip compression is called.
+        is_zip = isinstance(self.path_or_buf, ZipFile) or (
+            not hasattr(self.path_or_buf, 'write')
+            and self.compression == 'zip')
+
+        if is_zip:
+            # zipfile doesn't support writing string to archive. uses string
+            # buffer to receive csv writing and dump into zip compression
+            # file handle. GH 21241, 21118
+            f = StringIO()
+            close = False
+        elif hasattr(self.path_or_buf, 'write'):
             f = self.path_or_buf
-            path_or_buf = None
+            close = False
+        else:
+            f, handles = _get_handle(self.path_or_buf, self.mode,
+                                     encoding=encoding,
+                                     compression=self.compression)
+            close = True
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -157,13 +171,18 @@
             self._save()
 
         finally:
-            # GH 17778 handles zip compression for byte strings separately.
-            buf = f.getvalue()
-            if path_or_buf:
-                f, handles = _get_handle(path_or_buf, self.mode,
-                                         encoding=encoding,
-                                         compression=self.compression)
-                f.write(buf)
+            if is_zip:
+                # GH 17778 handles zip compression separately.
+                buf = f.getvalue()
+                if hasattr(self.path_or_buf, 'write'):
+                    self.path_or_buf.write(buf)
+                else:
+                    f, handles = _get_handle(self.path_or_buf, self.mode,
+                                             encoding=encoding,
+                                             compression=self.compression)
+                    f.write(buf)
+                    close = True
+            if close:
                 f.close()
                 for _fh in handles:
                     _fh.close()
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 60dc336a85388..3ad25ae73109e 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.compat import (lmap, range, lrange, StringIO, u)
+from pandas.io.common import _get_handle
 import pandas.core.common as com
 from pandas.errors import ParserError
 from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
@@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
 
         with ensure_clean() as filename:
             df.to_csv(filename, compression=compression, encoding=encoding)
-
             # test the round trip - to_csv -> read_csv
             result = read_csv(filename, compression=compression,
                               index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
 
-            with open(filename, 'w') as fh:
-                df.to_csv(fh, compression=compression, encoding=encoding)
-
-            result_fh = read_csv(filename, compression=compression,
-                                 index_col=0, encoding=encoding)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                df.to_csv(f, encoding=encoding)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_frame_equal(df, result)
-            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index f98962685ad9a..814d794d45c18 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -11,6 +11,7 @@
 from pandas import Series, DataFrame
 
 from pandas.compat import StringIO, u
+from pandas.io.common import _get_handle
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
@@ -151,20 +152,19 @@ def test_to_csv_compression(self, s, encoding, compression):
 
             s.to_csv(filename, compression=compression, encoding=encoding,
                      header=True)
-
             # test the round trip - to_csv -> read_csv
             result = pd.read_csv(filename, compression=compression,
                                  encoding=encoding, index_col=0, squeeze=True)
+            assert_series_equal(s, result)
 
-            with open(filename, 'w') as fh:
-                s.to_csv(fh, compression=compression, encoding=encoding,
-                         header=True)
-
-            result_fh = pd.read_csv(filename, compression=compression,
-                                    encoding=encoding, index_col=0,
-                                    squeeze=True)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                s.to_csv(f, encoding=encoding, header=True)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_series_equal(s, result)
-            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 7034e9ac2e0c8..ef5f13bfa504a 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -11,6 +11,7 @@
 from pandas.compat import range, lmap
 import pandas.core.common as com
 from pandas.core import ops
+from pandas.io.common import _get_handle
 import pandas.util.testing as tm
 
 
@@ -246,19 +247,34 @@ def test_compression_size(obj, method, compression_only):
                      [12.32112, 123123.2, 321321.2]],
               columns=['X', 'Y', 'Z']),
     Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
-@pytest.mark.parametrize('method', ['to_csv'])
+@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
 def test_compression_size_fh(obj, method, compression_only):
 
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=compression_only)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         compressed = os.path.getsize(filename)
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=None)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=None)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         uncompressed = os.path.getsize(filename)
     assert uncompressed > compressed
+
+
+# GH 21227
+def test_compression_warning(compression_only):
+    df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                          [12.32112, 123123.2, 321321.2]],
+                   columns=['X', 'Y', 'Z'])
+    with tm.ensure_clean() as filename:
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with tm.assert_produces_warning(RuntimeWarning,
+                                        check_stacklevel=False):
+            with f:
+                df.to_csv(f, compression=compression_only)
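A short sketch of the behaviour the new tests exercise, written against the same _get_handle helper the tests import; the file names and frame below are illustrative assumptions rather than part of the patch.

# Sketch only: mirrors the new tests. Writing through a handle returned by
# _get_handle streams already-compressed output, while passing compression=
# together with a file-like object is ignored and emits a RuntimeWarning.
import warnings
import pandas as pd
from pandas.io.common import _get_handle

df = pd.DataFrame({'X': [0.1, 0.2], 'Y': [1.0, 2.0]})

# to_csv writes into the gzip handle; no internal compression is attempted.
f, _handles = _get_handle('out.csv.gz', 'w', compression='gzip')
with f:
    df.to_csv(f)
roundtrip = pd.read_csv('out.csv.gz', compression='gzip', index_col=0)

# Combining compression= with an open handle now warns instead of silently
# doing nothing useful with the compression argument.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    with open('plain.csv', 'w') as fh:
        df.to_csv(fh, compression='gzip')
assert any(issubclass(w.category, RuntimeWarning) for w in caught)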