Skip to content

Commit

Permalink
BUG/REG: file-handle object handled incorrectly in to_csv (#21478)
Browse files Browse the repository at this point in the history
  • Loading branch information
minggli authored and jreback committed Jun 18, 2018
1 parent 076635a commit 91451cb
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 48 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
Fixed Regressions
~~~~~~~~~~~~~~~~~

-
- Fixed regression in :meth:`to_csv` where a file-like object passed as the output was handled incorrectly (:issue:`21471`)
-

.. _whatsnew_0232.performance:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,7 +1690,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3790,7 +3790,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
non-ascii, for python versions prior to 3
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
Expand Down
4 changes: 4 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def write(self, data):
    # Present a file-like write() interface on top of zipfile: ZipFile has
    # no write-stream API, so each write() call is routed to writestr(),
    # storing `data` under the archive member name kept in self.filename.
    super(BytesZipFile, self).writestr(self.filename, data)

@property
def closed(self):
    # Mirror the file-object protocol so callers can test `f.closed`:
    # ZipFile sets its underlying file pointer (self.fp) to None on
    # close(), so "fp is None" is equivalent to "closed".
    return self.fp is None


class MMapWrapper(BaseIterator):
"""
Expand Down
59 changes: 39 additions & 20 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

from __future__ import print_function

import warnings

import csv as csvlib
from zipfile import ZipFile
import numpy as np

from pandas.core.dtypes.missing import notna
from pandas.core.dtypes.inference import is_file_like
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)
Expand Down Expand Up @@ -128,19 +130,31 @@ def save(self):
else:
encoding = self.encoding

# PR 21300 uses string buffer to receive csv writing and dump into
# file-like output with compression as option. GH 21241, 21118
f = StringIO()
if not is_file_like(self.path_or_buf):
# path_or_buf is path
path_or_buf = self.path_or_buf
elif hasattr(self.path_or_buf, 'name'):
# path_or_buf is file handle
path_or_buf = self.path_or_buf.name
else:
# path_or_buf is file-like IO objects.
# GH 21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
msg = ("compression has no effect when passing file-like "
"object as input.")
warnings.warn(msg, RuntimeWarning, stacklevel=2)

# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, 'write')
and self.compression == 'zip')

if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH 21241, 21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
path_or_buf = None
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
close = True

try:
writer_kwargs = dict(lineterminator=self.line_terminator,
Expand All @@ -157,13 +171,18 @@ def save(self):
self._save()

finally:
# GH 17778 handles zip compression for byte strings separately.
buf = f.getvalue()
if path_or_buf:
f, handles = _get_handle(path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
if is_zip:
# GH 17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
Expand Down
16 changes: 9 additions & 7 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np

from pandas.compat import (lmap, range, lrange, StringIO, u)
from pandas.io.common import _get_handle
import pandas.core.common as com
from pandas.errors import ParserError
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
Expand Down Expand Up @@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
with ensure_clean() as filename:

df.to_csv(filename, compression=compression, encoding=encoding)

# test the round trip - to_csv -> read_csv
result = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
assert_frame_equal(df, result)

with open(filename, 'w') as fh:
df.to_csv(fh, compression=compression, encoding=encoding)

result_fh = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
df.to_csv(f, encoding=encoding)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_frame_equal(df, result)
assert_frame_equal(df, result_fh)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
18 changes: 9 additions & 9 deletions pandas/tests/series/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas import Series, DataFrame

from pandas.compat import StringIO, u
from pandas.io.common import _get_handle
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, ensure_clean)
import pandas.util.testing as tm
Expand Down Expand Up @@ -151,20 +152,19 @@ def test_to_csv_compression(self, s, encoding, compression):

s.to_csv(filename, compression=compression, encoding=encoding,
header=True)

# test the round trip - to_csv -> read_csv
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)

with open(filename, 'w') as fh:
s.to_csv(fh, compression=compression, encoding=encoding,
header=True)

result_fh = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0,
squeeze=True)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
s.to_csv(f, encoding=encoding, header=True)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)
assert_series_equal(s, result_fh)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
34 changes: 25 additions & 9 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas.compat import range, lmap
import pandas.core.common as com
from pandas.core import ops
from pandas.io.common import _get_handle
import pandas.util.testing as tm


Expand Down Expand Up @@ -246,19 +247,34 @@ def test_compression_size(obj, method, compression_only):
[12.32112, 123123.2, 321321.2]],
columns=['X', 'Y', 'Z']),
Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
@pytest.mark.parametrize('method', ['to_csv'])
@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
def test_compression_size_fh(obj, method, compression_only):

with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=compression_only)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=compression_only)
with f:
getattr(obj, method)(f)
assert not f.closed
assert f.closed
compressed = os.path.getsize(filename)
with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=None)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=None)
with f:
getattr(obj, method)(f)
assert not f.closed
assert f.closed
uncompressed = os.path.getsize(filename)
assert uncompressed > compressed


# GH 21227
def test_compression_warning(compression_only):
    # GH 21227: passing `compression` together with an already-open
    # file-like handle must emit a RuntimeWarning, because internal
    # compression is not applied when a file-like object is given
    # (the handle returned by _get_handle here already compresses).
    df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                          [12.32112, 123123.2, 321321.2]],
                   columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as filename:
        f, _handles = _get_handle(filename, 'w', compression=compression_only)
        # check_stacklevel=False: the warning is raised from inside
        # pandas.io.formats.csvs, not at the user's call site.
        with tm.assert_produces_warning(RuntimeWarning,
                                        check_stacklevel=False):
            with f:
                df.to_csv(f, compression=compression_only)

0 comments on commit 91451cb

Please sign in to comment.