Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REGR: to_csv created corrupt ZIP files when chunksize<rows #38728

Merged
merged 1 commit into from
Dec 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
- :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`)
-

.. ---------------------------------------------------------------------------
Expand Down
29 changes: 24 additions & 5 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from collections import abc
import dataclasses
import gzip
from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper
from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper
import mmap
import os
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union, cast
from urllib.parse import (
urljoin,
urlparse as parse_url,
Expand Down Expand Up @@ -713,17 +713,36 @@ def __init__(
archive_name: Optional[str] = None,
**kwargs,
):
if mode in ["wb", "rb"]:
mode = mode.replace("b", "")
mode = mode.replace("b", "")
self.archive_name = archive_name
self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None

kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
kwargs_zip.update(kwargs)

super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]

def write(self, data):
# buffer multiple write calls, write on flush
if self.multiple_write_buffer is None:
self.multiple_write_buffer = (
BytesIO() if isinstance(data, bytes) else StringIO()
)
self.multiple_write_buffer.write(data)

def flush(self) -> None:
# write to actual handle and close write buffer
if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
return

# ZipFile needs a non-empty string
archive_name = self.archive_name or self.filename or "zip"
super().writestr(archive_name, data)
with self.multiple_write_buffer:
super().writestr(archive_name, self.multiple_write_buffer.getvalue())

def close(self):
self.flush()
super().close()

@property
def closed(self):
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,25 @@ def test_to_csv_encoding_binary_handle(self, mode):

handle.seek(0)
assert handle.read().startswith(b'\xef\xbb\xbf""')


def test_to_csv_iterative_compression_name(compression):
# GH 38714
df = tm.makeDataFrame()
with tm.ensure_clean() as path:
df.to_csv(path, compression=compression, chunksize=1)
tm.assert_frame_equal(
pd.read_csv(path, compression=compression, index_col=0), df
)


def test_to_csv_iterative_compression_buffer(compression):
# GH 38714
df = tm.makeDataFrame()
with io.BytesIO() as buffer:
df.to_csv(buffer, compression=compression, chunksize=1)
buffer.seek(0)
tm.assert_frame_equal(
pd.read_csv(buffer, compression=compression, index_col=0), df
)
assert not buffer.closed