Skip to content

Commit

Permalink
support binary file handles in to_csv (#35129)
Browse files Browse the repository at this point in the history
  • Loading branch information
twoertwein authored Aug 7, 2020
1 parent 1104f0d commit 3b88446
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 60 deletions.
17 changes: 17 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
pd.read_csv('tmp.csv', parse_dates=[0])
pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])
Writing CSVs to binary file objects
+++++++++++++++++++++++++++++++++++

.. versionadded:: 1.2.0

``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object
opened in binary mode. For this to work, it is necessary that ``mode``
contains a "b":

.. ipython:: python

   import io
   data = pd.DataFrame([0, 1, 2])
   buffer = io.BytesIO()
   data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")

.. _io.float_precision:

Specifying method for floating-point conversion
Expand Down
23 changes: 21 additions & 2 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,25 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_120.binary_handle_to_csv:

Support for binary file handles in ``to_csv``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`)
with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`).
``mode`` has to contain a ``b`` for binary handles to be supported.

For example:

.. ipython:: python

   import io
   data = pd.DataFrame([0, 1, 2])
   buffer = io.BytesIO()
   data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")

.. _whatsnew_120.enhancements.other:

Other enhancements
Expand Down Expand Up @@ -121,7 +140,7 @@ MultiIndex
I/O
^^^

-
- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
-

Plotting
Expand Down Expand Up @@ -167,4 +186,4 @@ Other
.. _whatsnew_120.contributors:

Contributors
~~~~~~~~~~~~
~~~~~~~~~~~~
16 changes: 13 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3021,13 +3021,18 @@ def to_csv(
----------
path_or_buf : str or file handle, default None
File path or object, if None is provided the result is returned as
a string. If a file object is passed it should be opened with
`newline=''`, disabling universal newlines.
a string. If a non-binary file object is passed, it should be opened
with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` needs to contain a `'b'`.
.. versionchanged:: 0.24.0
Was previously named "path" for Series.
.. versionchanged:: 1.2.0
Support for binary file objects was introduced.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Expand Down Expand Up @@ -3056,7 +3061,8 @@ def to_csv(
Python write mode, default 'w'.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'.
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
compression : str or dict, default 'infer'
If str, represents compression mode. If dict, value at 'method' is
the compression mode. Compression mode may be any of the following
Expand All @@ -3080,6 +3086,10 @@ def to_csv(
supported for compression modes 'gzip' and 'bz2'
as well as 'zip'.
.. versionchanged:: 1.2.0
Compression is supported for binary file objects.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
Expand Down
17 changes: 11 additions & 6 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,9 @@ def get_handle(
memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.).
Whether the type of the content passed to the file/buffer is string or
bytes. This is not the same as `"b" not in mode`. If a string content is
passed to a binary file/buffer, a wrapper is inserted.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
Expand Down Expand Up @@ -449,14 +450,14 @@ def get_handle(
if is_path:
f = gzip.open(path_or_buf, mode, **compression_args)
else:
f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)

# BZ Compression
elif compression == "bz2":
if is_path:
f = bz2.BZ2File(path_or_buf, mode, **compression_args)
else:
f = bz2.BZ2File(path_or_buf, **compression_args)
f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

# ZIP Compression
elif compression == "zip":
Expand Down Expand Up @@ -489,10 +490,14 @@ def get_handle(
handles.append(f)

elif is_path:
if encoding:
# Check whether the filename is to be opened in binary mode.
# Binary mode does not support 'encoding' and 'newline'.
is_binary_mode = "b" in mode

if encoding and not is_binary_mode:
# Encoding
f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
elif is_text:
elif is_text and not is_binary_mode:
# No explicit encoding
f = open(path_or_buf, mode, errors="replace", newline="")
else:
Expand Down
82 changes: 33 additions & 49 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
"""

import csv as csvlib
from io import StringIO
from io import StringIO, TextIOWrapper
import os
from typing import Hashable, List, Mapping, Optional, Sequence, Union
import warnings
from zipfile import ZipFile

import numpy as np

Expand Down Expand Up @@ -159,38 +158,29 @@ def save(self) -> None:
"""
Create the writer & save.
"""
# GH21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, "write"):
# GH21227 internal compression is not used for non-binary handles.
if (
self.compression
and hasattr(self.path_or_buf, "write")
and "b" not in self.mode
):
warnings.warn(
"compression has no effect when passing file-like object as input.",
"compression has no effect when passing a non-binary object as input.",
RuntimeWarning,
stacklevel=2,
)

# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, "write") and self.compression == "zip"
self.compression = None

# get a handle or wrap an existing handle to take care of 1) compression and
# 2) text -> byte conversion
f, handles = get_handle(
self.path_or_buf,
self.mode,
encoding=self.encoding,
errors=self.errors,
compression=dict(self.compression_args, method=self.compression),
)

if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH21241, GH21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, "write"):
f = self.path_or_buf
close = False
else:
f, handles = get_handle(
self.path_or_buf,
self.mode,
encoding=self.encoding,
errors=self.errors,
compression=dict(self.compression_args, method=self.compression),
)
close = True

try:
# Note: self.encoding is irrelevant here
self.writer = csvlib.writer(
Expand All @@ -206,29 +196,23 @@ def save(self) -> None:
self._save()

finally:
if is_zip:
# GH17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, "write"):
self.path_or_buf.write(buf)
else:
compression = dict(self.compression_args, method=self.compression)

f, handles = get_handle(
self.path_or_buf,
self.mode,
encoding=self.encoding,
errors=self.errors,
compression=compression,
)
f.write(buf)
close = True
if close:
if self.should_close:
f.close()
for _fh in handles:
_fh.close()
elif self.should_close:
elif (
isinstance(f, TextIOWrapper)
and not f.closed
and f != self.path_or_buf
and hasattr(self.path_or_buf, "write")
):
# get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper
# closes the wrapped handle if it is not detached.
f.flush() # make sure everything is written
f.detach() # makes f unusable
del f
elif f != self.path_or_buf:
f.close()
for _fh in handles:
_fh.close()

def _save_header(self):
writer = self.writer
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors):
ser.to_csv(path, errors=errors)
# No use in reading back the data as it is not the same anymore
# due to the error handling

def test_to_csv_binary_handle(self):
    """
    Round-trip a frame through a file object opened in binary mode.

    The handle is opened with "w+b" and the same mode is passed to
    ``to_csv``; the file is then read back and compared with the input.
    GH 35058 and GH 19827
    """
    expected = tm.makeDataFrame()
    with tm.ensure_clean() as path:
        with open(path, mode="w+b") as binary_handle:
            expected.to_csv(binary_handle, mode="w+b")
        # Read back only after the handle has been closed by the ``with``.
        result = pd.read_csv(path, index_col=0)
        tm.assert_frame_equal(expected, result)

def test_to_csv_encoding_binary_handle(self):
    """
    Check that ``encoding`` is applied when writing to binary file objects.

    GH 23854 and GH 13068 with binary handles
    """
    codec = "utf-8-sig"

    # Case from GH 23854: parse encoded bytes, write them back out, and
    # compare the written bytes against the original content.
    expected_bytes = "a, b, 🐟".encode(codec)
    frame = pd.read_csv(io.BytesIO(expected_bytes), encoding=codec)

    out_buffer = io.BytesIO()
    frame.to_csv(out_buffer, mode="w+b", encoding=codec, index=False)
    # Seeking also verifies that to_csv left the caller's buffer open.
    out_buffer.seek(0)
    written = out_buffer.getvalue()
    assert written.startswith(expected_bytes)

    # Case from GH 13068: an empty frame written to a real binary file
    # must start with the BOM followed by the quoted empty header.
    with tm.ensure_clean() as path:
        with open(path, "w+b") as binary_file:
            empty_frame = pd.DataFrame()
            empty_frame.to_csv(binary_file, mode="w+b", encoding=codec)

            binary_file.seek(0)
            head = binary_file.read()
            assert head.startswith(b'\xef\xbb\xbf""')
11 changes: 11 additions & 0 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,17 @@ def test_unknown_engine(self):
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")

def test_binary_mode(self):
    """
    Writing to a path with a binary ``mode`` must round-trip.

    'encoding' shouldn't be passed to 'open' in binary mode.
    GH 35058
    """
    with tm.ensure_clean() as path:
        expected = tm.makeDataFrame()
        expected.to_csv(path, mode="w+b")
        result = pd.read_csv(path, index_col=0)
        tm.assert_frame_equal(expected, result)


def test_is_fsspec_url():
assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,22 @@ def test_compression_warning(compression_only):
df.to_csv(f, compression=compression_only)


def test_compression_binary(compression_only):
    """
    Compressed output can be written through a binary file handle.

    The frame is written with the given compression to a handle opened
    with "wb", then read back from the path with the same compression.
    GH22555
    """
    expected = tm.makeDataFrame()
    with tm.ensure_clean() as path:
        with open(path, mode="wb") as binary_file:
            expected.to_csv(binary_file, mode="wb", compression=compression_only)
            binary_file.seek(0)  # file shouldn't be closed
        result = pd.read_csv(path, index_col=0, compression=compression_only)
        tm.assert_frame_equal(expected, result)


def test_with_missing_lzma():
"""Tests if import pandas works when lzma is not present."""
# https://github.com/pandas-dev/pandas/issues/27575
Expand Down

0 comments on commit 3b88446

Please sign in to comment.