support binary file handles in to_csv (#35129)

pandas-dev · Aug 7, 2020 · 3b88446 · 3b88446
1 parent 1104f0d
commit 3b88446
Show file tree

Hide file tree

Showing 8 changed files with 158 additions and 60 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
    pd.read_csv('tmp.csv', parse_dates=[0])
    pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])
 
+Writing CSVs to binary file objects
++++++++++++++++++++++++++++++++++++
+
+.. versionadded:: 1.2.0
+
+``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object
+opened in binary mode. For this to work, it is necessary that ``mode``
+contains a "b":
+
+.. ipython:: python
+
+   import io
+
+   data = pd.DataFrame([0, 1, 2])
+   buffer = io.BytesIO()
+   data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
+
 .. _io.float_precision:
 
 Specifying method for floating-point conversion

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -13,6 +13,25 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_120.binary_handle_to_csv:
+
+Support for binary file handles in ``to_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`)
+with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`).
+``mode`` has to contain a ``b`` for binary handles to be supported.
+
+For example:
+
+.. ipython:: python
+
+   import io
+
+   data = pd.DataFrame([0, 1, 2])
+   buffer = io.BytesIO()
+   data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements
@@ -121,7 +140,7 @@ MultiIndex
 I/O
 ^^^
 
--
+- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
 -
 
 Plotting
@@ -167,4 +186,4 @@ Other
 .. _whatsnew_120.contributors:
 
 Contributors
-~~~~~~~~~~~~
+~~~~~~~~~~~~
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3021,13 +3021,18 @@ def to_csv(
         ----------
         path_or_buf : str or file handle, default None
             File path or object, if None is provided the result is returned as
-            a string.  If a file object is passed it should be opened with
-            `newline=''`, disabling universal newlines.
+            a string.  If a non-binary file object is passed, it should be opened
+            with `newline=''`, disabling universal newlines. If a binary
+            file object is passed, `mode` needs to contain a `'b'`.
 
             .. versionchanged:: 0.24.0
 
                Was previously named "path" for Series.
 
+            .. versionchanged:: 1.2.0
+
+               Support for binary file objects was introduced.
+
         sep : str, default ','
             String of length 1. Field delimiter for the output file.
         na_rep : str, default ''
@@ -3056,7 +3061,8 @@ def to_csv(
             Python write mode, default 'w'.
         encoding : str, optional
             A string representing the encoding to use in the output file,
-            defaults to 'utf-8'.
+            defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
+            is a non-binary file object.
         compression : str or dict, default 'infer'
             If str, represents compression mode. If dict, value at 'method' is
             the compression mode. Compression mode may be any of the following
@@ -3080,6 +3086,10 @@ def to_csv(
                supported for compression modes 'gzip' and 'bz2'
                as well as 'zip'.
 
+            .. versionchanged:: 1.2.0
+
+                Compression is supported for binary file objects.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
             then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -407,8 +407,9 @@ def get_handle(
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
-        whether file/buffer is in text format (csv, json, etc.), or in binary
-        mode (pickle, etc.).
+        Whether the content passed to the file/buffer is a string (True) or
+        bytes (False). This is not the same as `"b" not in mode`. If string
+        content is passed to a binary file/buffer, a wrapper is inserted.
     errors : str, default 'strict'
         Specifies how encoding and decoding errors are to be handled.
         See the errors argument for :func:`open` for a full list
@@ -449,14 +450,14 @@ def get_handle(
             if is_path:
                 f = gzip.open(path_or_buf, mode, **compression_args)
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
+                f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
                 f = bz2.BZ2File(path_or_buf, mode, **compression_args)
             else:
-                f = bz2.BZ2File(path_or_buf, **compression_args)
+                f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)
 
         # ZIP Compression
         elif compression == "zip":
@@ -489,10 +490,14 @@ def get_handle(
         handles.append(f)
 
     elif is_path:
-        if encoding:
+        # Check whether the filename is to be opened in binary mode.
+        # Binary mode does not support 'encoding' and 'newline'.
+        is_binary_mode = "b" in mode
+
+        if encoding and not is_binary_mode:
             # Encoding
             f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
-        elif is_text:
+        elif is_text and not is_binary_mode:
             # No explicit encoding
             f = open(path_or_buf, mode, errors="replace", newline="")
         else:

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -3,11 +3,10 @@
 """
 
 import csv as csvlib
-from io import StringIO
+from io import StringIO, TextIOWrapper
 import os
 from typing import Hashable, List, Mapping, Optional, Sequence, Union
 import warnings
-from zipfile import ZipFile
 
 import numpy as np
 
@@ -159,38 +158,29 @@ def save(self) -> None:
         """
         Create the writer & save.
         """
-        # GH21227 internal compression is not used when file-like passed.
-        if self.compression and hasattr(self.path_or_buf, "write"):
+        # GH21227 internal compression is not used for non-binary handles.
+        if (
+            self.compression
+            and hasattr(self.path_or_buf, "write")
+            and "b" not in self.mode
+        ):
             warnings.warn(
-                "compression has no effect when passing file-like object as input.",
+                "compression has no effect when passing a non-binary object as input.",
                 RuntimeWarning,
                 stacklevel=2,
             )
-
-        # when zip compression is called.
-        is_zip = isinstance(self.path_or_buf, ZipFile) or (
-            not hasattr(self.path_or_buf, "write") and self.compression == "zip"
+            self.compression = None
+
+        # get a handle or wrap an existing handle to take care of 1) compression and
+        # 2) text -> byte conversion
+        f, handles = get_handle(
+            self.path_or_buf,
+            self.mode,
+            encoding=self.encoding,
+            errors=self.errors,
+            compression=dict(self.compression_args, method=self.compression),
         )
 
-        if is_zip:
-            # zipfile doesn't support writing string to archive. uses string
-            # buffer to receive csv writing and dump into zip compression
-            # file handle. GH21241, GH21118
-            f = StringIO()
-            close = False
-        elif hasattr(self.path_or_buf, "write"):
-            f = self.path_or_buf
-            close = False
-        else:
-            f, handles = get_handle(
-                self.path_or_buf,
-                self.mode,
-                encoding=self.encoding,
-                errors=self.errors,
-                compression=dict(self.compression_args, method=self.compression),
-            )
-            close = True
-
         try:
             # Note: self.encoding is irrelevant here
             self.writer = csvlib.writer(
@@ -206,29 +196,23 @@ def save(self) -> None:
             self._save()
 
         finally:
-            if is_zip:
-                # GH17778 handles zip compression separately.
-                buf = f.getvalue()
-                if hasattr(self.path_or_buf, "write"):
-                    self.path_or_buf.write(buf)
-                else:
-                    compression = dict(self.compression_args, method=self.compression)
-
-                    f, handles = get_handle(
-                        self.path_or_buf,
-                        self.mode,
-                        encoding=self.encoding,
-                        errors=self.errors,
-                        compression=compression,
-                    )
-                    f.write(buf)
-                    close = True
-            if close:
+            if self.should_close:
                 f.close()
-                for _fh in handles:
-                    _fh.close()
-            elif self.should_close:
+            elif (
+                isinstance(f, TextIOWrapper)
+                and not f.closed
+                and f != self.path_or_buf
+                and hasattr(self.path_or_buf, "write")
+            ):
+                # get_handle wraps binary handles in a TextIOWrapper. TextIOWrapper
+                # closes the wrapped handle if it is not detached.
+                f.flush()  # make sure everything is written
+                f.detach()  # makes f unusable
+                del f
+            elif f != self.path_or_buf:
                 f.close()
+            for _fh in handles:
+                _fh.close()
 
     def _save_header(self):
         writer = self.writer

diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors):
             ser.to_csv(path, errors=errors)
         # No use in reading back the data as it is not the same anymore
         # due to the error handling
+
+    def test_to_csv_binary_handle(self):
+        """
+        Binary file objects should work if 'mode' contains a 'b'.
+
+        GH 35058 and GH 19827
+        """
+        df = tm.makeDataFrame()
+        with tm.ensure_clean() as path:
+            with open(path, mode="w+b") as handle:
+                df.to_csv(handle, mode="w+b")
+            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
+    def test_to_csv_encoding_binary_handle(self):
+        """
+        Binary file objects should honor a specified encoding.
+
+        GH 23854 and GH 13068 with binary handles
+        """
+        # example from GH 23854
+        content = "a, b, 🐟".encode("utf-8-sig")
+        buffer = io.BytesIO(content)
+        df = pd.read_csv(buffer, encoding="utf-8-sig")
+
+        buffer = io.BytesIO()
+        df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False)
+        buffer.seek(0)  # tests whether file handle wasn't closed
+        assert buffer.getvalue().startswith(content)
+
+        # example from GH 13068
+        with tm.ensure_clean() as path:
+            with open(path, "w+b") as handle:
+                pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig")
+
+                handle.seek(0)
+                assert handle.read().startswith(b'\xef\xbb\xbf""')
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -378,6 +378,17 @@ def test_unknown_engine(self):
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")
 
+    def test_binary_mode(self):
+        """
+        'encoding' shouldn't be passed to 'open' in binary mode.
+
+        GH 35058
+        """
+        with tm.ensure_clean() as path:
+            df = tm.makeDataFrame()
+            df.to_csv(path, mode="w+b")
+            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
 
 def test_is_fsspec_url():
     assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")

diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
@@ -114,6 +114,22 @@ def test_compression_warning(compression_only):
                 df.to_csv(f, compression=compression_only)
 
 
+def test_compression_binary(compression_only):
+    """
+    Binary file handles support compression.
+
+    GH22555
+    """
+    df = tm.makeDataFrame()
+    with tm.ensure_clean() as path:
+        with open(path, mode="wb") as file:
+            df.to_csv(file, mode="wb", compression=compression_only)
+            file.seek(0)  # file shouldn't be closed
+        tm.assert_frame_equal(
+            df, pd.read_csv(path, index_col=0, compression=compression_only)
+        )
+
+
 def test_with_missing_lzma():
     """Tests if import pandas works when lzma is not present."""
     # https://github.com/pandas-dev/pandas/issues/27575