Skip to content

Commit

Permalink
Backport PR #39376: REGR: write compressed pickle files with protocol=5 (#39428)
Browse files Browse the repository at this point in the history

Co-authored-by: Torsten Wörtwein <twoertwein@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and twoertwein authored Jan 27, 2021
1 parent 7cdff4e commit 06b4887
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
-
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
-

.. ---------------------------------------------------------------------------
Expand Down
14 changes: 13 additions & 1 deletion pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,19 @@ def to_pickle(
is_text=False,
storage_options=storage_options,
) as handles:
pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type]
if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
# some weird TypeError GH#39002 with pickle 5: fallback to letting
# pickle create the entire object and then write it to the buffer.
# "zip" would also be here if pandas.io.common._BytesZipFile
# wouldn't buffer write calls
handles.handle.write(
pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type]
)
else:
# letting pickle write directly to the buffer is more memory-efficient
pickle.dump(
obj, handles.handle, protocol=protocol # type: ignore[arg-type]
)


@doc(storage_options=generic._shared_docs["storage_options"])
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import bz2
import datetime
import functools
from functools import partial
import glob
import gzip
import io
Expand Down Expand Up @@ -588,3 +589,14 @@ def test_pickle_preserves_block_ndim():

# GH#37631 OP issue was about indexing, underlying problem was pickle
tm.assert_series_equal(res[[True]], ser)


@pytest.mark.parametrize(
    "protocol",
    [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL],
)
def test_pickle_big_dataframe_compression(protocol, compression):
    """Round-trip a 100000-row DataFrame through a compressed pickle file.

    Regression test for GH#39002. The frame is written with ``to_pickle``
    and read back with ``read_pickle`` using the same ``compression``, once
    for each parametrized pickle ``protocol``, and the result must equal
    the original frame.

    NOTE(review): ``compression`` is not parametrized here; presumably it is
    supplied by a pytest fixture defined elsewhere -- confirm.
    """
    frame = pd.DataFrame(range(100_000))

    # Bind the options up front so the round-trip helper only has to supply
    # the path argument to each callable.
    write_pickle = partial(
        frame.to_pickle, protocol=protocol, compression=compression
    )
    load_pickle = partial(pd.read_pickle, compression=compression)

    round_tripped = tm.round_trip_pathlib(write_pickle, load_pickle)

    tm.assert_frame_equal(frame, round_tripped)

0 comments on commit 06b4887

Please sign in to comment.