diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 5e96587a326d9..95fbbaf5d566e 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index a5507259b7b6a..2dcbaf38fa51a 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -94,7 +94,19 @@ def to_pickle( is_text=False, storage_options=storage_options, ) as handles: - pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] + if handles.compression["method"] in ("bz2", "xz") and protocol >= 5: + # GH#39002: with pickle protocol 5, pickle.dump raises a TypeError on + # bz2/xz handles. Fall back to building the whole pickle in memory and
+ # writing it to the buffer in one call. "zip" would also be here if + # pandas.io.common._BytesZipFile did not buffer write calls + handles.handle.write( + pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type] + ) + else: + # letting pickle write directly to the buffer is more memory-efficient + pickle.dump( + obj, handles.handle, protocol=protocol # type: ignore[arg-type] + ) @doc(storage_options=generic._shared_docs["storage_options"]) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 34b36e2549b62..24844c4f2eb85 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,6 +13,7 @@ import bz2 import datetime import functools +from functools import partial import glob import gzip import io @@ -588,3 +589,14 @@ def test_pickle_preserves_block_ndim(): # GH#37631 OP issue was about indexing, underlying problem was pickle tm.assert_series_equal(res[[True]], ser) + + +@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL]) +def test_pickle_big_dataframe_compression(protocol, compression): + # GH#39002 + df = pd.DataFrame(range(100000)) + result = tm.round_trip_pathlib( + partial(df.to_pickle, protocol=protocol, compression=compression), + partial(pd.read_pickle, compression=compression), + ) + tm.assert_frame_equal(df, result)