Skip to content

Commit

Permalink
DEPR: squeeze argument in read_csv/read_table/read_excel (#43427)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Sep 10, 2021
1 parent 6e19bdc commit cd61b59
Show file tree
Hide file tree
Showing 13 changed files with 106 additions and 47 deletions.
5 changes: 5 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,10 @@ Returning Series
Using the ``squeeze`` keyword, the parser will return output with a single column
as a ``Series``:

.. deprecated:: 1.4.0
Users should append ``.squeeze("columns")`` to the DataFrame returned by
``read_csv`` instead.

.. ipython:: python
:suppress:
Expand All @@ -1217,6 +1221,7 @@ as a ``Series``:
fh.write(data)
.. ipython:: python
:okwarning:
print(open("tmp.csv").read())
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ Other Deprecations
- Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
- Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`)
- Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
- Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`)

.. ---------------------------------------------------------------------------
Expand Down
10 changes: 7 additions & 3 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@
Returns a subset of the columns according to behavior above.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze
the data.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `object` to preserve data as stored in Excel and not interpret dtype.
Expand Down Expand Up @@ -337,7 +341,7 @@ def read_excel(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
engine=None,
converters=None,
Expand Down Expand Up @@ -481,7 +485,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
true_values=None,
false_values=None,
Expand Down Expand Up @@ -1243,7 +1247,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
converters=None,
true_values=None,
false_values=None,
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
"chunksize": None,
"verbose": False,
"encoding": None,
"squeeze": False,
"squeeze": None,
"compression": None,
"mangle_dupe_cols": True,
"infer_datetime_format": False,
Expand Down
1 change: 0 additions & 1 deletion pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class CParserWrapper(ParserBase):
def __init__(self, src: FilePathOrBuffer, **kwds):
self.kwds = kwds
kwds = kwds.copy()

ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)
Expand Down
26 changes: 20 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Appender,
deprecate_nonkeyword_arguments,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -131,6 +132,10 @@
parsing time and lower memory usage.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
the data.
prefix : str, optional
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
Expand Down Expand Up @@ -439,7 +444,11 @@
"low_memory",
}

_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None}
_deprecated_defaults: dict[str, Any] = {
"error_bad_lines": None,
"warn_bad_lines": None,
"squeeze": None,
}


def validate_integer(name, val, min_val=0):
Expand Down Expand Up @@ -552,7 +561,7 @@ def read_csv(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -650,7 +659,7 @@ def read_table(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -867,11 +876,12 @@ def __init__(self, f, engine=None, **kwds):

self.chunksize = options.pop("chunksize", None)
self.nrows = options.pop("nrows", None)
self.squeeze = options.pop("squeeze", False)

self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)

self.squeeze = self.options.pop("squeeze", False)

if "has_index_names" in kwds:
self.options["has_index_names"] = kwds["has_index_names"]

Expand Down Expand Up @@ -1050,7 +1060,7 @@ def _clean_options(self, options, engine):
f"The {arg} argument has been deprecated and will be "
"removed in a future version.\n\n"
)
warnings.warn(msg, FutureWarning, stacklevel=7)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
else:
result[arg] = parser_default

Expand Down Expand Up @@ -1100,6 +1110,10 @@ def _clean_options(self, options, engine):
result["na_values"] = na_values
result["na_fvalues"] = na_fvalues
result["skiprows"] = skiprows
# Default for squeeze is None since we need to check
# whether the user set it explicitly. We then set it to False
# to preserve the previous behavior.
result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]

return result, engine

Expand Down Expand Up @@ -1149,7 +1163,7 @@ def read(self, nrows=None):
self._currow += new_rows

if self.squeeze and len(df.columns) == 1:
return df[df.columns[0]].copy()
return df.squeeze("columns").copy()
return df

def get_chunk(self, size=None):
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,8 +1039,7 @@ def test_to_csv_compression(self, df, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_frame_equal(df, result)

# explicitly make sure file is compressed
Expand Down
27 changes: 17 additions & 10 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,18 +1194,25 @@ def test_read_excel_squeeze(self, read_ext):
# GH 12157
f = "test_squeeze" + read_ext

actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)
with tm.assert_produces_warning(
FutureWarning,
match="The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
):
actual = pd.read_excel(
f, sheet_name="two_columns", index_col=0, squeeze=True
)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)

def test_deprecated_kwargs(self, read_ext):
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
Expand Down
36 changes: 25 additions & 11 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def test_1000_sep(all_parsers):
tm.assert_frame_equal(result, expected)


def test_squeeze(all_parsers):
@pytest.mark.parametrize("squeeze", [True, False])
def test_squeeze(all_parsers, squeeze):
data = """\
a,1
b,2
Expand All @@ -138,13 +139,25 @@ def test_squeeze(all_parsers):
index = Index(["a", "b", "c"], name=0)
expected = Series([1, 2, 3], name=1, index=index)

result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True)
tm.assert_series_equal(result, expected)
result = parser.read_csv_check_warnings(
FutureWarning,
"The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
StringIO(data),
index_col=0,
header=None,
squeeze=squeeze,
)
if not squeeze:
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)

# see gh-8217
#
# Series should not be a view.
assert not result._is_view
# see gh-8217
#
# Series should not be a view.
assert not result._is_view


@xfail_pyarrow
Expand Down Expand Up @@ -847,12 +860,13 @@ def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines):
# GH 15122
parser = all_parsers
kwds = {f"{on_bad_lines}_bad_lines": False}
with tm.assert_produces_warning(
parser.read_csv_check_warnings(
FutureWarning,
match=f"The {on_bad_lines}_bad_lines argument has been deprecated "
f"The {on_bad_lines}_bad_lines argument has been deprecated "
"and will be removed in a future version.\n\n",
):
parser.read_csv(csv1, **kwds)
csv1,
**kwds,
)


def test_malformed_second_line(all_parsers):
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/io/parser/common/test_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from pandas import (
DataFrame,
Series,
concat,
)
import pandas._testing as tm
Expand Down Expand Up @@ -94,7 +93,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs):

def test_iteration_open_handle(all_parsers):
parser = all_parsers
kwargs = {"squeeze": True, "header": None}
kwargs = {"header": None}

with tm.ensure_clean() as path:
with open(path, "w") as f:
Expand All @@ -106,5 +105,5 @@ def test_iteration_open_handle(all_parsers):
break

result = parser.read_csv(f, **kwargs)
expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
tm.assert_series_equal(result, expected)
expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
tm.assert_frame_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/io/parser/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
read_csv,
read_table,
)
import pandas._testing as tm


class BaseParser:
Expand All @@ -27,6 +28,16 @@ def read_csv(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_csv(*args, **kwargs)

def read_csv_check_warnings(
self, warn_type: type[Warning], warn_msg: str, *args, **kwargs
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(warn_type, match=warn_msg):
return read_csv(*args, **kwargs)

def read_table(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_table(*args, **kwargs)
Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,14 @@ def test_series_compression_defaults_to_infer(
extension = icom._compression_to_extension[compression_only]
with tm.ensure_clean("compressed" + extension) as path:
getattr(input, write_method)(path, **write_kwargs)
output = read_method(path, compression=compression_only, **read_kwargs)
if "squeeze" in read_kwargs:
kwargs = read_kwargs.copy()
del kwargs["squeeze"]
output = read_method(path, compression=compression_only, **kwargs).squeeze(
"columns"
)
else:
output = read_method(path, compression=compression_only, **read_kwargs)
tm.assert_series_equal(output, input, check_names=False)


Expand Down
15 changes: 7 additions & 8 deletions pandas/tests/series/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@

class TestSeriesToCSV:
def read_csv(self, path, **kwargs):
params = {"squeeze": True, "index_col": 0, "header": None, "parse_dates": True}
params = {"index_col": 0, "header": None, "parse_dates": True}
params.update(**kwargs)

header = params.get("header")
out = pd.read_csv(path, **params)
out = pd.read_csv(path, **params).squeeze("columns")

if header is None:
out.name = out.index.name = None
Expand Down Expand Up @@ -138,8 +138,7 @@ def test_to_csv_compression(self, s, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_series_equal(s, result)

# test the round trip using file handle - to_csv -> read_csv
Expand All @@ -153,8 +152,7 @@ def test_to_csv_compression(self, s, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_series_equal(s, result)

# explicitly ensure file was compressed
Expand All @@ -164,7 +162,8 @@ def test_to_csv_compression(self, s, encoding, compression):

with tm.decompress_file(filename, compression) as fh:
tm.assert_series_equal(
s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding)
s,
pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"),
)

def test_to_csv_interval_index(self):
Expand All @@ -173,7 +172,7 @@ def test_to_csv_interval_index(self):

with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
s.to_csv(path, header=False)
result = self.read_csv(path, index_col=0, squeeze=True)
result = self.read_csv(path, index_col=0)

# can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
expected = s.copy()
Expand Down

0 comments on commit cd61b59

Please sign in to comment.