From 67b087f5c07cf624595caca6c3d290503dbf92ff Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 3 Nov 2020 20:18:25 +0100
Subject: [PATCH 01/21] ERR: fix error message in Period for invalid frequency (#37602)

---
 pandas/_libs/tslibs/period.pyx            | 2 +-
 pandas/tests/scalar/period/test_period.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index b1f9ff71f5faa..b817d80c64ccd 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -2438,7 +2438,7 @@ cpdef int freq_to_dtype_code(BaseOffset freq) except? -1:
     try:
         return freq._period_dtype_code
     except AttributeError as err:
-        raise ValueError(INVALID_FREQ_ERR_MSG) from err
+        raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err


 cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day,
diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index f150e5e5b18b2..46bc6421c2070 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -1554,3 +1554,9 @@ def test_negone_ordinals():
         repr(period)
     period = Period(ordinal=-1, freq="W")
     repr(period)
+
+
+def test_invalid_frequency_error_message():
+    msg = "Invalid frequency: <WeekOfMonth: week=0, weekday=0>"
+    with pytest.raises(ValueError, match=msg):
+        Period("2012-01-02", freq="WOM-1MON")

From 7d40d3ea53da635f4074ef98f84a3a8c6aa24166 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 3 Nov 2020 15:09:05 -0800
Subject: [PATCH 02/21] CLN: remove rebox_native (#37608)

---
 pandas/core/arrays/datetimelike.py       | 5 +++--
 pandas/core/arrays/datetimes.py          | 9 +++------
 pandas/core/arrays/period.py             | 8 ++------
 pandas/core/arrays/timedeltas.py         | 8 ++------
 pandas/tests/arrays/test_datetimelike.py | 5 +++--
 5 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 579719d8bac3b..1955a96160a4a 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -151,7 +151,9 @@ def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]:
         """
         raise AbstractMethodError(cls)

-    def _unbox_scalar(self, value: DTScalarOrNaT, setitem: bool = False) -> int:
+    def _unbox_scalar(
+        self, value: DTScalarOrNaT, setitem: bool = False
+    ) -> Union[np.int64, np.datetime64, np.timedelta64]:
         """
         Unbox the integer value of a scalar `value`.

@@ -636,7 +638,6 @@ def _unbox(
         """
         if lib.is_scalar(other):
             other = self._unbox_scalar(other, setitem=setitem)
-            other = self._rebox_native(other)
         else:
             # same type as self
             self._check_compatible_with(other, setitem=setitem)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index b05271552f117..f655d10881011 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -454,16 +454,13 @@ def _generate_range(
     # -----------------------------------------------------------------
     # DatetimeLike Interface

-    @classmethod
-    def _rebox_native(cls, value: int) -> np.datetime64:
-        return np.int64(value).view("M8[ns]")
-
-    def _unbox_scalar(self, value, setitem: bool = False):
+    def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64:
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timestamp.")
         if not isna(value):
             self._check_compatible_with(value, setitem=setitem)
-        return value.value
+            return value.asm8
+        return np.datetime64(value.value, "ns")

     def _scalar_from_string(self, value):
         return Timestamp(value, tz=self.tz)
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index b95a7acc19b1f..d808ade53ad33 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -260,18 +260,14 @@ def _generate_range(cls, start, end, periods, freq, fields):
     # -----------------------------------------------------------------
     # DatetimeLike Interface

-    @classmethod
-    def _rebox_native(cls, value: int) -> np.int64:
-        return np.int64(value)
-
     def _unbox_scalar(
         self, value: Union[Period, NaTType], setitem: bool = False
     ) -> int:
         if value is NaT:
-            return value.value
+            return np.int64(value.value)
         elif isinstance(value, self._scalar_type):
             self._check_compatible_with(value, setitem=setitem)
-            return value.ordinal
+            return np.int64(value.ordinal)
         else:
             raise ValueError(f"'value' should be a Period. Got '{value}' instead.")

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index e5b56ae80b578..e4a844fd4c6ef 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -301,15 +301,11 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
     # ----------------------------------------------------------------
     # DatetimeLike Interface

-    @classmethod
-    def _rebox_native(cls, value: int) -> np.timedelta64:
-        return np.int64(value).view("m8[ns]")
-
-    def _unbox_scalar(self, value, setitem: bool = False):
+    def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64:
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timedelta.")
         self._check_compatible_with(value, setitem=setitem)
-        return value.value
+        return np.timedelta64(value.value, "ns")

     def _scalar_from_string(self, value):
         return Timedelta(value)
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index b9298e9dec5b5..ec20c829f1544 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -191,10 +191,11 @@ def test_unbox_scalar(self):
         data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9
         arr = self.array_cls(data, freq="D")
         result = arr._unbox_scalar(arr[0])
-        assert isinstance(result, int)
+        expected = arr._data.dtype.type
+        assert isinstance(result, expected)

         result = arr._unbox_scalar(pd.NaT)
-        assert isinstance(result, int)
+        assert isinstance(result, expected)

         msg = f"'value' should be a {self.dtype.__name__}."
         with pytest.raises(ValueError, match=msg):

From 93e3477617531e1006eb98e87ddb7cbf1fb21797 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 3 Nov 2020 17:50:29 -0800
Subject: [PATCH 03/21] TST/REF: tests.generic (#37618)

---
 pandas/tests/frame/methods/test_equals.py              |  57 ++++++++-
 pandas/tests/frame/methods/test_head_tail.py           |  24 ++++
 .../generic/methods/test_first_valid_index.py          |   5 +-
 pandas/tests/generic/methods/test_pipe.py              |  15 +--
 .../generic/methods/test_reorder_levels.py             |  11 +-
 pandas/tests/generic/methods/test_sample.py            |  10 +-
 pandas/tests/generic/test_generic.py                   | 111 +++---------------
 7 files changed, 112 insertions(+), 121 deletions(-)

diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py
index c024390297fec..de2509ed91be2 100644
--- a/pandas/tests/frame/methods/test_equals.py
+++ b/pandas/tests/frame/methods/test_equals.py
@@ -1,4 +1,6 @@
-from pandas import DataFrame
+import numpy as np
+
+from pandas import DataFrame, date_range
 import pandas._testing as tm


@@ -21,3 +23,56 @@ def test_equals_different_blocks(self):
         tm.assert_frame_equal(df0, df1)
         assert df0.equals(df1)
         assert df1.equals(df0)
+
+    def test_equals(self):
+        # Add object dtype column with nans
+        index = np.random.random(10)
+        df1 = DataFrame(np.random.random(10), index=index, columns=["floats"])
+        df1["text"] = "the sky is so blue. we could use more chocolate.".split()
+        df1["start"] = date_range("2000-1-1", periods=10, freq="T")
+        df1["end"] = date_range("2000-1-1", periods=10, freq="D")
+        df1["diff"] = df1["end"] - df1["start"]
+        df1["bool"] = np.arange(10) % 3 == 0
+        df1.loc[::2] = np.nan
+        df2 = df1.copy()
+        assert df1["text"].equals(df2["text"])
+        assert df1["start"].equals(df2["start"])
+        assert df1["end"].equals(df2["end"])
+        assert df1["diff"].equals(df2["diff"])
+        assert df1["bool"].equals(df2["bool"])
+        assert df1.equals(df2)
+        assert not df1.equals(object)
+
+        # different dtype
+        different = df1.copy()
+        different["floats"] = different["floats"].astype("float32")
+        assert not df1.equals(different)
+
+        # different index
+        different_index = -index
+        different = df2.set_index(different_index)
+        assert not df1.equals(different)
+
+        # different columns
+        different = df2.copy()
+        different.columns = df2.columns[::-1]
+        assert not df1.equals(different)
+
+        # DatetimeIndex
+        index = date_range("2000-1-1", periods=10, freq="T")
+        df1 = df1.set_index(index)
+        df2 = df1.copy()
+        assert df1.equals(df2)
+
+        # MultiIndex
+        df3 = df1.set_index(["text"], append=True)
+        df2 = df1.set_index(["text"], append=True)
+        assert df3.equals(df2)
+
+        df2 = df1.set_index(["floats"], append=True)
+        assert not df3.equals(df2)
+
+        # NaN in index
+        df3 = df1.set_index(["floats"], append=True)
+        df2 = df1.set_index(["floats"], append=True)
+        assert df3.equals(df2)
diff --git a/pandas/tests/frame/methods/test_head_tail.py b/pandas/tests/frame/methods/test_head_tail.py
index 93763bc12ce0d..fa28f7d3e16a2 100644
--- a/pandas/tests/frame/methods/test_head_tail.py
+++ b/pandas/tests/frame/methods/test_head_tail.py
@@ -4,6 +4,30 @@
 import pandas._testing as tm


+def test_head_tail_generic(index, frame_or_series):
+    # GH#5370
+
+    ndim = 2 if frame_or_series is DataFrame else 1
+    shape = (len(index),) * ndim
+    vals = np.random.randn(*shape)
+    obj = frame_or_series(vals, index=index)
+
+    tm.assert_equal(obj.head(), obj.iloc[:5])
+    tm.assert_equal(obj.tail(), obj.iloc[-5:])
+
+    # 0-len
+    tm.assert_equal(obj.head(0), obj.iloc[0:0])
+    tm.assert_equal(obj.tail(0), obj.iloc[0:0])
+
+    # bounded
+    tm.assert_equal(obj.head(len(obj) + 1), obj)
+    tm.assert_equal(obj.tail(len(obj) + 1), obj)
+
+    # neg index
+    tm.assert_equal(obj.head(-3), obj.head(len(index) - 3))
+    tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3))
+
+
 def test_head_tail(float_frame):
     tm.assert_frame_equal(float_frame.head(), float_frame[:5])
     tm.assert_frame_equal(float_frame.tail(), float_frame[-5:])
diff --git a/pandas/tests/generic/methods/test_first_valid_index.py b/pandas/tests/generic/methods/test_first_valid_index.py
index bca3452c3c458..8d021f0e3954e 100644
--- a/pandas/tests/generic/methods/test_first_valid_index.py
+++ b/pandas/tests/generic/methods/test_first_valid_index.py
@@ -9,10 +9,9 @@ class TestFirstValidIndex:
-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_first_valid_index_single_nan(self, klass):
+    def test_first_valid_index_single_nan(self, frame_or_series):
         # GH#9752 Series/DataFrame should both return None, not raise
-        obj = klass([np.nan])
+        obj = frame_or_series([np.nan])
         assert obj.first_valid_index() is None
         assert obj.iloc[:0].first_valid_index() is None
diff --git a/pandas/tests/generic/methods/test_pipe.py b/pandas/tests/generic/methods/test_pipe.py
index 59e5edc4b8bb5..b378600634bf0 100644
--- a/pandas/tests/generic/methods/test_pipe.py
+++ b/pandas/tests/generic/methods/test_pipe.py
@@ -5,11 +5,10 @@


 class TestPipe:
-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_pipe(self, klass):
+    def test_pipe(self, frame_or_series):
         obj = DataFrame({"A": [1, 2, 3]})
         expected = DataFrame({"A": [1, 4, 9]})
-        if klass is Series:
+        if frame_or_series is Series:
             obj = obj["A"]
             expected = expected["A"]

@@ -17,20 +16,18 @@ def test_pipe(self, klass):
         result = obj.pipe(f, 2)
         tm.assert_equal(result, expected)

-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_pipe_tuple(self, klass):
+    def test_pipe_tuple(self, frame_or_series):
         obj = DataFrame({"A": [1, 2, 3]})
-        if klass is Series:
+        if frame_or_series is Series:
             obj = obj["A"]

         f = lambda x, y: y
         result = obj.pipe((f, "y"), 0)
         tm.assert_equal(result, obj)

-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_pipe_tuple_error(self, klass):
+    def test_pipe_tuple_error(self, frame_or_series):
         obj = DataFrame({"A": [1, 2, 3]})
-        if klass is Series:
+        if frame_or_series is Series:
             obj = obj["A"]

         f = lambda x, y: y
diff --git a/pandas/tests/generic/methods/test_reorder_levels.py b/pandas/tests/generic/methods/test_reorder_levels.py
index 8bb6417e56659..6bfbf089a6108 100644
--- a/pandas/tests/generic/methods/test_reorder_levels.py
+++ b/pandas/tests/generic/methods/test_reorder_levels.py
@@ -1,20 +1,19 @@
 import numpy as np
 import pytest

-from pandas import DataFrame, MultiIndex, Series
+from pandas import DataFrame, MultiIndex
 import pandas._testing as tm


 class TestReorderLevels:
-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_reorder_levels(self, klass):
+    def test_reorder_levels(self, frame_or_series):
         index = MultiIndex(
             levels=[["bar"], ["one", "two", "three"], [0, 1]],
             codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
             names=["L0", "L1", "L2"],
         )
         df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index)
-        obj = df if klass is DataFrame else df["A"]
+        obj = df if frame_or_series is DataFrame else df["A"]

         # no change, position
         result = obj.reorder_levels([0, 1, 2])
@@ -32,7 +31,7 @@ def test_reorder_levels(self, klass):
             names=["L1", "L2", "L0"],
         )
         expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
-        expected = expected if klass is DataFrame else expected["A"]
+        expected = expected if frame_or_series is DataFrame else expected["A"]
         tm.assert_equal(result, expected)

         result = obj.reorder_levels([0, 0, 0])
@@ -42,7 +41,7 @@ def test_reorder_levels(self, klass):
             names=["L0", "L0", "L0"],
         )
         expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
-        expected = expected if klass is DataFrame else expected["A"]
+        expected = expected if frame_or_series is DataFrame else expected["A"]
         tm.assert_equal(result, expected)

         result = obj.reorder_levels(["L0", "L0", "L0"])
diff --git a/pandas/tests/generic/methods/test_sample.py b/pandas/tests/generic/methods/test_sample.py
index 7303dad9170ed..b26a3785f918d 100644
--- a/pandas/tests/generic/methods/test_sample.py
+++ b/pandas/tests/generic/methods/test_sample.py
@@ -155,22 +155,20 @@ def test_sample_none_weights(self, obj):
             ),
         ],
     )
-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_sample_random_state(self, func_str, arg, klass):
+    def test_sample_random_state(self, func_str, arg, frame_or_series):
         # GH#32503
         obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
-        if klass is Series:
+        if frame_or_series is Series:
             obj = obj["col1"]
         result = obj.sample(n=3, random_state=eval(func_str)(arg))
         expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
         tm.assert_equal(result, expected)

-    @pytest.mark.parametrize("klass", [Series, DataFrame])
-    def test_sample_upsampling_without_replacement(self, klass):
+    def test_sample_upsampling_without_replacement(self, frame_or_series):
         # GH#27451
         obj = DataFrame({"A": list("abc")})
-        if klass is Series:
+        if frame_or_series is Series:
             obj = obj["A"]

         msg = (
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
index 45601abc95fe6..930c48cbdc214 100644
--- a/pandas/tests/generic/test_generic.py
+++ b/pandas/tests/generic/test_generic.py
@@ -5,8 +5,7 @@

 from pandas.core.dtypes.common import is_scalar

-import pandas as pd
-from pandas import DataFrame, Series, date_range
+from pandas import DataFrame, Series
 import pandas._testing as tm

 # ----------------------------------------------------------------------
@@ -248,31 +247,6 @@ def test_metadata_propagation(self):
         self.check_metadata(v1 & v2)
         self.check_metadata(v1 | v2)

-    def test_head_tail(self, index):
-        # GH5370
-
-        o = self._construct(shape=len(index))
-
-        axis = o._get_axis_name(0)
-        setattr(o, axis, index)
-
-        o.head()
-
-        self._compare(o.head(), o.iloc[:5])
-        self._compare(o.tail(), o.iloc[-5:])
-
-        # 0-len
-        self._compare(o.head(0), o.iloc[0:0])
-        self._compare(o.tail(0), o.iloc[0:0])
-
-        # bounded
-        self._compare(o.head(len(o) + 1), o)
-        self._compare(o.tail(len(o) + 1), o)
-
-        # neg index
-        self._compare(o.head(-3), o.head(len(index) - 3))
-        self._compare(o.tail(-3), o.tail(len(index) - 3))
-
     def test_size_compat(self):
         # GH8846
         # size property should be defined
@@ -460,77 +434,23 @@ def test_take_invalid_kwargs(self):
             obj.take(indices, mode="clip")

     @pytest.mark.parametrize("is_copy", [True, False])
-    def test_depr_take_kwarg_is_copy(self, is_copy):
+    def test_depr_take_kwarg_is_copy(self, is_copy, frame_or_series):
         # GH 27357
-        df = DataFrame({"A": [1, 2, 3]})
+        obj = DataFrame({"A": [1, 2, 3]})
+        if frame_or_series is Series:
+            obj = obj["A"]
+
         msg = (
             "is_copy is deprecated and will be removed in a future version. "
             "'take' always returns a copy, so there is no need to specify this."
         )
         with tm.assert_produces_warning(FutureWarning) as w:
-            df.take([0, 1], is_copy=is_copy)
+            obj.take([0, 1], is_copy=is_copy)

         assert w[0].message.args[0] == msg

-        s = Series([1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            s.take([0, 1], is_copy=is_copy)
-
-    def test_equals(self):
-        # Add object dtype column with nans
-        index = np.random.random(10)
-        df1 = DataFrame(np.random.random(10), index=index, columns=["floats"])
-        df1["text"] = "the sky is so blue. we could use more chocolate.".split()
-        df1["start"] = date_range("2000-1-1", periods=10, freq="T")
-        df1["end"] = date_range("2000-1-1", periods=10, freq="D")
-        df1["diff"] = df1["end"] - df1["start"]
-        df1["bool"] = np.arange(10) % 3 == 0
-        df1.loc[::2] = np.nan
-        df2 = df1.copy()
-        assert df1["text"].equals(df2["text"])
-        assert df1["start"].equals(df2["start"])
-        assert df1["end"].equals(df2["end"])
-        assert df1["diff"].equals(df2["diff"])
-        assert df1["bool"].equals(df2["bool"])
-        assert df1.equals(df2)
-        assert not df1.equals(object)
-
-        # different dtype
-        different = df1.copy()
-        different["floats"] = different["floats"].astype("float32")
-        assert not df1.equals(different)
-
-        # different index
-        different_index = -index
-        different = df2.set_index(different_index)
-        assert not df1.equals(different)
-
-        # different columns
-        different = df2.copy()
-        different.columns = df2.columns[::-1]
-        assert not df1.equals(different)
-
-        # DatetimeIndex
-        index = pd.date_range("2000-1-1", periods=10, freq="T")
-        df1 = df1.set_index(index)
-        df2 = df1.copy()
-        assert df1.equals(df2)
-
-        # MultiIndex
-        df3 = df1.set_index(["text"], append=True)
-        df2 = df1.set_index(["text"], append=True)
-        assert df3.equals(df2)
-
-        df2 = df1.set_index(["floats"], append=True)
-        assert not df3.equals(df2)
-
-        # NaN in index
-        df3 = df1.set_index(["floats"], append=True)
-        df2 = df1.set_index(["floats"], append=True)
-        assert df3.equals(df2)
-
-    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
-    def test_axis_classmethods(self, box):
+    def test_axis_classmethods(self, frame_or_series):
+        box = frame_or_series
         obj = box(dtype=object)
         values = box._AXIS_TO_AXIS_NUMBER.keys()
         for v in values:
@@ -538,24 +458,23 @@ def test_axis_classmethods(self, box):
         assert obj._get_axis_name(v) == box._get_axis_name(v)
         assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v)

-    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
-    def test_axis_names_deprecated(self, box):
+    def test_axis_names_deprecated(self, frame_or_series):
         # GH33637
+        box = frame_or_series
         obj = box(dtype=object)
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             obj._AXIS_NAMES

-    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
-    def test_axis_numbers_deprecated(self, box):
+    def test_axis_numbers_deprecated(self, frame_or_series):
         # GH33637
+        box = frame_or_series
         obj = box(dtype=object)
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             obj._AXIS_NUMBERS

-    @pytest.mark.parametrize("as_frame", [True, False])
-    def test_flags_identity(self, as_frame):
+    def test_flags_identity(self, frame_or_series):
         s = Series([1, 2])
-        if as_frame:
+        if frame_or_series is DataFrame:
             s = s.to_frame()

         assert s.flags is s.flags

From e0d1c7e1bd4beca7a0389115f1b6d681bb2fad48 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 3 Nov 2020 17:51:33 -0800
Subject: [PATCH 04/21] TST: collect tests by method (#37617)

* TST/REF: collect test_timeseries tests by method

* misplaced DataFrame.values tst

* misplaced dataframe.values test

* collect test by method
---
 pandas/tests/frame/methods/test_asfreq.py       | 11 ++++
 pandas/tests/frame/methods/test_values.py       | 19 ++++++-
 .../tests/indexes/datetimes/test_indexing.py    |  7 +++
 .../tests/series/apply/test_series_apply.py     | 13 ++++-
 pandas/tests/series/methods/test_values.py      | 20 +++++++
 pandas/tests/series/test_arithmetic.py          | 15 ++++++
 pandas/tests/series/test_dtypes.py              | 52 ++++---------
 pandas/tests/series/test_period.py              | 24 --------
 pandas/tests/series/test_timeseries.py          | 41 ---------------
 9 files changed, 93 insertions(+), 109 deletions(-)
 create mode 100644 pandas/tests/series/methods/test_values.py
 delete mode 100644 pandas/tests/series/test_period.py
 delete mode 100644 pandas/tests/series/test_timeseries.py

diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py
index cdcd922949bcf..368ce88abe165 100644
--- a/pandas/tests/frame/methods/test_asfreq.py
+++ b/pandas/tests/frame/methods/test_asfreq.py
@@ -74,3 +74,14 @@ def test_asfreq_fillvalue(self):
         expected_series = ts.asfreq(freq="1S").fillna(9.0)
         actual_series = ts.asfreq(freq="1S", fill_value=9.0)
         tm.assert_series_equal(expected_series, actual_series)
+
+    def test_asfreq_with_date_object_index(self, frame_or_series):
+        rng = date_range("1/1/2000", periods=20)
+        ts = frame_or_series(np.random.randn(20), index=rng)
+
+        ts2 = ts.copy()
+        ts2.index = [x.date() for x in ts2.index]
+
+        result = ts2.asfreq("4H", method="ffill")
+        expected = ts.asfreq("4H", method="ffill")
+        tm.assert_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
index 564a659724768..fb0c5d31f692b 100644
--- a/pandas/tests/frame/methods/test_values.py
+++ b/pandas/tests/frame/methods/test_values.py
@@ -1,6 +1,7 @@
 import numpy as np
+import pytest

-from pandas import DataFrame, NaT, Timestamp, date_range
+from pandas import DataFrame, NaT, Series, Timestamp, date_range, period_range
 import pandas._testing as tm


@@ -44,6 +45,22 @@ def test_values_duplicates(self):

         tm.assert_numpy_array_equal(result, expected)

+    @pytest.mark.parametrize("constructor", [date_range, period_range])
+    def test_values_casts_datetimelike_to_object(self, constructor):
+        series = Series(constructor("2000-01-01", periods=10, freq="D"))
+
+        expected = series.astype("object")
+
+        df = DataFrame({"a": series, "b": np.random.randn(len(series))})
+
+        result = df.values.squeeze()
+        assert (result[:, 0] == expected.values).all()
+
+        df = DataFrame({"a": series, "b": ["foo"] * len(series)})
+
+        result = df.values.squeeze()
+        assert (result[:, 0] == expected.values).all()
+
     def test_frame_values_with_tz(self):
         tz = "US/Central"
         df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index d4ebb557fd6cd..59269b9b54ddc 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -544,6 +544,13 @@ def test_contains_nonunique(self, vals):


 class TestGetIndexer:
+    def test_get_indexer_date_objs(self):
+        rng = date_range("1/1/2000", periods=20)
+
+        result = rng.get_indexer(rng.map(lambda x: x.date()))
+        expected = rng.get_indexer(rng)
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_get_indexer(self):
         idx = pd.date_range("2000-01-01", periods=3)
         exp = np.array([0, 1, 2], dtype=np.intp)
diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py
index 9096d2a1033e5..93431a5c75091 100644
--- a/pandas/tests/series/apply/test_series_apply.py
+++ b/pandas/tests/series/apply/test_series_apply.py
@@ -5,12 +5,23 @@
 import pytest

 import pandas as pd
-from pandas import DataFrame, Index, MultiIndex, Series, isna
+from pandas import DataFrame, Index, MultiIndex, Series, isna, timedelta_range
 import pandas._testing as tm
 from pandas.core.base import SpecificationError


 class TestSeriesApply:
+    def test_series_map_box_timedelta(self):
+        # GH#11349
+        ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h"))
+
+        def f(x):
+            return x.total_seconds()
+
+        ser.map(f)
+        ser.apply(f)
+        DataFrame(ser).applymap(f)
+
     def test_apply(self, datetime_series):
         with np.errstate(all="ignore"):
             tm.assert_series_equal(
diff --git a/pandas/tests/series/methods/test_values.py b/pandas/tests/series/methods/test_values.py
new file mode 100644
index 0000000000000..e28a714ea656d
--- /dev/null
+++ b/pandas/tests/series/methods/test_values.py
@@ -0,0 +1,20 @@
+import numpy as np
+import pytest
+
+from pandas import IntervalIndex, Series, period_range
+import pandas._testing as tm
+
+
+class TestValues:
+    @pytest.mark.parametrize(
+        "data",
+        [
+            period_range("2000", periods=4),
+            IntervalIndex.from_breaks([1, 2, 3, 4]),
+        ],
+    )
+    def test_values_object_extension_dtypes(self, data):
+        # https://github.com/pandas-dev/pandas/issues/23995
+        result = Series(data).values
+        expected = np.array(data.astype(object))
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index 9154c566a3dae..fa8f85178ba9f 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -730,6 +730,21 @@ def test_datetime_understood(self):
         expected = Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"]))
         tm.assert_series_equal(result, expected)

+    def test_align_date_objects_with_datetimeindex(self):
+        rng = date_range("1/1/2000", periods=20)
+        ts = Series(np.random.randn(20), index=rng)
+
+        ts_slice = ts[5:]
+        ts2 = ts_slice.copy()
+        ts2.index = [x.date() for x in ts2.index]
+
+        result = ts + ts2
+        result2 = ts2 + ts
+        expected = ts + ts[5:]
+        expected.index = expected.index._with_freq(None)
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result2, expected)
+

 @pytest.mark.parametrize(
     "names",
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index b85a53960b0f6..2fbed92567f71 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -6,7 +6,7 @@
 from pandas.core.dtypes.dtypes import CategoricalDtype

 import pandas as pd
-from pandas import Categorical, DataFrame, Series, date_range
+from pandas import Categorical, DataFrame, Series
 import pandas._testing as tm
@@ -120,18 +120,20 @@ def cmp(a, b):
             s.astype("object").astype(CategoricalDtype()), roundtrip_expected
         )

+    def test_invalid_conversions(self):
         # invalid conversion (these are NOT a dtype)
+        cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
+        ser = Series(np.random.randint(0, 10000, 100)).sort_values()
+        ser = pd.cut(ser, range(0, 10500, 500), right=False, labels=cat)
+
         msg = (
             "dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
             "not understood"
         )
-
-        for invalid in [
-            lambda x: x.astype(Categorical),
-            lambda x: x.astype("object").astype(Categorical),
-        ]:
-            with pytest.raises(TypeError, match=msg):
-                invalid(s)
+        with pytest.raises(TypeError, match=msg):
+            ser.astype(Categorical)
+        with pytest.raises(TypeError, match=msg):
+            ser.astype("object").astype(Categorical)

     @pytest.mark.parametrize("dtype", np.typecodes["All"])
     def test_astype_empty_constructor_equality(self, dtype):
@@ -148,27 +150,6 @@ def test_astype_empty_constructor_equality(self, dtype):
         as_type_empty = Series([]).astype(dtype)
         tm.assert_series_equal(init_empty, as_type_empty)

-    def test_intercept_astype_object(self):
-        series = Series(date_range("1/1/2000", periods=10))
-
-        # This test no longer makes sense, as
-        # Series is by default already M8[ns].
-        expected = series.astype("object")
-
-        df = DataFrame({"a": series, "b": np.random.randn(len(series))})
-        exp_dtypes = Series(
-            [np.dtype("datetime64[ns]"), np.dtype("float64")], index=["a", "b"]
-        )
-        tm.assert_series_equal(df.dtypes, exp_dtypes)
-
-        result = df.values.squeeze()
-        assert (result[:, 0] == expected.values).all()
-
-        df = DataFrame({"a": series, "b": ["foo"] * len(series)})
-
-        result = df.values.squeeze()
-        assert (result[:, 0] == expected.values).all()
-
     def test_series_to_categorical(self):
         # see gh-16524: test conversion of Series to Categorical
         series = Series(["a", "b", "c"])
@@ -178,19 +159,6 @@ def test_series_to_categorical(self):

         tm.assert_series_equal(result, expected)

-    @pytest.mark.parametrize(
-        "data",
-        [
-            pd.period_range("2000", periods=4),
-            pd.IntervalIndex.from_breaks([1, 2, 3, 4]),
-        ],
-    )
-    def test_values_compatibility(self, data):
-        # https://github.com/pandas-dev/pandas/issues/23995
-        result = Series(data).values
-        expected = np.array(data.astype(object))
-        tm.assert_numpy_array_equal(result, expected)
-
     def test_reindex_astype_order_consistency(self):
         # GH 17444
         s = Series([1, 2, 3], index=[2, 0, 1])
diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py
deleted file mode 100644
index 17dbfa9cf379a..0000000000000
--- a/pandas/tests/series/test_period.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import numpy as np
-
-from pandas import DataFrame, Series, period_range
-
-
-class TestSeriesPeriod:
-
-    # ---------------------------------------------------------------------
-    # NaT support
-
-    def test_intercept_astype_object(self):
-        series = Series(period_range("2000-01-01", periods=10, freq="D"))
-
-        expected = series.astype("object")
-
-        df = DataFrame({"a": series, "b": np.random.randn(len(series))})
-
-        result = df.values.squeeze()
-        assert (result[:, 0] == expected.values).all()
-
-        df = DataFrame({"a": series, "b": ["foo"] * len(series)})
-
-        result = df.values.squeeze()
-        assert (result[:, 0] == expected.values).all()
diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py
deleted file mode 100644
index 0769606d18d57..0000000000000
--- a/pandas/tests/series/test_timeseries.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import numpy as np
-
-from pandas import DataFrame, Series, date_range, timedelta_range
-import pandas._testing as tm
-
-
-class TestTimeSeries:
-    def test_promote_datetime_date(self):
-        rng = date_range("1/1/2000", periods=20)
-        ts = Series(np.random.randn(20), index=rng)
-
-        ts_slice = ts[5:]
-        ts2 = ts_slice.copy()
-        ts2.index = [x.date() for x in ts2.index]
-
-        result = ts + ts2
-        result2 = ts2 + ts
-        expected = ts + ts[5:]
-        expected.index = expected.index._with_freq(None)
-        tm.assert_series_equal(result, expected)
-        tm.assert_series_equal(result2, expected)
-
-        # test asfreq
-        result = ts2.asfreq("4H", method="ffill")
-        expected = ts[5:].asfreq("4H", method="ffill")
-        tm.assert_series_equal(result, expected)
-
-        result = rng.get_indexer(ts2.index)
-        expected = rng.get_indexer(ts_slice.index)
-        tm.assert_numpy_array_equal(result, expected)
-
-    def test_series_map_box_timedelta(self):
-        # GH 11349
-        s = Series(timedelta_range("1 day 1 s", periods=5, freq="h"))
-
-        def f(x):
-            return x.total_seconds()
-
-        s.map(f)
-        s.apply(f)
-        DataFrame(s).applymap(f)

From d75eb5ba1be16b6cd74fd44a68ce124be6575e4f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 3 Nov 2020 17:53:16 -0800
Subject: [PATCH 05/21] TST/REF: share tests across Series/DataFrame (#37616)

---
 pandas/tests/frame/methods/test_asof.py            |  10 +-
 pandas/tests/frame/methods/test_droplevel.py       |  29 +++--
 .../frame/methods/test_first_and_last.py           |  44 +++++---
 pandas/tests/frame/methods/test_head_tail.py       |   3 +
 pandas/tests/frame/methods/test_truncate.py        |  69 ++++++++----
 pandas/tests/frame/methods/test_tz_convert.py      |   9 +-
 .../tests/frame/methods/test_tz_localize.py        |   9 +-
 pandas/tests/series/methods/test_asof.py           |   3 -
 pandas/tests/series/methods/test_droplevel.py      |  19 ----
 .../series/methods/test_first_and_last.py          |  69 ------------
 pandas/tests/series/{indexing => methods}/test_pop.py | 0
 pandas/tests/series/methods/test_truncate.py       | 106 ------------------
 12 files changed, 111 insertions(+), 259 deletions(-)
 delete mode 100644 pandas/tests/series/methods/test_droplevel.py
 delete mode 100644 pandas/tests/series/methods/test_first_and_last.py
 rename pandas/tests/series/{indexing => methods}/test_pop.py (100%)

diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py
index 70b42976c95a7..6931dd0ea2d4c 100644
--- a/pandas/tests/frame/methods/test_asof.py
+++ b/pandas/tests/frame/methods/test_asof.py
@@ -96,12 +96,16 @@ def test_missing(self, date_range_frame):
         result = df.asof("1989-12-31")
         assert isinstance(result.name, Period)

+    def test_asof_all_nans(self, frame_or_series):
+        # GH 15713
+        # DataFrame/Series is all nans
+        result = frame_or_series([np.nan]).asof([0])
+        expected = frame_or_series([np.nan])
+        tm.assert_equal(result, expected)
+
     def test_all_nans(self, date_range_frame):
         # GH 15713
         # DataFrame is all nans
-        result = DataFrame([np.nan]).asof([0])
-        expected = DataFrame([np.nan])
-        tm.assert_frame_equal(result, expected)

         # testing non-default indexes, multiple inputs
         N = 150
diff --git a/pandas/tests/frame/methods/test_droplevel.py b/pandas/tests/frame/methods/test_droplevel.py
index 517905cf23259..ce98704b03106 100644
--- a/pandas/tests/frame/methods/test_droplevel.py
+++ b/pandas/tests/frame/methods/test_droplevel.py
@@ -1,23 +1,32 @@
+import pytest
+
 from pandas import DataFrame, Index, MultiIndex
 import pandas._testing as tm


 class TestDropLevel:
-    def test_droplevel(self):
+    def test_droplevel(self, frame_or_series):
         # GH#20342
-        df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
-        df = df.set_index([0, 1]).rename_axis(["a", "b"])
-        df.columns = MultiIndex.from_tuples(
+        cols = MultiIndex.from_tuples(
             [("c", "e"), ("d", "f")], names=["level_1", "level_2"]
         )
+        mi = MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"])
+        df = DataFrame([[3, 4], [7, 8], [11, 12]], index=mi, columns=cols)
+        if frame_or_series is not DataFrame:
+            df = df.iloc[:, 0]

         # test that dropping of a level in index works
         expected = df.reset_index("a", drop=True)
         result = df.droplevel("a", axis="index")
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

-        # test that dropping of a level in columns works
-        expected = df.copy()
-        expected.columns = Index(["c", "d"], name="level_1")
-        result = df.droplevel("level_2", axis="columns")
-        tm.assert_frame_equal(result, expected)
+        if frame_or_series is DataFrame:
+            # test that dropping of a level in columns works
+            expected = df.copy()
+            expected.columns = Index(["c", "d"], name="level_1")
+            result = df.droplevel("level_2", axis="columns")
+            tm.assert_equal(result, expected)
+        else:
+            # test that droplevel raises ValueError on axis != 0
+            with pytest.raises(ValueError, match="No axis named columns"):
+                df.droplevel(1, axis="columns")
diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py
index 2b3756969acca..d21e1eee54e16 100644
--- a/pandas/tests/frame/methods/test_first_and_last.py
+++ b/pandas/tests/frame/methods/test_first_and_last.py
@@ -8,56 +8,64 @@


 class TestFirst:
-    def test_first_subset(self):
+    def test_first_subset(self, frame_or_series):
         ts = tm.makeTimeDataFrame(freq="12h")
+        if frame_or_series is not DataFrame:
+            ts = ts["A"]
         result = ts.first("10d")
         assert len(result) == 20

         ts = tm.makeTimeDataFrame(freq="D")
+        if frame_or_series is not DataFrame:
+            ts = ts["A"]
         result = ts.first("10d")
         assert len(result) == 10

         result = ts.first("3M")
         expected = ts[:"3/31/2000"]
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

         result = ts.first("21D")
         expected = ts[:21]
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

         result = ts[:0].first("3M")
-        tm.assert_frame_equal(result, ts[:0])
+        tm.assert_equal(result, ts[:0])

-    def test_first_raises(self):
+    def test_first_last_raises(self, frame_or_series):
         # GH#20725
-        df = DataFrame([[1, 2, 3], [4, 5, 6]])
+        obj = DataFrame([[1, 2, 3], [4, 5, 6]])
+        if frame_or_series is not DataFrame:
+            obj = obj[0]
+
         msg = "'first' only supports a DatetimeIndex index"
         with pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
-            df.first("1D")
+            obj.first("1D")
+
+        msg = "'last' only supports a DatetimeIndex index"
+        with pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
+            obj.last("1D")

-    def test_last_subset(self):
+    def test_last_subset(self, frame_or_series):
         ts = tm.makeTimeDataFrame(freq="12h")
+        if frame_or_series is not DataFrame:
+            ts = ts["A"]
         result = ts.last("10d")
         assert len(result) == 20

         ts = tm.makeTimeDataFrame(nper=30, freq="D")
+        if frame_or_series is not DataFrame:
+            ts = ts["A"]
         result = ts.last("10d")
         assert len(result) == 10

         result = ts.last("21D")
         expected = ts["2000-01-10":]
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

         result = ts.last("21D")
         expected = ts[-21:]
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

         result = ts[:0].last("3M")
-        tm.assert_frame_equal(result, ts[:0])
-
-    def test_last_raises(self):
-        # GH20725
-        df = DataFrame([[1, 2, 3], [4, 5, 6]])
-        msg = "'last' only supports a DatetimeIndex index"
-        with pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
-            df.last("1D")
+        tm.assert_equal(result, ts[:0])
diff --git a/pandas/tests/frame/methods/test_head_tail.py b/pandas/tests/frame/methods/test_head_tail.py
index fa28f7d3e16a2..99cb7840c3eb6 100644
--- a/pandas/tests/frame/methods/test_head_tail.py
+++ b/pandas/tests/frame/methods/test_head_tail.py
@@ -48,6 +48,9 @@ def test_head_tail(float_frame):
     tm.assert_frame_equal(df.tail(0), df[0:0])
     tm.assert_frame_equal(df.head(-1), df.iloc[:-1])
     tm.assert_frame_equal(df.tail(-1), df.iloc[1:])
+
+
+def test_head_tail_empty():
     # test empty dataframe
     empty_df = DataFrame()
     tm.assert_frame_equal(empty_df.tail(), empty_df)
diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py
index 674f482c478a0..c6d6637edc88c 100644
--- a/pandas/tests/frame/methods/test_truncate.py
+++ b/pandas/tests/frame/methods/test_truncate.py
@@ -2,12 +2,15 @@
 import pytest

 import pandas as pd
+from pandas import DataFrame, Series, date_range
 import pandas._testing as tm


 class TestDataFrameTruncate:
-    def test_truncate(self, datetime_frame):
+    def test_truncate(self, datetime_frame, frame_or_series):
         ts = datetime_frame[::3]
+        if frame_or_series is Series:
+            ts = ts.iloc[:, 0]

         start, end = datetime_frame.index[3], datetime_frame.index[6]
@@ -16,34 +19,41 @@ def test_truncate(self, datetime_frame):

         # neither specified
         truncated = ts.truncate()
-        tm.assert_frame_equal(truncated, ts)
+        tm.assert_equal(truncated, ts)

         # both specified
         expected = ts[1:3]

         truncated = ts.truncate(start, end)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)

         truncated = ts.truncate(start_missing, end_missing)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)

         # start specified
         expected = ts[1:]

         truncated = ts.truncate(before=start)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)

         truncated = ts.truncate(before=start_missing)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)

         # end specified
         expected = ts[:3]

         truncated = ts.truncate(after=end)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)

         truncated = ts.truncate(after=end_missing)
-        tm.assert_frame_equal(truncated, expected)
+        tm.assert_equal(truncated, expected)
+
+        # corner case, empty series/frame returned
+        truncated = ts.truncate(after=ts.index[0] - ts.index.freq)
+        assert len(truncated) == 0
+
+        truncated = ts.truncate(before=ts.index[-1] + ts.index.freq)
+        assert len(truncated) == 0

         msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00"
         with pytest.raises(ValueError, match=msg):
@@ -57,25 +67,35 @@ def test_truncate_copy(self, datetime_frame):
         truncated.values[:] = 5.0
         assert not (datetime_frame.values[5:11] == 5).any()

-    def test_truncate_nonsortedindex(self):
+    def test_truncate_nonsortedindex(self, frame_or_series):
         # GH#17935
-        df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0])
+        obj = DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0])
+        if frame_or_series is Series:
+            obj = obj["A"]
+
         msg = "truncate requires a sorted index"
         with pytest.raises(ValueError, match=msg):
-            df.truncate(before=3, after=9)
+            obj.truncate(before=3, after=9)
+
+    def test_sort_values_nonsortedindex(self):
+        # TODO: belongs elsewhere?

-        rng = pd.date_range("2011-01-01", "2012-01-01", freq="W")
-        ts = pd.DataFrame(
+        rng = date_range("2011-01-01", "2012-01-01", freq="W")
+        ts = DataFrame(
             {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng
         )
+
         msg = "truncate requires a sorted index"
         with pytest.raises(ValueError, match=msg):
             ts.sort_values("A", ascending=False).truncate(
                 before="2011-11", after="2011-12"
             )

-        df = pd.DataFrame(
+    def test_truncate_nonsortedindex_axis1(self):
+        # GH#17935
+
+        df = DataFrame(
             {
                 3: np.random.randn(5),
                 20: np.random.randn(5),
@@ -93,27 +113,34 @@
         [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])],
     )
     @pytest.mark.parametrize("klass", [pd.Int64Index, pd.DatetimeIndex])
-    def test_truncate_decreasing_index(self, before, after, indices, klass):
+    def test_truncate_decreasing_index(
+        self, before, after, indices, klass, frame_or_series
+    ):
         # https://github.com/pandas-dev/pandas/issues/33756
         idx = klass([3, 2, 1, 0])
         if klass is pd.DatetimeIndex:
             before = pd.Timestamp(before) if before is not None else None
             after = pd.Timestamp(after) if after is not None else None
             indices = [pd.Timestamp(i) for i in indices]
-        values = pd.DataFrame(range(len(idx)), index=idx)
+        values = frame_or_series(range(len(idx)), index=idx)
         result = values.truncate(before=before, after=after)
         expected = values.loc[indices]
-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)

-    def test_truncate_multiindex(self):
+    def test_truncate_multiindex(self, frame_or_series):
         # GH 34564
         mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"])
-        s1 = pd.DataFrame(range(mi.shape[0]), index=mi, columns=["col"])
+        s1 = DataFrame(range(mi.shape[0]), index=mi, columns=["col"])
+        if frame_or_series is Series:
+            s1 = s1["col"]
+
         result = s1.truncate(before=2, after=3)

-        df = pd.DataFrame.from_dict(
+        df = DataFrame.from_dict(
             {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
         )
         expected = df.set_index(["L1", "L2"])
+        if frame_or_series is Series:
+            expected = expected["col"]

-        tm.assert_frame_equal(result, expected)
+        tm.assert_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py
index c70e479723644..ecb30cf11319b 100644
--- a/pandas/tests/frame/methods/test_tz_convert.py
+++ b/pandas/tests/frame/methods/test_tz_convert.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from pandas import DataFrame, Index, MultiIndex, Series, date_range
+from pandas import DataFrame, Index, MultiIndex, date_range
 import pandas._testing as tm
@@ -89,17 +89,16 @@ def test_tz_convert_and_localize(self, fn):
             df = DataFrame(index=l0)
             df = getattr(df, fn)("US/Pacific", level=1)

-    @pytest.mark.parametrize("klass", [Series, DataFrame])
     @pytest.mark.parametrize("copy", [True, False])
-    def test_tz_convert_copy_inplace_mutate(self, copy, klass):
+    def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series):
         # GH#6326
-        obj = klass(
+        obj = frame_or_series(
             np.arange(0, 5),
             index=date_range("20131027", periods=5, freq="1H", tz="Europe/Berlin"),
         )
         orig = obj.copy()
         result = obj.tz_convert("UTC", copy=copy)
-        expected = klass(np.arange(0, 5), index=obj.index.tz_convert("UTC"))
+        expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC"))
         tm.assert_equal(result, expected)
         tm.assert_equal(obj, orig)
         assert result.index is not obj.index
diff --git a/pandas/tests/frame/methods/test_tz_localize.py b/pandas/tests/frame/methods/test_tz_localize.py
index 183b81ca5298e..aa5ab51fe3d8b 100644
--- a/pandas/tests/frame/methods/test_tz_localize.py
+++ b/pandas/tests/frame/methods/test_tz_localize.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from pandas import DataFrame, Series, date_range
+from pandas import DataFrame, date_range
 import pandas._testing as tm
@@ -23,16 +23,15 @@ def test_frame_tz_localize(self):
         assert result.columns.tz.zone == "UTC"
         tm.assert_frame_equal(result, expected.T)

-    @pytest.mark.parametrize("klass", [Series, DataFrame])
     @pytest.mark.parametrize("copy", [True, False])
-    def test_tz_localize_copy_inplace_mutate(self, copy, klass):
+    def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series):
         # GH#6326
-        obj = klass(
+        obj = frame_or_series(
             np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=None)
         )
         orig = obj.copy()
         result = obj.tz_localize("UTC", copy=copy)
-        expected = klass(
+        expected = frame_or_series(
             np.arange(0, 5),
             index=date_range("20131027", periods=5, freq="1H", tz="UTC"),
         )
diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py
index 4b4ef5ea046be..43d40d53dcd21 100644
--- a/pandas/tests/series/methods/test_asof.py
+++ b/pandas/tests/series/methods/test_asof.py
@@ -161,9 +161,6 @@ def test_errors(self):
     def test_all_nans(self):
         # GH 15713
         # series is all nans
-        result = Series([np.nan]).asof([0])
-        expected = Series([np.nan])
-        tm.assert_series_equal(result, expected)

         # testing non-default indexes
         N = 50
diff --git a/pandas/tests/series/methods/test_droplevel.py b/pandas/tests/series/methods/test_droplevel.py
deleted file mode 100644
index 449ddd1cd0e49..0000000000000
--- a/pandas/tests/series/methods/test_droplevel.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pytest
-
-from pandas import MultiIndex, Series
-import pandas._testing as tm
-
-
-class TestDropLevel:
-    def test_droplevel(self):
-        # GH#20342
-        ser = Series([1, 2, 3, 4])
-        ser.index = MultiIndex.from_arrays(
-            [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"]
-        )
-        expected = ser.reset_index("b", drop=True)
-        result = ser.droplevel("b", axis="index")
-        tm.assert_series_equal(result, expected)
-        # test that droplevel raises ValueError on axis != 0
-        with pytest.raises(ValueError, match="No axis named columns"):
-            ser.droplevel(1, axis="columns")
diff --git a/pandas/tests/series/methods/test_first_and_last.py b/pandas/tests/series/methods/test_first_and_last.py
deleted file mode 100644
index 7629dc8cda30b..0000000000000
--- a/pandas/tests/series/methods/test_first_and_last.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-Note: includes tests for `last`
-"""
-
-import numpy as np
-import pytest
-
-from pandas import Series, date_range
-import pandas._testing as tm
-
-
-class TestFirst:
-    def test_first_subset(self):
-        rng = date_range("1/1/2000", "1/1/2010", freq="12h")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        result = ts.first("10d")
-        assert len(result) == 20
-
-        rng = date_range("1/1/2000", "1/1/2010", freq="D")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        result = ts.first("10d")
-        assert len(result) == 10
-
-        result = ts.first("3M")
-        expected = ts[:"3/31/2000"]
-        tm.assert_series_equal(result, expected)
-
-        result = ts.first("21D")
-        expected = ts[:21]
-        tm.assert_series_equal(result, expected)
-
-        result = ts[:0].first("3M")
-        tm.assert_series_equal(result, ts[:0])
-
-    def test_first_raises(self):
-        # GH#20725
-        ser = Series("a b c".split())
-        msg = "'first' only supports a DatetimeIndex index"
-        with pytest.raises(TypeError, match=msg):
-            ser.first("1D")
-
-    def test_last_subset(self):
-        rng = date_range("1/1/2000", "1/1/2010", freq="12h")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        result = ts.last("10d")
-        assert len(result) == 20
-
-        rng = date_range("1/1/2000", "1/1/2010", freq="D")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        result = ts.last("10d")
-        assert len(result) == 10
-
-        result = ts.last("21D")
-        expected = ts["12/12/2009":]
-        tm.assert_series_equal(result, expected)
-
-        result = ts.last("21D")
-        expected = ts[-21:]
-        tm.assert_series_equal(result, expected)
-
-        result = ts[:0].last("3M")
-        tm.assert_series_equal(result, ts[:0])
-
-    def test_last_raises(self):
-        # GH#20725
-        ser = Series("a b c".split())
-        msg = "'last' only supports a DatetimeIndex index"
-        with pytest.raises(TypeError, match=msg):
-            ser.last("1D")
diff --git a/pandas/tests/series/indexing/test_pop.py b/pandas/tests/series/methods/test_pop.py
similarity index 100%
rename from pandas/tests/series/indexing/test_pop.py
rename to pandas/tests/series/methods/test_pop.py
diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py
index b03f516eeffc5..21de593c0e2af 100644
--- a/pandas/tests/series/methods/test_truncate.py
+++ b/pandas/tests/series/methods/test_truncate.py
@@ -1,102 +1,11 @@
 from datetime import datetime

-import numpy as np
-import pytest
-
 import pandas as pd
 from pandas import Series, date_range
 import pandas._testing as tm

-from pandas.tseries.offsets import BDay
-

 class TestTruncate:
-    def test_truncate(self, datetime_series):
-        offset = BDay()
-
-        ts = datetime_series[::3]
-
-        start, end = datetime_series.index[3], datetime_series.index[6]
-        start_missing, end_missing = datetime_series.index[2], datetime_series.index[7]
-
-        # neither specified
-        truncated = ts.truncate()
-        tm.assert_series_equal(truncated, ts)
-
-        # both specified
-        expected = ts[1:3]
-
-        truncated = ts.truncate(start, end)
-        tm.assert_series_equal(truncated, expected)
-
-        truncated = ts.truncate(start_missing, end_missing)
-        tm.assert_series_equal(truncated, expected)
-
-        # start specified
-        expected = ts[1:]
-
-        truncated = ts.truncate(before=start)
-        tm.assert_series_equal(truncated, expected)
-
-        truncated = ts.truncate(before=start_missing)
-        tm.assert_series_equal(truncated, expected)
-
-        # end specified
-        expected = ts[:3]
-
-        truncated = ts.truncate(after=end)
-        tm.assert_series_equal(truncated, expected)
-
-        truncated = ts.truncate(after=end_missing)
-        tm.assert_series_equal(truncated, expected)
-
-        # corner case, empty series returned
-        truncated = ts.truncate(after=datetime_series.index[0] - offset)
-        assert len(truncated) == 0
-
-        truncated = ts.truncate(before=datetime_series.index[-1] + offset)
-        assert len(truncated) == 0
-
-        msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00"
-        with pytest.raises(ValueError, match=msg):
-            ts.truncate(
-                before=datetime_series.index[-1] + offset,
-                after=datetime_series.index[0] - offset,
-            )
-
-    def test_truncate_nonsortedindex(self):
-        # GH#17935
-
-        s = Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0])
-        msg = "truncate requires a sorted index"
-
-        with pytest.raises(ValueError, match=msg):
-            s.truncate(before=3, after=9)
-
-        rng = pd.date_range("2011-01-01", "2012-01-01", freq="W")
-        ts = Series(np.random.randn(len(rng)), index=rng)
-        msg = "truncate requires a sorted index"
-
-        with pytest.raises(ValueError, match=msg):
-            ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12")
-
-    @pytest.mark.parametrize(
-        "before, after, indices",
-        [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])],
-    )
-    @pytest.mark.parametrize("klass", [pd.Int64Index, pd.DatetimeIndex])
-    def test_truncate_decreasing_index(self, before, after, indices, klass):
-        # https://github.com/pandas-dev/pandas/issues/33756
-        idx = klass([3, 2, 1, 0])
-        if klass is pd.DatetimeIndex:
-            before = pd.Timestamp(before) if before is not None else None
-            after = pd.Timestamp(after) if after is not None else None
-            indices = [pd.Timestamp(i) for i in indices]
-        values = Series(range(len(idx)), index=idx)
-        result = values.truncate(before=before, after=after)
-        expected = values.loc[indices]
-        tm.assert_series_equal(result, expected)
-
     def test_truncate_datetimeindex_tz(self):
         # GH 9243
         idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific")
@@ -133,21 +42,6 @@ def test_truncate_periodindex(self):
         expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")])
         tm.assert_series_equal(result2, Series([2], index=expected_idx2))

-    def test_truncate_multiindex(self):
-        # GH 34564
-        mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"])
-        s1 = Series(range(mi.shape[0]), index=mi, name="col")
-        result = s1.truncate(before=2, after=3)
-
-        df = pd.DataFrame.from_dict(
-            {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
-        )
-        return_value = df.set_index(["L1", "L2"], inplace=True)
-        assert return_value is None
-        expected = df.col
-
-        tm.assert_series_equal(result, expected)
-
     def test_truncate_one_element_series(self):
         # GH 35544
         series = Series([0.1], index=pd.DatetimeIndex(["2020-08-04"]))

From 83c2e651b4b4cfff58298c0090b67a0a3d4db2e1 Mon Sep 17 00:00:00 2001
From: Sven
Date: Wed, 4 Nov 2020 12:55:11 +1100
Subject: [PATCH 06/21] Gh 36562 typeerror comparison not supported between float and str (#37096)

---
 doc/source/whatsnew/v1.2.0.rst                   |  1 +
 pandas/core/algorithms.py                        | 43 ++++++++++++-----
 .../tests/frame/methods/test_combine_first.py    | 31 ++++++++++++-
 pandas/tests/test_sorting.py                     |  7 +++
 4 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 7111d54d65815..ae6e2de1b819c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -469,6 +469,7 @@ MultiIndex

 - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
 - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
+- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)

 I/O
 ^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index e9e04ace784b6..ec88eb817b3f8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2061,27 +2061,25 @@ def safe_sort(
     dtype, _ = infer_dtype_from_array(values)
     values = np.asarray(values, dtype=dtype)

-    def sort_mixed(values):
-        # order ints before strings, safe in py3
-        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-        nums = np.sort(values[~str_pos])
-        strs = np.sort(values[str_pos])
-        return np.concatenate([nums, np.asarray(strs, dtype=object)])
-
     sorter = None
+
     if (
         not is_extension_array_dtype(values)
         and lib.infer_dtype(values, skipna=False) == "mixed-integer"
     ):
-        # unorderable in py3 if mixed str/int
-        ordered = sort_mixed(values)
+        ordered = _sort_mixed(values)
     else:
         try:
             sorter = values.argsort()
             ordered = values.take(sorter)
         except TypeError:
-            # try this anyway
-            ordered = sort_mixed(values)
+            # Previous sorters failed or were not applicable, try `_sort_mixed`
+            # which would work, but which fails for special case of 1d arrays
+            # with tuples.
+            if values.size and isinstance(values[0], tuple):
+                ordered = _sort_tuples(values)
+            else:
+                ordered = _sort_mixed(values)

     # codes:

@@ -2128,3 +2126,26 @@ def sort_mixed(values):
     np.putmask(new_codes, mask, na_sentinel)

     return ordered, ensure_platform_int(new_codes)
+
+
+def _sort_mixed(values):
+    """ order ints before strings in 1d arrays, safe in py3 """
+    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos])
+    strs = np.sort(values[str_pos])
+    return np.concatenate([nums, np.asarray(strs, dtype=object)])
+
+
+def _sort_tuples(values: np.ndarray):
+    """
+    Convert array of tuples (1d) to array of arrays (2d).
+    We need to keep the columns separately as they contain different types and
+    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
+    column as types cannot be compared).
+    """
+    from pandas.core.internals.construction import to_arrays
+    from pandas.core.sorting import lexsort_indexer
+
+    arrays, _ = to_arrays(values, None)
+    indexer = lexsort_indexer(arrays, orders=True)
+    return values[indexer]
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 4850c6a50f8a8..08c4293323500 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -4,7 +4,7 @@
 import pytest

 import pandas as pd
-from pandas import DataFrame, Index, Series
+from pandas import DataFrame, Index, MultiIndex, Series
 import pandas._testing as tm


@@ -365,3 +365,32 @@ def test_combine_first_string_dtype_only_na(self):
             {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
         ).set_index(["a", "b"])
         tm.assert_frame_equal(result, expected)
+
+
+def test_combine_first_with_nan_multiindex():
+    # gh-36562
+
+    mi1 = MultiIndex.from_arrays(
+        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
+    )
+    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
+    mi2 = MultiIndex.from_arrays(
+        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
+    )
+    s = Series([1, 2, 3, 4, 5, 6], index=mi2)
+    res = df.combine_first(DataFrame({"d": s}))
+    mi_expected = MultiIndex.from_arrays(
+        [
+            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
+            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
+        ],
+        names=["a", "b"],
+    )
+    expected = DataFrame(
+        {
+            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
+            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
+        },
+        index=mi_expected,
+    )
+    tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 1c9fd46ae451f..5f85ae2ec2318 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel):
         expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
         tm.assert_extension_array_equal(result, expected_values)
         tm.assert_numpy_array_equal(codes, expected_codes)
+
+
+def test_mixed_str_nan():
+    values = np.array(["b", np.nan, "a", "b"], dtype=object)
+    result = safe_sort(values)
dtype=object) + tm.assert_numpy_array_equal(result, expected) From 1e69e2c3cab6f66b4f0b782a36a3c0c6ba562108 Mon Sep 17 00:00:00 2001 From: Micael Jarniac Date: Tue, 3 Nov 2020 22:58:12 -0300 Subject: [PATCH 07/21] docs: fix punctuation (#37612) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c90ab9cceea8c..8050ce8b1b636 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2196,7 +2196,7 @@ def to_json( * Series: - default is 'index' - - allowed values are: {'split','records','index','table'}. + - allowed values are: {'split', 'records', 'index', 'table'}. * DataFrame: From 831320f05da1be9c0a3191ac6bb1ef403686cfb1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Tue, 3 Nov 2020 20:59:21 -0500 Subject: [PATCH 08/21] REGR: pd.to_hdf(..., dropna=True) not dropping missing rows (#37564) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/pytables.py | 3 +++ pandas/tests/io/pytables/test_store.py | 25 ++++++++++++++++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ae6e2de1b819c..16e6c12488b83 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -493,6 +493,7 @@ I/O - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`) - Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 347ce6e853794..bf21a8fe2fc74 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -268,6 +268,7 @@ def to_hdf( data_columns=data_columns, errors=errors, encoding=encoding, + dropna=dropna, ) path_or_buf = stringify_path(path_or_buf) @@ -1051,6 +1052,7 @@ def put( encoding=None, errors: str = "strict", track_times: bool = True, + dropna: bool = False, ): """ Store object in HDFStore. @@ -1100,6 +1102,7 @@ def put( encoding=encoding, errors=errors, track_times=track_times, + dropna=dropna, ) def remove(self, key: str, where=None, start=None, stop=None): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f37b0aabd3aed..d76a5a6f64055 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1253,17 +1253,32 @@ def test_append_all_nans(self, setup_path): store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) - # Test to make sure defaults are to not drop. - # Corresponding to Issue 9382 + def test_store_dropna(self, setup_path): df_with_missing = DataFrame( - {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]}, + index=list("abc"), ) + df_without_missing = DataFrame( + {"col1": [0.0, 2.0], "col2": [1.0, np.nan]}, index=list("ac") + ) + + # # Test to make sure defaults are to not drop. 
+ # # Corresponding to Issue 9382 + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table") + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_with_missing, reloaded) with ensure_clean_path(setup_path) as path: - df_with_missing.to_hdf(path, "df_with_missing", format="table") - reloaded = read_hdf(path, "df_with_missing") + df_with_missing.to_hdf(path, "df", format="table", dropna=False) + reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df", format="table", dropna=True) + reloaded = read_hdf(path, "df") + tm.assert_frame_equal(df_without_missing, reloaded) + def test_read_missing_key_close_store(self, setup_path): # GH 25766 with ensure_clean_path(setup_path) as path: From e5cbaec9cc79e48091de1bb533344c137264bc11 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Nov 2020 18:05:28 -0800 Subject: [PATCH 09/21] parametrize set_axis tests (#37619) --- pandas/tests/frame/test_alter_axes.py | 16 ------ pandas/tests/generic/methods/test_set_axis.py | 22 ++++++++ pandas/tests/series/methods/test_set_name.py | 21 +++++++ pandas/tests/series/test_alter_axes.py | 55 ------------------- 4 files changed, 43 insertions(+), 71 deletions(-) create mode 100644 pandas/tests/series/methods/test_set_name.py delete mode 100644 pandas/tests/series/test_alter_axes.py diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 3cd35e900ee06..4bd1d5fa56468 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,7 +1,6 @@ from datetime import datetime import numpy as np -import pytest from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -24,15 +23,6 @@ class TestDataFrameAlterAxes: - def test_set_index_directly(self, float_string_frame): - df = float_string_frame - idx = Index(np.arange(len(df))[::-1]) - - df.index = idx - tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match="Length mismatch"): - df.index = idx[::2] - def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 @@ -101,12 +91,6 @@ def test_convert_dti_to_series(self): df.pop("ts") tm.assert_frame_equal(df, expected) - def test_set_columns(self, float_string_frame): - cols = Index(np.arange(len(float_string_frame.columns))) - float_string_frame.columns = cols - with pytest.raises(ValueError, match="Length mismatch"): - float_string_frame.columns = cols[::2] - def test_dti_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) diff --git a/pandas/tests/generic/methods/test_set_axis.py b/pandas/tests/generic/methods/test_set_axis.py index 278d43ef93d2f..a46a91811f40e 100644 --- a/pandas/tests/generic/methods/test_set_axis.py +++ b/pandas/tests/generic/methods/test_set_axis.py @@ -57,6 +57,28 @@ def test_set_axis_invalid_axis_name(self, axis, obj): with pytest.raises(ValueError, match="No axis named"): obj.set_axis(list("abc"), axis=axis) + def test_set_axis_setattr_index_not_collection(self, obj): + # wrong type + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" + ) + with pytest.raises(TypeError, match=msg): + obj.index = None + + def test_set_axis_setattr_index_wrong_length(self, obj): + # wrong length + msg = ( + f"Length mismatch: Expected axis has {len(obj)} elements, " + f"new values have {len(obj)-1} elements" + ) + with pytest.raises(ValueError, match=msg): 
+ obj.index = np.arange(len(obj) - 1) + + if obj.ndim == 2: + with pytest.raises(ValueError, match="Length mismatch"): + obj.columns = obj.columns[::2] + class TestDataFrameSetAxis(SharedSetAxisTests): @pytest.fixture diff --git a/pandas/tests/series/methods/test_set_name.py b/pandas/tests/series/methods/test_set_name.py new file mode 100644 index 0000000000000..cbc8ebde7a8ab --- /dev/null +++ b/pandas/tests/series/methods/test_set_name.py @@ -0,0 +1,21 @@ +from datetime import datetime + +from pandas import Series + + +class TestSetName: + def test_set_name(self): + ser = Series([1, 2, 3]) + ser2 = ser._set_name("foo") + assert ser2.name == "foo" + assert ser.name is None + assert ser is not ser2 + + def test_set_name_attribute(self): + ser = Series([1, 2, 3]) + ser2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: + ser.name = name + assert ser.name == name + ser2.name = name + assert ser2.name == name diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py deleted file mode 100644 index 181d7de43d945..0000000000000 --- a/pandas/tests/series/test_alter_axes.py +++ /dev/null @@ -1,55 +0,0 @@ -from datetime import datetime - -import numpy as np -import pytest - -from pandas import Index, Series -import pandas._testing as tm - - -class TestSeriesAlterAxes: - def test_setindex(self, string_series): - # wrong type - msg = ( - r"Index\(\.\.\.\) must be called with a collection of some " - r"kind, None was passed" - ) - with pytest.raises(TypeError, match=msg): - string_series.index = None - - # wrong length - msg = ( - "Length mismatch: Expected axis has 30 elements, " - "new values have 29 elements" - ) - with pytest.raises(ValueError, match=msg): - string_series.index = np.arange(len(string_series) - 1) - - # works - string_series.index = np.arange(len(string_series)) - assert isinstance(string_series.index, Index) - - # Renaming - - def test_set_name_attribute(self): - s = Series([1, 2, 3]) - s2 = Series([1, 2, 3], name="bar") - for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: - s.name = name - assert s.name == name - s2.name = name - assert s2.name == name - - def test_set_name(self): - s = Series([1, 2, 3]) - s2 = s._set_name("foo") - assert s2.name == "foo" - assert s.name is None - assert s is not s2 - - def test_set_index_makes_timeseries(self): - idx = tm.makeDateIndex(10) - - s = Series(range(10)) - s.index = idx - assert s.index._is_all_dates From 36f026dba6a3e47568379e6463ecd9e00cc1568c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Wed, 4 Nov 2020 09:18:04 +0700 Subject: [PATCH 10/21] CLN: clean color selection in _matplotlib/style (#37203) --- pandas/plotting/_matplotlib/style.py | 280 ++++++++++++++++++++++----- pandas/tests/plotting/test_style.py | 157 +++++++++++++++ 2 files changed, 384 insertions(+), 53 deletions(-) create mode 100644 pandas/tests/plotting/test_style.py diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index b919728971505..b2c7b2610845c 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -1,4 +1,14 @@ -# being a bit too dynamic +from typing import ( + TYPE_CHECKING, + Collection, + Dict, + Iterator, + List, + Optional, + Sequence, + Union, + cast, +) import warnings import matplotlib.cm as cm @@ -9,92 +19,256 @@ import pandas.core.common as com +if TYPE_CHECKING: + from matplotlib.colors import Colormap + + +Color = 
Union[str, Sequence[float]]
+

 def get_standard_colors(
-    num_colors: int, colormap=None, color_type: str = "default", color=None
+    num_colors: int,
+    colormap: Optional["Colormap"] = None,
+    color_type: str = "default",
+    color: Optional[Union[Dict[str, Color], Color, Collection[Color]]] = None,
 ):
-    import matplotlib.pyplot as plt
+    """
+    Get standard colors based on `colormap`, `color_type` or `color` inputs.
+
+    Parameters
+    ----------
+    num_colors : int
+        Minimum number of colors to be returned.
+        Ignored if `color` is a dictionary.
+    colormap : :py:class:`matplotlib.colors.Colormap`, optional
+        Matplotlib colormap.
+        When provided, the resulting colors will be derived from the colormap.
+    color_type : {"default", "random"}, optional
+        Type of colors to derive. Used if both `color` and `colormap` are None.
+        Ignored if either `color` or `colormap` are not None.
+    color : dict or str or sequence, optional
+        Color(s) to be used for deriving a sequence of colors.
+        Can either be a dictionary, or a single color (single color string,
+        or sequence of floats representing a single color),
+        or a sequence of colors.
+
+    Returns
+    -------
+    dict or list
+        Standard colors. Can either be a mapping if `color` was a dictionary,
+        or a list of colors with a length of `num_colors` or more.
+
+    Warns
+    -----
+    UserWarning
+        If both `colormap` and `color` are provided.
+        Parameter `color` will override.
+    """
+    if isinstance(color, dict):
+        return color
+
+    colors = _derive_colors(
+        color=color,
+        colormap=colormap,
+        color_type=color_type,
+        num_colors=num_colors,
+    )
+
+    return _cycle_colors(colors, num_colors=num_colors)
+
+
+def _derive_colors(
+    *,
+    color: Optional[Union[Color, Collection[Color]]],
+    colormap: Optional[Union[str, "Colormap"]],
+    color_type: str,
+    num_colors: int,
+) -> List[Color]:
+    """
+    Derive colors from either `colormap`, `color_type` or `color` inputs.
+
+    Get a list of colors either from `colormap`, or from `color`,
+    or from `color_type` (if both `colormap` and `color` are None).
+
+    Parameters
+    ----------
+    color : str or sequence, optional
+        Color(s) to be used for deriving a sequence of colors.
+        Can either be a single color (single color string, or sequence of floats
+        representing a single color), or a sequence of colors.
+    colormap : :py:class:`matplotlib.colors.Colormap`, optional
+        Matplotlib colormap.
+        When provided, the resulting colors will be derived from the colormap.
+    color_type : {"default", "random"}, optional
+        Type of colors to derive. Used if both `color` and `colormap` are None.
+        Ignored if either `color` or `colormap` are not None.
+    num_colors : int
+        Number of colors to be extracted.
+
+    Returns
+    -------
+    list
+        List of colors extracted.
+
+    Warns
+    -----
+    UserWarning
+        If both `colormap` and `color` are provided.
+        Parameter `color` will override.
+    """
     if color is None and colormap is not None:
-        if isinstance(colormap, str):
-            cmap = colormap
-            colormap = cm.get_cmap(colormap)
-            if colormap is None:
-                raise ValueError(f"Colormap {cmap} is not recognized")
-        colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)]
+        return _get_colors_from_colormap(colormap, num_colors=num_colors)
     elif color is not None:
         if colormap is not None:
             warnings.warn(
                 "'color' and 'colormap' cannot be used simultaneously.
Using 'color'" ) - colors = ( - list(color) - if is_list_like(color) and not isinstance(color, dict) - else color - ) + return _get_colors_from_color(color) else: - if color_type == "default": - # need to call list() on the result to copy so we don't - # modify the global rcParams below - try: - colors = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] - except KeyError: - colors = list(plt.rcParams.get("axes.color_cycle", list("bgrcmyk"))) - if isinstance(colors, str): - colors = list(colors) - - colors = colors[0:num_colors] - elif color_type == "random": - - def random_color(column): - """ Returns a random color represented as a list of length 3""" - # GH17525 use common._random_state to avoid resetting the seed - rs = com.random_state(column) - return rs.rand(3).tolist() - - colors = [random_color(num) for num in range(num_colors)] - else: - raise ValueError("color_type must be either 'default' or 'random'") + return _get_colors_from_color_type(color_type, num_colors=num_colors) - if isinstance(colors, str) and _is_single_color(colors): - # GH #36972 - colors = [colors] - # Append more colors by cycling if there is not enough color. - # Extra colors will be ignored by matplotlib if there are more colors - # than needed and nothing needs to be done here. +def _cycle_colors(colors: List[Color], num_colors: int) -> List[Color]: + """Append more colors by cycling if there is not enough color. + + Extra colors will be ignored by matplotlib if there are more colors + than needed and nothing needs to be done here. + """ if len(colors) < num_colors: - try: - multiple = num_colors // len(colors) - 1 - except ZeroDivisionError: - raise ValueError("Invalid color argument: ''") + multiple = num_colors // len(colors) - 1 mod = num_colors % len(colors) - colors += multiple * colors colors += colors[:mod] return colors -def _is_single_color(color: str) -> bool: - """Check if ``color`` is a single color. +def _get_colors_from_colormap( + colormap: Union[str, "Colormap"], + num_colors: int, +) -> List[Color]: + """Get colors from colormap.""" + colormap = _get_cmap_instance(colormap) + return [colormap(num) for num in np.linspace(0, 1, num=num_colors)] + + +def _get_cmap_instance(colormap: Union[str, "Colormap"]) -> "Colormap": + """Get instance of matplotlib colormap.""" + if isinstance(colormap, str): + cmap = colormap + colormap = cm.get_cmap(colormap) + if colormap is None: + raise ValueError(f"Colormap {cmap} is not recognized") + return colormap + + +def _get_colors_from_color( + color: Union[Color, Collection[Color]], +) -> List[Color]: + """Get colors from user input color.""" + if len(color) == 0: + raise ValueError(f"Invalid color argument: {color}") + + if _is_single_color(color): + color = cast(Color, color) + return [color] + + color = cast(Collection[Color], color) + return list(_gen_list_of_colors_from_iterable(color)) + + +def _is_single_color(color: Union[Color, Collection[Color]]) -> bool: + """Check if `color` is a single color, not a sequence of colors. + + Single color is of these kinds: + - Named color "red", "C0", "firebrick" + - Alias "g" + - Sequence of floats, such as (0.1, 0.2, 0.3) or (0.1, 0.2, 0.3, 0.4). 
+ + See Also + -------- + _is_single_string_color + """ + if isinstance(color, str) and _is_single_string_color(color): + # GH #36972 + return True + + if _is_floats_color(color): + return True + + return False + + +def _gen_list_of_colors_from_iterable(color: Collection[Color]) -> Iterator[Color]: + """ + Yield colors from string of several letters or from collection of colors. + """ + for x in color: + if _is_single_color(x): + yield x + else: + raise ValueError(f"Invalid color {x}") + + +def _is_floats_color(color: Union[Color, Collection[Color]]) -> bool: + """Check if color comprises a sequence of floats representing color.""" + return bool( + is_list_like(color) + and (len(color) == 3 or len(color) == 4) + and all(isinstance(x, (int, float)) for x in color) + ) + + +def _get_colors_from_color_type(color_type: str, num_colors: int) -> List[Color]: + """Get colors from user input color type.""" + if color_type == "default": + return _get_default_colors(num_colors) + elif color_type == "random": + return _get_random_colors(num_colors) + else: + raise ValueError("color_type must be either 'default' or 'random'") + + +def _get_default_colors(num_colors: int) -> List[Color]: + """Get `num_colors` of default colors from matplotlib rc params.""" + import matplotlib.pyplot as plt + + colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + return colors[0:num_colors] + + +def _get_random_colors(num_colors: int) -> List[Color]: + """Get `num_colors` of random colors.""" + return [_random_color(num) for num in range(num_colors)] + + +def _random_color(column: int) -> List[float]: + """Get a random color represented as a list of length 3""" + # GH17525 use common._random_state to avoid resetting the seed + rs = com.random_state(column) + return rs.rand(3).tolist() + + +def _is_single_string_color(color: Color) -> bool: + """Check if `color` is a single string color. - Examples of single colors: + Examples of single string colors: - 'r' - 'g' - 'red' - 'green' - 'C3' + - 'firebrick' Parameters ---------- - color : string - Color string. + color : Color + Color string or sequence of floats. Returns ------- bool - True if ``color`` looks like a valid color. + True if `color` looks like a valid color. False otherwise. 
""" conv = matplotlib.colors.ColorConverter() diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py new file mode 100644 index 0000000000000..665bda15724fd --- /dev/null +++ b/pandas/tests/plotting/test_style.py @@ -0,0 +1,157 @@ +import pytest + +from pandas import Series + +pytest.importorskip("matplotlib") +from pandas.plotting._matplotlib.style import get_standard_colors + + +class TestGetStandardColors: + @pytest.mark.parametrize( + "num_colors, expected", + [ + (3, ["red", "green", "blue"]), + (5, ["red", "green", "blue", "red", "green"]), + (7, ["red", "green", "blue", "red", "green", "blue", "red"]), + (2, ["red", "green"]), + (1, ["red"]), + ], + ) + def test_default_colors_named_from_prop_cycle(self, num_colors, expected): + import matplotlib as mpl + from matplotlib.pyplot import cycler + + mpl_params = { + "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + } + with mpl.rc_context(rc=mpl_params): + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["b"]), + (3, ["b", "g", "r"]), + (4, ["b", "g", "r", "y"]), + (5, ["b", "g", "r", "y", "b"]), + (7, ["b", "g", "r", "y", "b", "g", "r"]), + ], + ) + def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): + import matplotlib as mpl + from matplotlib.pyplot import cycler + + mpl_params = { + "axes.prop_cycle": cycler(color="bgry"), + } + with mpl.rc_context(rc=mpl_params): + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected_name", + [ + (1, ["C0"]), + (3, ["C0", "C1", "C2"]), + ( + 12, + [ + "C0", + "C1", + "C2", + "C3", + "C4", + "C5", + "C6", + "C7", + "C8", + "C9", + "C0", + "C1", + ], + ), + ], + ) + def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): + import matplotlib as mpl + import matplotlib.colors as mcolors + + with mpl.rc_context(rc={}): + expected = [mcolors.to_hex(x) for x in expected_name] + result = get_standard_colors(num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["red", "green", (0.1, 0.2, 0.3)]), + (2, ["red", "green", (0.1, 0.2, 0.3)]), + (3, ["red", "green", (0.1, 0.2, 0.3)]), + (4, ["red", "green", (0.1, 0.2, 0.3), "red"]), + ], + ) + def test_user_input_color_sequence(self, num_colors, expected): + color = ["red", "green", (0.1, 0.2, 0.3)] + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, ["r", "g", "b", "k"]), + (2, ["r", "g", "b", "k"]), + (3, ["r", "g", "b", "k"]), + (4, ["r", "g", "b", "k"]), + (5, ["r", "g", "b", "k", "r"]), + (6, ["r", "g", "b", "k", "r", "g"]), + ], + ) + def test_user_input_color_string(self, num_colors, expected): + color = "rgbk" + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "num_colors, expected", + [ + (1, [(0.1, 0.2, 0.3)]), + (2, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]), + (3, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]), + ], + ) + def test_user_input_color_floats(self, num_colors, expected): + color = (0.1, 0.2, 0.3) + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize( + "color, num_colors, expected", + [ + ("Crimson", 1, ["Crimson"]), + ("DodgerBlue", 2, ["DodgerBlue", 
"DodgerBlue"]), + ("firebrick", 3, ["firebrick", "firebrick", "firebrick"]), + ], + ) + def test_user_input_named_color_string(self, color, num_colors, expected): + result = get_standard_colors(color=color, num_colors=num_colors) + assert result == expected + + @pytest.mark.parametrize("color", ["", [], (), Series([], dtype="object")]) + def test_empty_color_raises(self, color): + with pytest.raises(ValueError, match="Invalid color argument"): + get_standard_colors(color=color, num_colors=1) + + @pytest.mark.parametrize( + "color", + [ + "bad_color", + ("red", "green", "bad_color"), + (0.1,), + (0.1, 0.2), + (0.1, 0.2, 0.3, 0.4, 0.5), # must be either 3 or 4 floats + ], + ) + def test_bad_color_raises(self, color): + with pytest.raises(ValueError, match="Invalid color"): + get_standard_colors(color=color, num_colors=5) From a5aed5d2979428b00e90e586093c69f6e21864ac Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Wed, 4 Nov 2020 03:21:00 +0100 Subject: [PATCH 11/21] DEPR: DataFrame/Series.slice_shift (#37601) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/generic.py | 13 ++++++++++++- pandas/tests/generic/test_finalize.py | 2 -- pandas/tests/generic/test_generic.py | 11 +++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 16e6c12488b83..fd5451505eefe 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -341,6 +341,8 @@ Deprecations - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) - :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) +- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`) + .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8050ce8b1b636..36ce2c4776bd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9347,10 +9347,13 @@ def shift( def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ Equivalent to `shift` without copying data. - The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. + .. deprecated:: 1.2.0 + slice_shift is deprecated, + use DataFrame/Series.shift instead. + Parameters ---------- periods : int @@ -9365,6 +9368,14 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: While the `slice_shift` is faster than `shift`, you may pay for it later during alignment. """ + + msg = ( + "The 'slice_shift' method is deprecated " + "and will be removed in a future version. 
" + "You can use DataFrame/Series.shift instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + if periods == 0: return self diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index d7aadda990f53..d16e854c25ed8 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -424,8 +424,6 @@ (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))), (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))), (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), - (pd.Series, ([1, 2],), operator.methodcaller("slice_shift")), - (pd.DataFrame, frame_data, operator.methodcaller("slice_shift")), pytest.param( ( pd.Series, diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 930c48cbdc214..7fde448bb36dc 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -480,3 +480,14 @@ def test_flags_identity(self, frame_or_series): assert s.flags is s.flags s2 = s.copy() assert s2.flags is not s.flags + + def test_slice_shift_deprecated(self): + # GH 37601 + df = DataFrame({"A": [1, 2, 3, 4]}) + s = Series([1, 2, 3, 4]) + + with tm.assert_produces_warning(FutureWarning): + df["A"].slice_shift() + + with tm.assert_produces_warning(FutureWarning): + s.slice_shift() From e38e987160c792f315685dc74fc1fc33d9389a71 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Nov 2020 18:25:34 -0800 Subject: [PATCH 12/21] REF: re-use validate_setitem_value in Categorical.fillna (#37597) --- pandas/core/arrays/categorical.py | 14 ++++---------- pandas/tests/arrays/categorical/test_missing.py | 5 ++++- pandas/tests/frame/methods/test_fillna.py | 2 +- pandas/tests/indexes/categorical/test_fillna.py | 4 ++-- pandas/tests/series/methods/test_fillna.py | 6 +++--- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b1f913e9ea641..9f0414cf7a806 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1655,21 +1655,15 @@ def fillna(self, value=None, method=None, limit=None): codes = self._ndarray.copy() mask = self.isna() + new_codes = self._validate_setitem_value(value) + if isinstance(value, (np.ndarray, Categorical)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - - not_categories = ~algorithms.isin(value, self.categories) - if not isna(value[not_categories]).all(): - # All entries in `value` must either be a category or NA - raise ValueError("fill value must be in categories") - - values_codes = _get_codes_for_values(value, self.categories) - codes[mask] = values_codes[mask] + codes[mask] = new_codes[mask] else: - new_code = self._validate_fill_value(value) - codes[mask] = new_code + codes[mask] = new_codes return self._from_backing_data(codes) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 21bea9356dcf0..364c290edc46c 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -60,7 +60,10 @@ def test_set_item_nan(self): ), (dict(), "Must specify a fill 'value' or 'method'."), (dict(method="bad"), "Invalid fill method. 
Expecting .* bad"), - (dict(value=Series([1, 2, 3, 4, "a"])), "fill value must be in categories"), + ( + dict(value=Series([1, 2, 3, 4, "a"])), + "Cannot setitem on a Categorical with a new category", + ), ], ) def test_fillna_raises(self, fillna_kwargs, msg): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 9fa1aa65379c5..bbb57da39705b 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -170,7 +170,7 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - msg = "'fill_value=4' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py index f6a6747166011..c8fc55c29054e 100644 --- a/pandas/tests/indexes/categorical/test_fillna.py +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -14,7 +14,7 @@ def test_fillna_categorical(self): tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = "'fill_value=2.0' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) @@ -36,7 +36,7 @@ def test_fillna_validates_with_no_nas(self): ci = CategoricalIndex([2, 3, 3]) cat = ci._data - msg = "'fill_value=False' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ci.fillna(False) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index d45486b9bdb29..aaa58cdb390f7 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -653,14 +653,14 @@ def test_fillna_categorical_raises(self): data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) - msg = "'fill_value=d' is not present in this Categorical's categories" + msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ser.fillna("d") - with pytest.raises(ValueError, match="fill value must be in categories"): + with pytest.raises(ValueError, match=msg): ser.fillna(Series("d")) - with pytest.raises(ValueError, match="fill value must be in categories"): + with pytest.raises(ValueError, match=msg): ser.fillna({1: "d", 3: "a"}) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' From 54dda900a180c099eaa89ff44cea7225c9d93bf0 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 3 Nov 2020 20:57:03 -0600 Subject: [PATCH 13/21] PERF: release gil for ewma_time (#37389) --- pandas/_libs/window/aggregations.pyx | 49 ++++++++++++++++------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index b2dbf7802e6f0..3556085bb300b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,14 +1,13 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import cython -from cython import Py_ssize_t from libcpp.deque cimport deque import numpy as np cimport numpy as cnp -from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t +from 
numpy cimport float32_t, float64_t, int64_t, ndarray

 cnp.import_array()

@@ -1398,7 +1397,7 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights,
 # ----------------------------------------------------------------------
 # Exponentially weighted moving average

-def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times,
+def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times,
               int64_t halflife):
     """
     Compute exponentially-weighted moving average using halflife and time
@@ -1416,30 +1415,40 @@ def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times,
     ndarray
     """
     cdef:
-        Py_ssize_t i, num_not_nan = 0, N = len(vals)
+        Py_ssize_t i, j, num_not_nan = 0, N = len(vals)
         bint is_not_nan
-        float64_t last_result
-        ndarray[uint8_t] mask = np.zeros(N, dtype=np.uint8)
-        ndarray[float64_t] weights, observations, output = np.empty(N, dtype=np.float64)
+        float64_t last_result, weights_dot, weights_sum, weight, halflife_float
+        float64_t[:] times_float
+        float64_t[:] observations = np.zeros(N, dtype=float)
+        float64_t[:] times_masked = np.zeros(N, dtype=float)
+        ndarray[float64_t] output = np.empty(N, dtype=float)

     if N == 0:
         return output

+    halflife_float = halflife
+    times_float = times.astype(float)
     last_result = vals[0]

-    for i in range(N):
-        is_not_nan = vals[i] == vals[i]
-        num_not_nan += is_not_nan
-        if is_not_nan:
-            mask[i] = 1
-            weights = 0.5 ** ((times[i] - times[mask.view(np.bool_)]) / halflife)
-            observations = vals[mask.view(np.bool_)]
-            last_result = np.sum(weights * observations) / np.sum(weights)
-
-        if num_not_nan >= minp:
-            output[i] = last_result
-        else:
-            output[i] = NaN
+    with nogil:
+        for i in range(N):
+            is_not_nan = vals[i] == vals[i]
+            num_not_nan += is_not_nan
+            if is_not_nan:
+                times_masked[num_not_nan-1] = times_float[i]
+                observations[num_not_nan-1] = vals[i]
+
+                weights_sum = 0
+                weights_dot = 0
+                for j in range(num_not_nan):
+                    weight = 0.5 ** (
+                        (times_float[i] - times_masked[j]) / halflife_float)
+                    weights_sum += weight
+                    weights_dot += weight * observations[j]
+
+                last_result = weights_dot / weights_sum
+
+            output[i] = last_result if num_not_nan >= minp else NaN

     return output

From f6f3dd3e77278c9932105664a94aaca5c1422880 Mon Sep 17 00:00:00 2001
From: patrick <61934744+phofl@users.noreply.github.com>
Date: Wed, 4 Nov 2020 03:59:02 +0100
Subject: [PATCH 14/21] BUG: Groupby dropped nan groups from result when
 grouping over a single column (#36842)

---
 doc/source/whatsnew/v1.2.0.rst              |  1 +
 pandas/_libs/lib.pyx                        | 29 +++++++++++++--------
 pandas/core/groupby/ops.py                  |  9 +++----
 pandas/core/sorting.py                      | 11 ++++++--
 pandas/tests/groupby/test_groupby.py        |  7 +++++
 pandas/tests/groupby/test_groupby_dropna.py | 20 +++++++++++++-
 pandas/tests/window/test_rolling.py         | 15 +++++++++++
 7 files changed, 72 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index fd5451505eefe..e811bbc9ab7a0 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -527,6 +527,7 @@ Groupby/resample/rolling
 - Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
 - Bug in :meth:`df.groupby(..).quantile() <pandas.core.groupby.DataFrameGroupBy.quantile>` and :meth:`df.resample(..).quantile() <pandas.core.resample.Resampler.quantile>` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
 - Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for
:class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) +- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e493e5e9d41d3..0b0334d52c1e9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, if lab != cur: if lab != -1: - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][i - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) - + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][i - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][i - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:i] start = i cur = lab - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][n - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][n - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][n - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:] return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bca71b5c9646b..ccf23a6f24c42 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -229,12 +229,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ - if len(self.groupings) == 1: - return self.groupings[0].indices - else: - codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + codes_list = [ping.codes for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) @property def codes(self) -> List[np.ndarray]: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2e32a7572adc7..e390229b5dcba 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -4,6 +4,7 @@ TYPE_CHECKING, Callable, DefaultDict, + Dict, Iterable, List, Optional, @@ -528,16 +529,22 @@ def get_flattened_list( return [tuple(array) for array in arrays.values()] -def get_indexer_dict(label_list, keys): +def get_indexer_dict( + label_list: List[np.ndarray], keys: List["Index"] +) -> Dict[Union[str, Tuple], np.ndarray]: """ Returns ------- - dict + dict: Labels mapped to indexers. 
""" shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) + if np.all(group_index == -1): + # When all keys are nan and dropna=True, indices_fast can't handle this + # and the return is empty anyway + return {} ngroups = ( ((group_index.size and group_index.max()) + 1) if is_int64_overflow_possible(shape) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2563eeeb68672..a0c228200e73a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude(): grouped.get_group(pd.NaT) +def test_groupby_two_group_keys_all_nan(): + # GH #36842: Grouping over two group keys shouldn't raise an error + df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]}) + result = df.groupby(["a", "b"]).indices + assert result == {} + + def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 29a8f883f0ff5..02ce4dcf2ae2b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, expected = pd.DataFrame(selected_data, index=mi) tm.assert_frame_equal(result, expected) + + +def test_groupby_nan_included(): + # GH 35646 + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + df = pd.DataFrame(data) + grouped = df.groupby("group", dropna=False) + result = grouped.indices + dtype = "int64" + expected = { + "g1": np.array([0, 2], dtype=dtype), + "g2": np.array([3], dtype=dtype), + np.nan: np.array([1, 4], dtype=dtype), + } + for result_values, expected_values in zip(result.values(), expected.values()): + tm.assert_numpy_array_equal(result_values, expected_values) + assert np.isnan(list(result.keys())[2]) + assert list(result.keys())[0:2] == ["g1", "g2"] diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2c8439aae75e5..02bcfab8d3388 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window): result = x.rolling(window).corr(y) expected = Series([np.nan, np.nan, 1, 1, 1], index=index) tm.assert_almost_equal(result, expected) + + +def test_groupby_rolling_nan_included(): + # GH 35542 + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + df = DataFrame(data) + result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean() + expected = DataFrame( + {"B": [0.0, 2.0, 3.0, 1.0, 4.0]}, + index=pd.MultiIndex.from_tuples( + [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], + names=["group", None], + ), + ) + tm.assert_frame_equal(result, expected) From d3788525ee368db0b6565b72ab4eeba9140fd1d4 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Wed, 4 Nov 2020 04:00:05 +0100 Subject: [PATCH 15/21] ENH: implement timeszones support for read_json(orient='table') and astype() from 'object' (#35973) --- doc/source/whatsnew/v1.2.0.rst | 3 ++ pandas/core/arrays/datetimes.py | 8 ++- pandas/io/json/_json.py | 4 +- pandas/io/json/_table_schema.py | 4 -- 
 pandas/tests/frame/methods/test_astype.py    | 24 +++++++++
 .../tests/io/json/test_json_table_schema.py  | 54 ++++++++++++++++---
 6 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index e811bbc9ab7a0..0937ec3866e12 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -217,6 +217,7 @@ Other enhancements
 - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
 - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
 - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+-
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
@@ -393,6 +394,8 @@ Datetimelike
 - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`)
 - :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`)
 - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`)
+- :meth:`to_json` and :meth:`read_json` now implement timezone parsing when ``orient='table'``.
+- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`).
- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) - Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) - Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index f655d10881011..905242bfdd8ad 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1968,7 +1968,13 @@ def sequence_to_dt64ns( data, inferred_tz = objects_to_datetime64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) - tz = _maybe_infer_tz(tz, inferred_tz) + if tz and inferred_tz: + # two timezones: convert to intended from base UTC repr + data = tzconversion.tz_convert_from_utc(data.view("i8"), tz) + data = data.view(DT64NS_DTYPE) + elif inferred_tz: + tz = inferred_tz + data_dtype = data.dtype # `data` may have originally been a Categorical[datetime64[ns, tz]], diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 98b9a585d890e..0cc6ca984b25d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -262,7 +262,9 @@ def __init__( # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): - raise NotImplementedError("orient='table' is not supported for MultiIndex") + raise NotImplementedError( + "orient='table' is not supported for MultiIndex columns" + ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 if ( diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 2b4c86b3c4406..0499a35296490 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -323,10 +323,6 @@ def parse_table_schema(json, precise_float): for field in table["schema"]["fields"] } - # Cannot directly use as_type with timezone data on object; raise for now - if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone data') - # No ISO constructor for Timedelta as of yet, so need to raise if "timedelta64" in dtypes.values(): raise NotImplementedError( diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index d3f256259b15f..f05c90f37ea8a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -587,3 +587,27 @@ def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): msg = "(Cannot cast)|(could not convert)" with pytest.raises((ValueError, TypeError), match=msg): df.astype(float, errors=errors) + + def test_astype_tz_conversion(self): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + df = DataFrame(val) + result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) + + expected = df + expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) + def test_astype_tz_object_conversion(self, tz): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + expected = DataFrame(val) + + # convert expected to object dtype from other tz str (independently tested) + result = expected.astype({"tz": f"datetime64[ns, {tz}]"}) + result = result.astype({"tz": "object"}) + + # do real test: object dtype to a specified tz, 
different from construction tz. + result = result.astype({"tz": "datetime64[ns, Europe/London]"}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 6e35b224ef4c3..dba4b9214e50c 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -676,6 +676,11 @@ class TestTableOrientReader: {"floats": [1.0, 2.0, 3.0, 4.0]}, {"floats": [1.1, 2.2, 3.3, 4.4]}, {"bools": [True, False, False, True]}, + { + "timezones": pd.date_range( + "2016-01-01", freq="d", periods=4, tz="US/Central" + ) # added in # GH 35973 + }, ], ) @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309") @@ -686,22 +691,59 @@ def test_read_json_table_orient(self, index_nm, vals, recwarn): tm.assert_frame_equal(df, result) @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) + @pytest.mark.parametrize( + "vals", + [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}], + ) + def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): + df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + out = df.to_json(orient="table") + with pytest.raises(NotImplementedError, match="can not yet read "): + pd.read_json(out, orient="table") + + @pytest.mark.parametrize( + "idx", + [ + pd.Index(range(4)), + pd.Index( + pd.date_range( + "2020-08-30", + freq="d", + periods=4, + ), + freq=None, + ), + pd.Index( + pd.date_range("2020-08-30", freq="d", periods=4, tz="US/Central"), + freq=None, + ), + pd.MultiIndex.from_product( + [ + pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + ["x", "y"], + ], + ), + ], + ) @pytest.mark.parametrize( "vals", [ - {"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}, + {"floats": [1.1, 2.2, 3.3, 4.4]}, + {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="d", periods=4, tz="Europe/London" ) }, ], ) - def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): - df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) + @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309") + def test_read_json_table_timezones_orient(self, idx, vals, recwarn): + # GH 35973 + df = DataFrame(vals, index=idx) out = df.to_json(orient="table") - with pytest.raises(NotImplementedError, match="can not yet read "): - pd.read_json(out, orient="table") + result = pd.read_json(out, orient="table") + tm.assert_frame_equal(df, result) def test_comprehensive(self): df = DataFrame( From a648fb2699ef44555b38db89c4af2e97cfcf8208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 3 Nov 2020 23:09:16 -0500 Subject: [PATCH 16/21] REF/BUG/TYP: read_csv shouldn't close user-provided file handles (#36997) * BUG/REF: read_csv shouldn't close user-provided file handles * get_handle: typing, returns is_wrapped, use dataclass, and make sure that all created handlers are returned * remove unused imports * added IOHandleArgs.close * added IOArgs.close * mostly comments * move memory_map from TextReader to CParserWrapper * moved IOArgs and IOHandles * more comments Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/parsers.pyx | 122 ++------------ pandas/_typing.py | 29 +--- pandas/core/frame.py | 6 +- pandas/io/common.py | 178 ++++++++++++++++----- pandas/io/excel/_base.py | 40 +++-- pandas/io/feather_format.py | 9 +- 
 pandas/io/formats/csvs.py                    |  54 ++-----
 pandas/io/formats/format.py                  |  14 +-
 pandas/io/json/_json.py                      |  88 +++++-----
 pandas/io/orc.py                             |   1 +
 pandas/io/parsers.py                         |  92 +++++------
 pandas/io/pickle.py                          |  42 ++---
 pandas/io/sas/sas7bdat.py                    |  21 +--
 pandas/io/sas/sas_xport.py                   |  19 +--
 pandas/io/sas/sasreader.py                   |   3 +-
 pandas/io/stata.py                           | 158 ++++++++----------
 pandas/tests/frame/methods/test_to_csv.py    |   7 +-
 pandas/tests/io/json/test_readlines.py       |   2 +-
 pandas/tests/io/parser/test_common.py        |  61 ++++++-
 pandas/tests/io/parser/test_encoding.py      |   4 +
 pandas/tests/io/parser/test_textreader.py    |   7 +-
 pandas/tests/io/test_compression.py          |  26 +--
 pandas/tests/series/methods/test_to_csv.py   |   6 +-
 24 files changed, 480 insertions(+), 510 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 0937ec3866e12..33e9bd0c2732a 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -498,6 +498,7 @@ I/O
 - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
 - Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
 - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
+- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
 - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`)

 Plotting
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index b87e46f9b6648..4b7a47c5f93c2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1,15 +1,10 @@
 # Copyright (c) 2012, Lambda Foundry, Inc.
 # See LICENSE for the license
-import bz2
 from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
 from errno import ENOENT
-import gzip
-import io
-import os
 import sys
 import time
 import warnings
-import zipfile

 from libc.stdlib cimport free
 from libc.string cimport strcasecmp, strlen, strncpy
@@ -17,7 +12,7 @@ from libc.string cimport strcasecmp, strlen, strncpy
 import cython
 from cython import Py_ssize_t

-from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
+from cpython.bytes cimport PyBytes_AsString
 from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
 from cpython.object cimport PyObject
 from cpython.ref cimport Py_XDECREF
@@ -67,7 +62,6 @@ from pandas._libs.khash cimport (
     khiter_t,
 )

-from pandas.compat import get_lzma_file, import_lzma
 from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning

 from pandas.core.dtypes.common import (
@@ -82,11 +76,10 @@ from pandas.core.dtypes.common import (
 )
 from pandas.core.dtypes.concat import union_categoricals

-lzma = import_lzma()
-
 cdef:
     float64_t INF = np.inf
     float64_t NEGINF = -INF
+    int64_t DEFAULT_CHUNKSIZE = 256 * 1024


 cdef extern from "headers/portable.h":
@@ -275,14 +268,15 @@ cdef extern from "parser/io.h":
                      size_t *bytes_read, int *status)


-DEFAULT_CHUNKSIZE = 256 * 1024
-
-
 cdef class TextReader:
     """

     # source: StringIO or file object

+    .. versionchanged:: 1.2.0
+        removed the 'compression', 'memory_map', and 'encoding' arguments.
+        These arguments are outsourced to CParserWrapper.
+        'source' has to be a file handle.
""" cdef: @@ -299,7 +293,7 @@ cdef class TextReader: cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory + bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace object delimiter, converters object na_values @@ -307,8 +301,6 @@ cdef class TextReader: object index_col object skiprows object dtype - object encoding - object compression object usecols list dtype_cast_order set unnamed_cols @@ -321,10 +313,8 @@ cdef class TextReader: header_end=0, index_col=None, names=None, - bint memory_map=False, tokenize_chunksize=DEFAULT_CHUNKSIZE, bint delim_whitespace=False, - compression=None, converters=None, bint skipinitialspace=False, escapechar=None, @@ -332,7 +322,6 @@ cdef class TextReader: quotechar=b'"', quoting=0, lineterminator=None, - encoding=None, comment=None, decimal=b'.', thousands=None, @@ -356,15 +345,7 @@ cdef class TextReader: bint skip_blank_lines=True): # set encoding for native Python and C library - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding + self.c_encoding = NULL self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -374,9 +355,6 @@ cdef class TextReader: # For timekeeping self.clocks = [] - self.compression = compression - self.memory_map = memory_map - self.parser.usecols = (usecols is not None) self._setup_parser_source(source) @@ -562,11 +540,6 @@ cdef class TextReader: parser_del(self.parser) def close(self): - # we need to properly close an open derived - # filehandle here, e.g. and UTFRecoder - if self.handle is not None: - self.handle.close() - # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -614,82 +587,15 @@ cdef class TextReader: cdef: void *ptr - self.parser.cb_io = NULL - self.parser.cb_cleanup = NULL - - if self.compression: - if self.compression == 'gzip': - if isinstance(source, str): - source = gzip.GzipFile(source, 'rb') - else: - source = gzip.GzipFile(fileobj=source) - elif self.compression == 'bz2': - source = bz2.BZ2File(source, 'rb') - elif self.compression == 'zip': - zip_file = zipfile.ZipFile(source) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - source = zip_file.open(file_name) - - elif len(zip_names) == 0: - raise ValueError(f'Zero files found in compressed ' - f'zip file {source}') - else: - raise ValueError(f'Multiple files found in compressed ' - f'zip file {zip_names}') - elif self.compression == 'xz': - if isinstance(source, str): - source = get_lzma_file(lzma)(source, 'rb') - else: - source = get_lzma_file(lzma)(filename=source) - else: - raise ValueError(f'Unrecognized compression type: ' - f'{self.compression}') - - if (self.encoding and hasattr(source, "read") and - not hasattr(source, "encoding")): - source = io.TextIOWrapper( - source, self.encoding.decode('utf-8'), newline='') - - self.encoding = b'utf-8' - self.c_encoding = self.encoding - - self.handle = source - - if isinstance(source, str): - encoding = sys.getfilesystemencoding() or "utf-8" - usource = source - source = source.encode(encoding) - - if self.memory_map: - ptr = new_mmap(source) - if ptr == NULL: - # fall back - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - else: - self.parser.cb_io 
= &buffer_mmap_bytes - self.parser.cb_cleanup = &del_mmap - else: - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - self.parser.source = ptr - - elif hasattr(source, 'read'): - # e.g., StringIO - - ptr = new_rd_source(source) - self.parser.source = ptr - self.parser.cb_io = &buffer_rd_bytes - self.parser.cb_cleanup = &del_rd_source - else: + if not hasattr(source, "read"): raise IOError(f'Expected file path name or file-like object, ' f'got {type(source)} type') + ptr = new_rd_source(source) + self.parser.source = ptr + self.parser.cb_io = &buffer_rd_bytes + self.parser.cb_cleanup = &del_rd_source + cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] diff --git a/pandas/_typing.py b/pandas/_typing.py index 3376559fb23ff..3e89cf24632e2 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo -from io import IOBase +from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper +from mmap import mmap from pathlib import Path from typing import ( IO, @@ -10,7 +10,6 @@ Callable, Collection, Dict, - Generic, Hashable, List, Mapping, @@ -77,8 +76,6 @@ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] -FileOrBuffer = Union[str, IO[AnyStr], IOBase] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -133,6 +130,10 @@ "Resampler", ] +# filenames and file-like-objects +Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] +FileOrBuffer = Union[str, Buffer[T]] +FilePathOrBuffer = Union[Path, FileOrBuffer[T]] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] @@ -150,21 +151,3 @@ # type of float formatter in DataFrameFormatter FloatFormatType = Union[str, Callable, "EngFormatter"] - - -@dataclass -class IOargs(Generic[ModeVar, EncodingVar]): - """ - Return value of io/common.py:get_filepath_or_buffer. - - Note (copy&past from io/parsers): - filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - though mypy handling of conditional imports is difficult. 
-    See https://github.com/python/mypy/issues/1297
-    """
-
-    filepath_or_buffer: FileOrBuffer
-    encoding: EncodingVar
-    compression: CompressionDict
-    should_close: bool
-    mode: Union[ModeVar, str]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 24b89085ac121..a3130ec27713d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -15,6 +15,7 @@
 import datetime
 from io import StringIO
 import itertools
+import mmap
 from textwrap import dedent
 from typing import (
     IO,
@@ -2286,10 +2287,9 @@ def to_markdown(
         if buf is None:
             return result
         ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options)
-        assert not isinstance(ioargs.filepath_or_buffer, str)
+        assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap))
         ioargs.filepath_or_buffer.writelines(result)
-        if ioargs.should_close:
-            ioargs.filepath_or_buffer.close()
+        ioargs.close()
         return None
 
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
diff --git a/pandas/io/common.py b/pandas/io/common.py
index c147ae9fd0aa8..90a79e54015c4 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -2,8 +2,9 @@
 
 import bz2
 from collections import abc
+import dataclasses
 import gzip
-from io import BufferedIOBase, BytesIO, RawIOBase
+from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper
 import mmap
 import os
 import pathlib
@@ -13,12 +14,14 @@
     Any,
     AnyStr,
     Dict,
+    Generic,
     List,
     Mapping,
     Optional,
     Tuple,
     Type,
     Union,
+    cast,
 )
 from urllib.parse import (
     urljoin,
@@ -31,12 +34,12 @@
 import zipfile
 
 from pandas._typing import (
+    Buffer,
     CompressionDict,
     CompressionOptions,
     EncodingVar,
     FileOrBuffer,
     FilePathOrBuffer,
-    IOargs,
     ModeVar,
     StorageOptions,
 )
@@ -56,6 +59,76 @@
     from io import IOBase
 
 
+@dataclasses.dataclass
+class IOArgs(Generic[ModeVar, EncodingVar]):
+    """
+    Return value of io/common.py:get_filepath_or_buffer.
+
+    This is used to easily close created fsspec objects.
+
+    Note (copy&paste from io/parsers):
+    filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
+    though mypy handling of conditional imports is difficult.
+    See https://github.com/python/mypy/issues/1297
+    """
+
+    filepath_or_buffer: FileOrBuffer
+    encoding: EncodingVar
+    mode: Union[ModeVar, str]
+    compression: CompressionDict
+    should_close: bool = False
+
+    def close(self) -> None:
+        """
+        Close the buffer if it was created by get_filepath_or_buffer.
+        """
+        if self.should_close:
+            assert not isinstance(self.filepath_or_buffer, str)
+            try:
+                self.filepath_or_buffer.close()
+            except (OSError, ValueError):
+                pass
+        self.should_close = False
+
+
+@dataclasses.dataclass
+class IOHandles:
+    """
+    Return value of io/common.py:get_handle.
+
+    This is used to easily close created buffers and to handle corner cases when
+    TextIOWrapper is inserted.
+
+    handle: The file handle to be used.
+    created_handles: All file handles that are created by get_handle
+    is_wrapped: Whether a TextIOWrapper needs to be detached.
+    """
+
+    handle: Buffer
+    created_handles: List[Buffer] = dataclasses.field(default_factory=list)
+    is_wrapped: bool = False
+
+    def close(self) -> None:
+        """
+        Close all created buffers.
+
+        Note: If a TextIOWrapper was inserted, it is flushed and detached to
+        avoid closing the potentially user-created buffer.
+ """ + if self.is_wrapped: + assert isinstance(self.handle, TextIOWrapper) + self.handle.flush() + self.handle.detach() + self.created_handles.remove(self.handle) + try: + for handle in self.created_handles: + handle.close() + except (OSError, ValueError): + pass + self.created_handles = [] + self.is_wrapped = False + + def is_url(url) -> bool: """ Check to see if a URL has a valid protocol. @@ -176,7 +249,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, -) -> IOargs[ModeVar, EncodingVar]: +) -> IOArgs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -201,7 +274,7 @@ def get_filepath_or_buffer( ..versionchange:: 1.2.0 - Returns the dataclass IOargs. + Returns the dataclass IOArgs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -225,6 +298,10 @@ def get_filepath_or_buffer( compression = dict(compression, method=compression_method) + # uniform encoding names + if encoding is not None: + encoding = encoding.replace("_", "-").lower() + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( @@ -258,7 +335,7 @@ def get_filepath_or_buffer( compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() - return IOargs( + return IOArgs( filepath_or_buffer=reader, encoding=encoding, compression=compression, @@ -310,7 +387,7 @@ def get_filepath_or_buffer( filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() - return IOargs( + return IOArgs( filepath_or_buffer=file_obj, encoding=encoding, compression=compression, @@ -323,7 +400,7 @@ def get_filepath_or_buffer( ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return IOargs( + return IOArgs( filepath_or_buffer=_expand_user(filepath_or_buffer), encoding=encoding, compression=compression, @@ -335,7 +412,7 @@ def get_filepath_or_buffer( msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return IOargs( + return IOArgs( filepath_or_buffer=filepath_or_buffer, encoding=encoding, compression=compression, @@ -455,14 +532,14 @@ def infer_compression( def get_handle( - path_or_buf, + path_or_buf: FilePathOrBuffer, mode: str, - encoding=None, + encoding: Optional[str] = None, compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, - errors=None, -): + errors: Optional[str] = None, +) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -506,14 +583,9 @@ def get_handle( See the errors argument for :func:`open` for a full list of options. - .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 - Returns - ------- - f : file-like - A file-like object. - handles : list of file-like objects - A list of file-like object that were opened in this function. + Returns the dataclass IOHandles """ need_text_wrapping: Tuple[Type["IOBase"], ...] try: @@ -532,12 +604,16 @@ def get_handle( except ImportError: pass - handles: List[Union[IO, _MMapWrapper]] = list() - f = path_or_buf + handles: List[Buffer] = list() + + # Windows does not default to utf-8. 
Set to utf-8 for a consistent behavior + if encoding is None: + encoding = "utf-8" # Convert pathlib.Path/py.path.local or string path_or_buf = stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + f = path_or_buf compression, compression_args = get_compression_method(compression) if is_path: @@ -548,25 +624,29 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: + assert isinstance(path_or_buf, str) f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) + f = gzip.GzipFile( + fileobj=path_or_buf, # type: ignore[arg-type] + mode=mode, + **compression_args, + ) # BZ Compression elif compression == "bz2": - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + f = bz2.BZ2File( + path_or_buf, mode=mode, **compression_args # type: ignore[arg-type] + ) # ZIP Compression elif compression == "zip": - zf = _BytesZipFile(path_or_buf, mode, **compression_args) - # Ensure the container is closed as well. - handles.append(zf) - if zf.mode == "w": - f = zf - elif zf.mode == "r": - zip_names = zf.namelist() + f = _BytesZipFile(path_or_buf, mode, **compression_args) + if f.mode == "r": + handles.append(f) + zip_names = f.namelist() if len(zip_names) == 1: - f = zf.open(zip_names.pop()) + f = f.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: @@ -584,36 +664,40 @@ def get_handle( msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) + assert not isinstance(f, str) handles.append(f) elif is_path: # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. is_binary_mode = "b" in mode - + assert isinstance(path_or_buf, str) if encoding and not is_binary_mode: # Encoding f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text and not is_binary_mode: - # No explicit encoding - f = open(path_or_buf, mode, errors="replace", newline="") else: # Binary mode f = open(path_or_buf, mode) handles.append(f) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression or isinstance(f, need_text_wrapping)): - from io import TextIOWrapper - - g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") - if not isinstance(f, (BufferedIOBase, RawIOBase)): - handles.append(g) - f = g + is_wrapped = False + if is_text and ( + compression + or isinstance(f, need_text_wrapping) + or "b" in getattr(f, "mode", "") + ): + f = TextIOWrapper( + f, encoding=encoding, errors=errors, newline="" # type: ignore[arg-type] + ) + handles.append(f) + # do not mark as wrapped when the user provided a string + is_wrapped = not is_path if memory_map and hasattr(f, "fileno"): + assert not isinstance(f, str) try: - wrapped = _MMapWrapper(f) + wrapped = cast(mmap.mmap, _MMapWrapper(f)) # type: ignore[arg-type] f.close() handles.remove(f) handles.append(wrapped) @@ -625,7 +709,13 @@ def get_handle( # leave the file handler as is then pass - return f, handles + handles.reverse() # close the most recently added buffer first + assert not isinstance(f, str) + return IOHandles( + handle=f, + created_handles=handles, + is_wrapped=is_wrapped, + ) # error: Definition of "__exit__" in base class "ZipFile" is incompatible with diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3461652f4ea24..03c61c3ed8376 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -17,6 +17,7 @@ 
from pandas.core.frame import DataFrame from pandas.io.common import ( + IOArgs, get_filepath_or_buffer, is_url, stringify_path, @@ -349,24 +350,37 @@ def read_excel( class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + self.ioargs = IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=None, + mode=None, + compression={"method": None}, + ) # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): - filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + self.ioargs = IOArgs( + filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()), + should_close=True, + encoding=None, + mode=None, + compression={"method": None}, + ) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - ).filepath_or_buffer + ) - if isinstance(filepath_or_buffer, self._workbook_class): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): + if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class): + self.book = self.ioargs.filepath_or_buffer + elif hasattr(self.ioargs.filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too - filepath_or_buffer.seek(0) - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, str): - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, bytes): - self.book = self.load_workbook(BytesIO(filepath_or_buffer)) + assert not isinstance(self.ioargs.filepath_or_buffer, str) + self.ioargs.filepath_or_buffer.seek(0) + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, str): + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -382,7 +396,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - pass + self.ioargs.close() @property @abc.abstractmethod diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 9a42b8289ab47..198acd5862d45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -81,9 +81,7 @@ def to_feather( feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() def read_feather( @@ -137,9 +135,6 @@ def read_feather( ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) ) - # s3fs only validates the credentials when the file is closed. 
- if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6c62d6825bc84..20226dbb3c9d4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,7 +3,6 @@ """ import csv as csvlib -from io import StringIO, TextIOWrapper import os from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union @@ -39,7 +38,7 @@ class CSVFormatter: def __init__( self, formatter: "DataFrameFormatter", - path_or_buf: Optional[FilePathOrBuffer[str]] = None, + path_or_buf: FilePathOrBuffer[str] = "", sep: str = ",", cols: Optional[Sequence[Label]] = None, index_label: Optional[IndexLabel] = None, @@ -60,25 +59,14 @@ def __init__( self.obj = self.fmt.frame - self.encoding = encoding or "utf-8" - - if path_or_buf is None: - path_or_buf = StringIO() - - ioargs = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( path_or_buf, - encoding=self.encoding, + encoding=encoding, compression=compression, mode=mode, storage_options=storage_options, ) - self.compression = ioargs.compression.pop("method") - self.compression_args = ioargs.compression - self.path_or_buf = ioargs.filepath_or_buffer - self.should_close = ioargs.should_close - self.mode = ioargs.mode - self.sep = sep self.index_label = self._initialize_index_label(index_label) self.errors = errors @@ -238,20 +226,19 @@ def save(self) -> None: """ Create the writer & save. """ - # get a handle or wrap an existing handle to take care of 1) compression and - # 2) text -> byte conversion - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, + # apply compression and byte/text conversion + handles = get_handle( + self.ioargs.filepath_or_buffer, + self.ioargs.mode, + encoding=self.ioargs.encoding, errors=self.errors, - compression=dict(self.compression_args, method=self.compression), + compression=self.ioargs.compression, ) try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - f, + handles.handle, # type: ignore[arg-type] lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -263,23 +250,10 @@ def save(self) -> None: self._save() finally: - if self.should_close: - f.close() - elif ( - isinstance(f, TextIOWrapper) - and not f.closed - and f != self.path_or_buf - and hasattr(self.path_or_buf, "write") - ): - # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper - # closes the wrapped handle if it is not detached. 
- f.flush() # make sure everything is written - f.detach() # makes f unusable - del f - elif f != self.path_or_buf: - f.close() - for _fh in handles: - _fh.close() + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + self.ioargs.close() def _save(self) -> None: if self._need_to_save_header: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3c759f477899b..43e76d0aef490 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1046,8 +1046,12 @@ def to_csv( """ from pandas.io.formats.csvs import CSVFormatter + created_buffer = path_or_buf is None + if created_buffer: + path_or_buf = StringIO() + csv_formatter = CSVFormatter( - path_or_buf=path_or_buf, + path_or_buf=path_or_buf, # type: ignore[arg-type] line_terminator=line_terminator, sep=sep, encoding=encoding, @@ -1067,9 +1071,11 @@ def to_csv( ) csv_formatter.save() - if path_or_buf is None: - assert isinstance(csv_formatter.path_or_buf, StringIO) - return csv_formatter.path_or_buf.getvalue() + if created_buffer: + assert isinstance(path_or_buf, StringIO) + content = path_or_buf.getvalue() + path_or_buf.close() + return content return None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0cc6ca984b25d..040279b9f3e67 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from collections import abc import functools -from io import BytesIO, StringIO +from io import StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Mapping, Optional, Tuple, Type, Union +from typing import Any, Callable, Mapping, Optional, Tuple, Type, Union import numpy as np @@ -26,7 +26,12 @@ from pandas.core.generic import NDFrame from pandas.core.reshape.concat import concat -from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle +from pandas.io.common import ( + IOHandles, + get_compression_method, + get_filepath_or_buffer, + get_handle, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import validate_integer @@ -59,17 +64,6 @@ def to_json( "'index=False' is only valid when 'orient' is 'split' or 'table'" ) - if path_or_buf is not None: - ioargs = get_filepath_or_buffer( - path_or_buf, - compression=compression, - mode="wt", - storage_options=storage_options, - ) - path_or_buf = ioargs.filepath_or_buffer - should_close = ioargs.should_close - compression = ioargs.compression - if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -101,20 +95,27 @@ def to_json( if lines: s = convert_to_line_delimits(s) - if isinstance(path_or_buf, str): - fh, handles = get_handle(path_or_buf, "w", compression=compression) + if path_or_buf is not None: + # open fsspec URLs + ioargs = get_filepath_or_buffer( + path_or_buf, + compression=compression, + mode="wt", + storage_options=storage_options, + ) + # apply compression and byte/text conversion + handles = get_handle( + ioargs.filepath_or_buffer, "w", compression=ioargs.compression + ) try: - fh.write(s) + handles.handle.write(s) finally: - fh.close() - for handle in handles: - handle.close() - elif path_or_buf is None: - return s + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() else: - path_or_buf.write(s) - if should_close: - path_or_buf.close() + 
return s class Writer(ABC): @@ -547,12 +548,10 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True - if encoding is None: - encoding = "utf-8" ioargs = get_filepath_or_buffer( path_or_buf, - encoding=encoding, + encoding=encoding or "utf-8", compression=compression, storage_options=storage_options, ) @@ -579,9 +578,7 @@ def read_json( return json_reader result = json_reader.read() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return result @@ -631,9 +628,8 @@ def __init__( self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 - self.should_close = False self.nrows = nrows - self.file_handles: List[IO] = [] + self.handles: Optional[IOHandles] = None if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) @@ -672,30 +668,25 @@ def _get_data_from_filepath(self, filepath_or_buffer): This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. """ - data = filepath_or_buffer - + # if it is a string but the file does not exist, it might be a JSON string exists = False - if isinstance(data, str): + if isinstance(filepath_or_buffer, str): try: exists = os.path.exists(filepath_or_buffer) # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): pass - if exists or self.compression["method"] is not None: - data, self.file_handles = get_handle( + if exists or not isinstance(filepath_or_buffer, str): + self.handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, compression=self.compression, ) - self.should_close = True - self.open_stream = data - - if isinstance(data, BytesIO): - data = data.getvalue().decode() + filepath_or_buffer = self.handles.handle - return data + return filepath_or_buffer def _combine_lines(self, lines) -> str: """ @@ -759,13 +750,8 @@ def close(self): If an open stream or file was passed, we leave it open. 
""" - if self.should_close: - try: - self.open_stream.close() - except (OSError, AttributeError): - pass - for file_handle in self.file_handles: - file_handle.close() + if self.handles is not None: + self.handles.close() def __next__(self): if self.nrows: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 829ff6408d86d..5a734f0878a0c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -53,4 +53,5 @@ def read_orc( ioargs = get_filepath_or_buffer(path) orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) result = orc_file.read(columns=columns, **kwargs).to_pandas() + ioargs.close() return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2110a2d400be8..3b72869188344 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO, TextIOWrapper +from io import StringIO import itertools import re import sys @@ -63,7 +63,13 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + stringify_path, + validate_header_arg, +) from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -428,17 +434,16 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - encoding = kwds.get("encoding", None) storage_options = kwds.get("storage_options", None) - if encoding is not None: - encoding = re.sub("_", "-", encoding).lower() - kwds["encoding"] = encoding - compression = kwds.get("compression", "infer") ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options + filepath_or_buffer, + kwds.get("encoding", None), + kwds.get("compression", "infer"), + storage_options=storage_options, ) kwds["compression"] = ioargs.compression + kwds["encoding"] = ioargs.encoding if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -461,14 +466,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): try: data = parser.read(nrows) finally: + # close compression and byte/text wrapper parser.close() - - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close any fsspec-like objects + ioargs.close() return data @@ -1350,10 +1351,6 @@ def __init__(self, kwds): self._first_chunk = True - # GH 13932 - # keep references to file handles opened by the parser itself - self.handles = [] - def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ Check if parse_dates are in columns. @@ -1403,8 +1400,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: ) def close(self): - for f in self.handles: - f.close() + self.handles.close() @property def _has_complex_date_col(self): @@ -1838,23 +1834,29 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - encoding = kwds.get("encoding") + if kwds.get("memory_map", False): + # memory-mapped files are directly handled by the TextReader. 
+            src = stringify_path(src)
 
-        # parsers.TextReader doesn't support compression dicts
-        if isinstance(kwds.get("compression"), dict):
-            kwds["compression"] = kwds["compression"]["method"]
-
-        if kwds.get("compression") is None and encoding:
-            if isinstance(src, str):
-                src = open(src, "rb")
-                self.handles.append(src)
-
-        # Handle the file object with universal line mode enabled.
-        # We will handle the newline character ourselves later on.
-        if hasattr(src, "read") and not hasattr(src, "encoding"):
-            src = TextIOWrapper(src, encoding=encoding, newline="")
+            if get_compression_method(kwds.get("compression", None))[0] is not None:
+                raise ValueError(
+                    "read_csv does not support compression with memory_map=True. "
+                    + "Please use memory_map=False instead."
+                )
 
-            kwds["encoding"] = "utf-8"
+        self.handles = get_handle(
+            src,
+            mode="r",
+            encoding=kwds.get("encoding", None),
+            compression=kwds.get("compression", None),
+            memory_map=kwds.get("memory_map", False),
+            is_text=True,
+        )
+        # unwrap the raw mmap for the C reader before the kwarg is dropped below
+        if kwds.get("memory_map", False) and hasattr(self.handles.handle, "mmap"):
+            self.handles.handle = self.handles.handle.mmap
+        kwds.pop("encoding", None)
+        kwds.pop("memory_map", None)
+        kwds.pop("compression", None)
 
         # #2442
         kwds["allow_leading_cols"] = self.index_col is not False
@@ -1863,7 +1865,7 @@ def __init__(self, src, **kwds):
         self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
         kwds["usecols"] = self.usecols
 
-        self._reader = parsers.TextReader(src, **kwds)
+        self._reader = parsers.TextReader(self.handles.handle, **kwds)
         self.unnamed_cols = self._reader.unnamed_cols
 
         passed_names = self.names is None
@@ -1942,11 +1944,10 @@ def __init__(self, src, **kwds):
 
         self._implicit_index = self._reader.leading_cols > 0
 
-    def close(self):
-        for f in self.handles:
-            f.close()
+    def close(self) -> None:
+        super().close()
 
-        # close additional handles opened by C parser (for compression)
+        # close additional handles opened by C parser
        try:
             self._reader.close()
         except ValueError:
@@ -2237,20 +2238,19 @@ def __init__(self, f, **kwds):
         self.comment = kwds["comment"]
         self._comment_lines = []
 
-        f, handles = get_handle(
+        self.handles = get_handle(
             f,
             "r",
             encoding=self.encoding,
             compression=self.compression,
             memory_map=self.memory_map,
         )
-        self.handles.extend(handles)
 
         # Set self.data to something that can read lines.
-        if hasattr(f, "readline"):
-            self._make_reader(f)
+        if hasattr(self.handles.handle, "readline"):
+            self._make_reader(self.handles.handle)
         else:
-            self.data = f
+            self.data = self.handles.handle
 
         # Get columns in two steps: infer from data, then
         # infer column indices from self.usecols if it is specified.
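
Taken together, the parser changes above follow the same two-step open/close
protocol as the rest of this series: get_filepath_or_buffer resolves the
target into an IOArgs, get_handle layers compression and the byte/text
wrapper into an IOHandles, and each close() releases only what that step
created. A minimal sketch of the protocol using these private helpers as they
exist at this commit (the file name "data.csv.gz" and the written content are
made-up placeholders, not part of any patch):

    from pandas.io.common import get_filepath_or_buffer, get_handle

    # step 1: resolve URLs/fsspec paths and normalize compression -> IOArgs
    ioargs = get_filepath_or_buffer("data.csv.gz", compression="infer", mode="w")
    # step 2: apply compression and the byte/text wrapper -> IOHandles
    handles = get_handle(
        ioargs.filepath_or_buffer, ioargs.mode, compression=ioargs.compression
    )
    try:
        handles.handle.write("a,b\n1,2\n")
    finally:
        # closes only the buffers get_handle created; a wrapped user-provided
        # buffer is flushed and detached instead of closed
        handles.close()
        # closes fsspec-like objects when should_close is True
        ioargs.close()
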
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 426a40a65b522..6fa044b4651a5 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -92,25 +92,18 @@ def to_pickle( mode="wb", storage_options=storage_options, ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - pickle.dump(obj, f, protocol=protocol) + pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] finally: - if f != filepath_or_buffer: - # do not close user-provided file objects GH 35679 - f.close() - for _f in fh: - _f.close() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() def read_pickle( @@ -193,7 +186,7 @@ def read_pickle( ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False ) @@ -208,24 +201,17 @@ def read_pickle( with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) - return pickle.load(f) + return pickle.load(handles.handle) # type: ignore[arg-type] except excs_to_catch: # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on None: def close(self) -> None: """ close the handle if its open """ - try: - self.path_or_buf.close() - except OSError: - pass + self.ioargs.close() def _set_encoding(self) -> None: """ @@ -1938,7 +1936,7 @@ def _open_file_binary_write( fname: FilePathOrBuffer, compression: CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, CompressionOptions]: +) -> Tuple[IOHandles, CompressionOptions]: """ Open a binary file or no-op if file-like. @@ -1958,34 +1956,22 @@ def _open_file_binary_write( docs for the set of allowed keys and values .. 
versionadded:: 1.2.0 - - Returns - ------- - file : file-like object - File object supporting write - own : bool - True if the file was created, otherwise False """ - if hasattr(fname, "write"): - # See https://github.com/python/mypy/issues/1424 for hasattr challenges - # error: Incompatible return value type (got "Tuple[Union[str, Path, - # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str, - # Mapping[str, str], None]]") - return fname, False, None # type: ignore[return-value] - elif isinstance(fname, (str, Path)): - # Extract compression mode as given, if dict - ioargs = get_filepath_or_buffer( - fname, mode="wb", compression=compression, storage_options=storage_options - ) - f, _ = get_handle( - ioargs.filepath_or_buffer, - "wb", - compression=ioargs.compression, - is_text=False, - ) - return f, True, ioargs.compression - else: - raise TypeError("fname must be a binary file, buffer or path-like.") + ioargs = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options + ) + handles = get_handle( + ioargs.filepath_or_buffer, + "wb", + compression=ioargs.compression, + is_text=False, + ) + if ioargs.filepath_or_buffer != fname and not isinstance( + ioargs.filepath_or_buffer, str + ): + # add handle created by get_filepath_or_buffer + handles.created_handles.append(ioargs.filepath_or_buffer) + return handles, ioargs.compression def _set_endianness(endianness: str) -> str: @@ -2236,9 +2222,8 @@ def __init__( self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels - self._own_file = True self._compression = compression - self._output_file: Optional[BinaryIO] = None + self._output_file: Optional[Buffer] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) self.storage_options = storage_options @@ -2249,21 +2234,20 @@ def __init__( self._fname = stringify_path(fname) self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} self._converted_names: Dict[Label, str] = {} - self._file: Optional[BinaryIO] = None def _write(self, to_write: str) -> None: """ Helper to call encode before writing to file for Python 3 compat. """ - assert self._file is not None - self._file.write(to_write.encode(self._encoding)) + self.handles.handle.write( + to_write.encode(self._encoding) # type: ignore[arg-type] + ) def _write_bytes(self, value: bytes) -> None: """ Helper to assert file is open before writing. """ - assert self._file is not None - self._file.write(value) + self.handles.handle.write(value) # type: ignore[arg-type] def _prepare_categoricals(self, data: DataFrame) -> DataFrame: """ @@ -2527,12 +2511,14 @@ def _encode_strings(self) -> None: self.data[col] = encoded def write_file(self) -> None: - self._file, self._own_file, compression = _open_file_binary_write( + self.handles, compression = _open_file_binary_write( self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: - self._output_file = self._file - self._file = BytesIO() + # ZipFile creates a file (with the same name) for each write call. + # Write it first into a buffer and then write the buffer to the ZipFile. 
+ self._output_file = self.handles.handle + self.handles.handle = BytesIO() try: self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() @@ -2552,10 +2538,9 @@ def write_file(self) -> None: self._write_map() except Exception as exc: self._close() - if self._own_file: + if isinstance(self._fname, (str, Path)): try: - if isinstance(self._fname, (str, Path)): - os.unlink(self._fname) + os.unlink(self._fname) except OSError: warnings.warn( f"This save was not successful but {self._fname} could not " @@ -2571,24 +2556,18 @@ def _close(self) -> None: Close the file if it was created by the writer. If a buffer or file-like object was passed in, for example a GzipFile, - then leave this file open for the caller to close. In either case, - attempt to flush the file contents to ensure they are written to disk - (if supported) + then leave this file open for the caller to close. """ - # Some file-like objects might not support flush - assert self._file is not None + # write compression if self._output_file is not None: - assert isinstance(self._file, BytesIO) - bio = self._file + assert isinstance(self.handles.handle, BytesIO) + bio = self.handles.handle bio.seek(0) - self._file = self._output_file - self._file.write(bio.read()) - try: - self._file.flush() - except AttributeError: - pass - if self._own_file: - self._file.close() + self.handles.handle = self._output_file + self.handles.handle.write(bio.read()) # type: ignore[arg-type] + bio.close() + # close any created handles + self.handles.close() def _write_map(self) -> None: """No-op, future compatibility""" @@ -3140,8 +3119,8 @@ def _tag(val: Union[str, bytes], tag: str) -> bytes: def _update_map(self, tag: str) -> None: """Update map location for tag with file position""" - assert self._file is not None - self._map[tag] = self._file.tell() + assert self.handles.handle is not None + self._map[tag] = self.handles.handle.tell() def _write_header( self, @@ -3208,12 +3187,11 @@ def _write_map(self) -> None: the map with 0s. The second call writes the final map locations when all blocks have been written. 
""" - assert self._file is not None if not self._map: self._map = dict( ( ("stata_data", 0), - ("map", self._file.tell()), + ("map", self.handles.handle.tell()), ("variable_types", 0), ("varnames", 0), ("sortlist", 0), @@ -3229,7 +3207,7 @@ def _write_map(self) -> None: ) ) # Move to start of map - self._file.seek(self._map["map"]) + self.handles.handle.seek(self._map["map"]) bio = BytesIO() for val in self._map.values(): bio.write(struct.pack(self._byteorder + "Q", val)) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5bf1ce508dfc4..3103f6e1ba0b1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1034,11 +1034,12 @@ def test_to_csv_compression(self, df, encoding, compression): tm.assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - df.to_csv(f, encoding=encoding) + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + handles.close() result = pd.read_csv( filename, compression=compression, diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 933bdc462e3f8..2e68d3306c7d1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -143,7 +143,7 @@ def test_readjson_chunks_closes(chunksize): ) reader.read() assert ( - reader.open_stream.closed + reader.handles.handle.closed ), f"didn't close stream with chunksize = {chunksize}" diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b33289213e258..e61a5fce99c69 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -6,7 +6,7 @@ import csv from datetime import datetime from inspect import signature -from io import StringIO +from io import BytesIO, StringIO import os import platform from urllib.error import URLError @@ -2253,3 +2253,62 @@ def test_dict_keys_as_names(all_parsers): result = parser.read_csv(StringIO(data), names=keys) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. + + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + if io_class == BytesIO: + content = content.encode("utf-8") + handle = io_class(content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +def test_memory_map_compression_error(c_parser_only): + """ + c-parsers do not support memory_map=True with compression. + + GH 36997 + """ + parser = c_parser_only + df = DataFrame({"a": [1], "b": [2]}) + msg = ( + "read_csv does not support compression with memory_map=True. " + + "Please use memory_map=False instead." + ) + + with tm.ensure_clean() as path: + df.to_csv(path, compression="gzip", index=False) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression="gzip") + + +def test_memory_map_file_handle(all_parsers): + """ + Support some buffers with memory_map=True. 
+ + GH 36997 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + handle = StringIO() + expected.to_csv(handle, index=False) + handle.seek(0) + + tm.assert_frame_equal( + parser.read_csv(handle, memory_map=True), + expected, + ) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..e74265da3e966 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -152,14 +152,17 @@ def test_binary_mode_file_buffers( with open(fpath, mode="r", encoding=encoding) as fa: result = parser.read_csv(fa) + assert not fa.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb") as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb", buffering=0) as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) @@ -199,6 +202,7 @@ def test_encoding_named_temp_file(all_parsers): result = parser.read_csv(f, encoding=encoding) tm.assert_frame_equal(result, expected) + assert not f.closed @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 1c2518646bb29..413b78a52ad38 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -31,13 +31,10 @@ def test_file_handle(self): reader = TextReader(f) reader.read() - def test_string_filename(self): - reader = TextReader(self.csv1, header=None) - reader.read() - def test_file_handle_mmap(self): + # this was never using memory_map=True with open(self.csv1, "rb") as f: - reader = TextReader(f, memory_map=True, header=None) + reader = TextReader(f, header=None) reader.read() def test_StringIO(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 31e9ad4cf4416..8d7d5d85cbb48 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -47,18 +47,18 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=compression_only) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=None) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=None) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed uncompressed_size = os.path.getsize(path) assert uncompressed_size > compressed_size @@ -111,10 +111,10 @@ def test_compression_warning(compression_only): columns=["X", "Y", "Z"], ) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) + handles = icom.get_handle(path, "w", compression=compression_only) with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): - with f: - df.to_csv(f, compression=compression_only) + df.to_csv(handles.handle, 
compression=compression_only) + handles.close() def test_compression_binary(compression_only): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index a72e860340f25..714173158f4d6 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -143,11 +143,11 @@ def test_to_csv_compression(self, s, encoding, compression): tm.assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - s.to_csv(f, encoding=encoding, header=True) + s.to_csv(handles.handle, encoding=encoding, header=True) + handles.close() result = pd.read_csv( filename, compression=compression, From ff1cd78535f1badc74061c36700ea005193a8461 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Nov 2020 11:01:25 +0000 Subject: [PATCH 17/21] more typing checks to pre-commit (#37539) --- .pre-commit-config.yaml | 30 +++++++++++++++++++++++++++ ci/code_checks.sh | 23 -------------------- scripts/validate_unwanted_patterns.py | 2 +- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0f35087dc922..0c1e4e330c903 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -119,6 +119,36 @@ repos: entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" types: [python] exclude: ^(asv_bench|pandas/tests|doc)/ + - id: FrameOrSeriesUnion + name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias + entry: Union\[.*(Series.*DataFrame|DataFrame.*Series).*\] + language: pygrep + types: [python] + exclude: ^pandas/_typing\.py$ + - id: type-not-class + name: Check for use of foo.__class__ instead of type(foo) + entry: \.__class__ + language: pygrep + files: \.(py|pyx)$ + - id: unwanted-typing + name: Check for use of comment-based annotation syntax and missing error codes + entry: | + (?x) + \#\ type:\ (?!ignore)| + \#\ type:\s?ignore(?!\[) + language: pygrep + types: [python] + - id: no-os-remove + name: Check code for instances of os.remove + entry: os\.remove + language: pygrep + types: [python] + files: ^pandas/tests/ + exclude: | + (?x)^ + pandas/tests/io/excel/test_writers\.py| + pandas/tests/io/pytables/common\.py| + pandas/tests/io/pytables/test_store\.py$ - repo: https://github.com/asottile/yesqa rev: v1.2.2 hooks: diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7c48905135f89..b5d63e259456b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,29 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" # ------------------------------------------------------------------------- - # Type annotations - - MSG='Check for use of comment-based annotation syntax' ; echo $MSG - invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for missing error codes with # type: ignore' ; echo $MSG - invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG - invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # ------------------------------------------------------------------------- - 
MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
-    invgrep -R --include=*.{py,pyx} '\.__class__' pandas
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-    MSG='Check code for instances of os.remove' ; echo $MSG
-    invgrep -R --include="*.py*" --exclude "common.py" --exclude "test_writers.py" --exclude "test_store.py" -E "os\.remove" pandas/tests/
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
     MSG='Check for inconsistent use of pandas namespace in tests' ; echo $MSG
     for class in "Series" "DataFrame" "Index" "MultiIndex" "Timestamp" "Timedelta" "TimedeltaIndex" "DatetimeIndex" "Categorical"; do
         check_namespace ${class}
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index 7b648a589bc61..9c58a55cb907e 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -474,7 +474,7 @@ def main(
     sys.exit(
         main(
-            function=globals().get(args.validation_type),  # type: ignore
+            function=globals().get(args.validation_type),
             source_path=args.paths,
             output_format=args.format,
         )

From 15f843ab102d7a0cd7f1c7870dfec72d0e28d252 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com>
Date: Wed, 4 Nov 2020 18:04:32 +0700
Subject: [PATCH 18/21] TST: 32bit dtype compat test_groupby_dropna (#37623)

---
 pandas/tests/groupby/test_groupby_dropna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 02ce4dcf2ae2b..e38fa5e8de87e 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -343,7 +343,7 @@ def test_groupby_nan_included():
     df = pd.DataFrame(data)
     grouped = df.groupby("group", dropna=False)
     result = grouped.indices
-    dtype = "int64"
+    dtype = np.intp
     expected = {
         "g1": np.array([0, 2], dtype=dtype),
         "g2": np.array([3], dtype=dtype),

From cc9c646463d4a93abdc7c61bbb47e7d2ccf2fc4b Mon Sep 17 00:00:00 2001
From: Janus
Date: Wed, 4 Nov 2020 14:22:54 +0100
Subject: [PATCH 19/21] BUG: Metadata propagation for groupby iterator (#37461)

---
 doc/source/whatsnew/v1.1.5.rst                |  2 +-
 pandas/core/groupby/ops.py                    | 15 ++++++++++++---
 pandas/tests/groupby/test_groupby_subclass.py |  9 +++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
index cf728d94b2a55..a122154904996 100644
--- a/doc/source/whatsnew/v1.1.5.rst
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -23,7 +23,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
--
+- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index ccf23a6f24c42..f807b740abaf2 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -140,9 +140,16 @@ def get_iterator(
         splitter = self._get_splitter(data, axis=axis)
         keys = self._get_group_keys()
         for key, (i, group) in zip(keys, splitter):
-            yield key, group
+            yield key, group.__finalize__(data, method="groupby")
 
     def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter":
+        """
+        Returns
+        -------
+        Generator yielding subsetted objects
+
+        __finalize__ has not been called for the subsetted objects returned.
+ """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -918,7 +925,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -934,7 +942,8 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr) def get_splitter( diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index cc7a79e976513..d268d87708552 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -51,6 +51,15 @@ def test_groupby_preserves_subclass(obj, groupby_func): tm.assert_series_equal(result1, result2) +def test_groupby_preserves_metadata(): + # GH-37343 + custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]}) + assert "testattr" in custom_df._metadata + custom_df.testattr = "hello" + for _, group_df in custom_df.groupby("c"): + assert group_df.testattr == "hello" + + @pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame]) def test_groupby_resample_preserves_subclass(obj): # GH28330 -- preserve subclass through groupby.resample() From 1c6cd01a4f3ba0e8f4dc2fccc64c216f577b5eca Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Nov 2020 05:43:46 -0800 Subject: [PATCH 20/21] BUG: read-only values in cython funcs (#37613) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/_libs/join.pyx | 2 +- pandas/_libs/tslibs/strptime.pyx | 4 ++-- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/core/arrays/datetimelike.py | 3 +-- pandas/tests/libs/test_join.py | 7 ++++++- pandas/tests/tools/test_to_datetime.py | 10 ++++++++++ pandas/tests/tools/test_to_timedelta.py | 10 ++++++++++ 8 files changed, 33 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 33e9bd0c2732a..2e976371c0ac8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -399,11 +399,13 @@ Datetimelike - Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) - Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) - Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) +- Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`) Timedelta ^^^^^^^^^ - Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) - Bug in parsing of ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`) +- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`) Timezones ^^^^^^^^^ diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 
13c7187923473..1b79d68c13570 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -268,7 +268,7 @@ ctypedef fused join_t: @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(join_t[:] left, join_t[:] right): +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t] indexer diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d2690be905a68..bc4632ad028ab 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -12,7 +12,7 @@ from _thread import allocate_lock as _thread_allocate_lock import numpy as np import pytz -from numpy cimport int64_t +from numpy cimport int64_t, ndarray from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -51,7 +51,7 @@ cdef dict _parse_code_table = {'y': 0, 'u': 22} -def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'): +def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): """ Calculates the datetime structs represented by the passed array of strings diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 45f32d92c7a74..29e8c58055f9e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -227,7 +227,7 @@ cdef convert_to_timedelta64(object ts, str unit): @cython.boundscheck(False) @cython.wraparound(False) -def array_to_timedelta64(object[:] values, str unit=None, str errors="raise"): +def array_to_timedelta64(ndarray[object] values, str unit=None, str errors="raise"): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1955a96160a4a..e845dbf39dbc9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1025,9 +1025,8 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1: + if len(other) == 1 and self.ndim == 1: # If both 1D then broadcasting is unambiguous - # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index 95d6dcbaf3baf..f3f09d7a42204 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -135,9 +135,14 @@ def test_cython_inner_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) -def test_left_join_indexer_unique(): +@pytest.mark.parametrize("readonly", [True, False]) +def test_left_join_indexer_unique(readonly): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + if readonly: + # GH#37312, GH#37264 + a.setflags(write=False) + b.setflags(write=False) result = libjoin.left_join_indexer_unique(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ebe118252c8cf..10bda16655586 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -36,6 +36,16 @@ class TestTimeConversionFormats: + @pytest.mark.parametrize("readonly", [True, False]) + def test_to_datetime_readonly(self, readonly): + # GH#34857 + arr = np.array([], dtype=object) + if readonly: + arr.setflags(write=False) + result = to_datetime(arr) + 
expected = to_datetime([]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format(self, cache): values = ["1/1/2000", "1/2/2000", "1/3/2000"] diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 8e48295c533cc..5be7e81df53f2 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -9,6 +9,16 @@ class TestTimedeltas: + @pytest.mark.parametrize("readonly", [True, False]) + def test_to_timedelta_readonly(self, readonly): + # GH#34857 + arr = np.array([], dtype=object) + if readonly: + arr.setflags(write=False) + result = to_timedelta(arr) + expected = to_timedelta([]) + tm.assert_index_equal(result, expected) + def test_to_timedelta(self): result = to_timedelta(["", ""]) From a0571352b1ecf3b93dd0badbd02f873bebf906e0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Nov 2020 13:47:36 +0000 Subject: [PATCH 21/21] CLN refactor core/arrays (#37581) --- pandas/core/arrays/base.py | 9 ++++----- pandas/core/arrays/boolean.py | 20 +++++++++---------- pandas/core/arrays/categorical.py | 8 +++----- pandas/core/arrays/datetimelike.py | 6 ++---- pandas/core/arrays/masked.py | 7 ++++--- pandas/core/arrays/numpy_.py | 6 ++---- pandas/core/arrays/period.py | 6 ++---- pandas/core/arrays/sparse/array.py | 32 ++++++++++-------------------- pandas/core/arrays/timedeltas.py | 6 ++---- 9 files changed, 40 insertions(+), 60 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 57f8f11d4d04c..82d79cc47a4ae 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -460,7 +460,7 @@ def astype(self, dtype, copy=True): if is_dtype_equal(dtype, self.dtype): if not copy: return self - elif copy: + else: return self.copy() if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -544,14 +544,13 @@ def argsort( ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) values = self._values_for_argsort() - result = nargsort( + return nargsort( values, kind=kind, ascending=ascending, na_position=na_position, mask=np.asarray(self.isna()), ) - return result def argmin(self): """ @@ -780,12 +779,12 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. 
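
The `astype` cleanup in base.py is behavior-preserving: once `if not copy: return self` has fired, the only case left inside the dtype-equal branch is `copy=True`, so `elif copy:` collapses to `else:`. A small sketch of that fast path, under made-up names rather than the pandas internals:

    import numpy as np

    def astype_fast_path(arr: np.ndarray, dtype, copy: bool = True):
        # dtype already matches: hand back the object itself or a copy,
        # mirroring the simplified branch structure in the diff.
        if arr.dtype == np.dtype(dtype):
            if not copy:
                return arr
            return arr.copy()
        return arr.astype(dtype, copy=copy)

    a = np.arange(3, dtype="int64")
    assert astype_fast_path(a, "int64", copy=False) is a     # no copy taken
    assert astype_fast_path(a, "int64", copy=True) is not a  # fresh copy
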
""" - if not type(self) == type(other): + if type(self) != type(other): return False other = cast(ExtensionArray, other) if not is_dtype_equal(self.dtype, other.dtype): return False - elif not len(self) == len(other): + elif len(self) != len(other): return False else: equal_values = self == other diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 73aa97c832848..21306455573b8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -170,12 +170,13 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if inferred_dtype in integer_like: - if not np.all( + if (inferred_dtype in integer_like) and not ( + np.all( values[~mask_values].astype(float) == values_object[~mask_values].astype(float) - ): - raise TypeError("Need to pass bool-like values") + ) + ): + raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) @@ -193,9 +194,9 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D list-like") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -395,9 +396,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) # for integer, error if there are missing values - if is_integer_dtype(dtype): - if self._hasna: - raise ValueError("cannot convert NA to integer") + if is_integer_dtype(dtype) and self._hasna: + raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value @@ -576,7 +576,7 @@ def _logical_method(self, other, op): elif isinstance(other, np.bool_): other = other.item() - if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): raise TypeError( "'other' should be pandas.NA or a bool. " f"Got {type(other).__name__} instead." diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9f0414cf7a806..626fb495dec03 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1314,8 +1314,7 @@ def isna(self): Categorical.notna : Boolean inverse of Categorical.isna. 
""" - ret = self._codes == -1 - return ret + return self._codes == -1 isnull = isna @@ -1363,7 +1362,7 @@ def value_counts(self, dropna=True): from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code + ncat, mask = (len(cat), code >= 0) ix, clean = np.arange(ncat), mask.all() if dropna or clean: @@ -1920,8 +1919,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, _result)) - return result + return dict(zip(categories, _result)) # ------------------------------------------------------------------ # Reductions diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e845dbf39dbc9..404511895ddf0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1062,8 +1062,7 @@ def _time_shift(self, periods, freq=None): if isinstance(freq, str): freq = to_offset(freq) offset = periods * freq - result = self + offset - return result + return self + offset if periods == 0 or len(self) == 0: # GH#14811 empty case @@ -1533,10 +1532,9 @@ def _round(self, freq, mode, ambiguous, nonexistent): self = cast("DatetimeArray", self) naive = self.tz_localize(None) result = naive._round(freq, mode, ambiguous, nonexistent) - aware = result.tz_localize( + return result.tz_localize( self.tz, ambiguous=ambiguous, nonexistent=nonexistent ) - return aware values = self.view("i8") result = round_nsint64(values, mode, freq) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9febba0f544ac..b633f268049e5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -84,9 +84,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "mask should be boolean numpy array. 
Use " "the 'pd.array' function instead" ) - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D array") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D array") if copy: @@ -209,7 +209,8 @@ def to_numpy( dtype = object if self._hasna: if ( - not (is_object_dtype(dtype) or is_string_dtype(dtype)) + not is_object_dtype(dtype) + and not is_string_dtype(dtype) and na_value is libmissing.NA ): raise ValueError( diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index cd48f6cbc8170..e1a424b719a4a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -281,17 +281,15 @@ def all(self, *, axis=None, out=None, keepdims=False, skipna=True): def min(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - result = masked_reductions.min( + return masked_reductions.min( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def max(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - result = masked_reductions.max( + return masked_reductions.max( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d808ade53ad33..8de84a0187e95 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -589,7 +589,7 @@ def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() if is_period_dtype(dtype): return self.asfreq(dtype.freq) @@ -1080,11 +1080,9 @@ def _make_field_arrays(*fields): elif length is None: length = len(x) - arrays = [ + return [ np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) else np.repeat(x, length) for x in fields ] - - return arrays diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4346e02069667..5f4cd4b269a2a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -316,9 +316,8 @@ def __init__( raise Exception("must only pass scalars with an index") if is_scalar(data): - if index is not None: - if data is None: - data = np.nan + if index is not None and data is None: + data = np.nan if index is not None: npoints = len(index) @@ -575,8 +574,7 @@ def density(self): >>> s.density 0.6 """ - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return float(self.sp_index.npoints) / float(self.sp_index.length) @property def npoints(self) -> int: @@ -736,25 +734,17 @@ def value_counts(self, dropna=True): keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps - if fcounts > 0: - if self._null_fill_value and dropna: - pass + if fcounts > 0 and (not self._null_fill_value or not dropna): + mask = isna(keys) if self._null_fill_value else keys == self.fill_value + if mask.any(): + counts[mask] += fcounts else: - if self._null_fill_value: - mask = isna(keys) - else: - mask = keys == self.fill_value - - if mask.any(): - counts[mask] += fcounts - else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) if not isinstance(keys, ABCIndexClass): keys = Index(keys) - result = Series(counts, index=keys) - return result + return Series(counts, 
index=keys) # -------- # Indexing @@ -1062,7 +1052,7 @@ def astype(self, dtype=None, copy=True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e4a844fd4c6ef..8a87df18b6adb 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -227,8 +227,7 @@ def _from_sequence( data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - result = cls._simple_new(data, freq=freq) - return result + return cls._simple_new(data, freq=freq) @classmethod def _from_sequence_not_strict( @@ -334,10 +333,9 @@ def astype(self, dtype, copy: bool = True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results( + return self._maybe_mask_results( result, fill_value=None, convert="float64" ) - return values result = self._data.astype(dtype, copy=copy) return result.astype("i8") elif is_timedelta64_ns_dtype(dtype):
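
The early return in the timedelta `astype` keeps the NaT handling intact: the data is viewed as int64 nanoseconds, but any slot holding NaT has to come back as NaN, which is why the masked result is float64. A plain-NumPy sketch of that masking step (not the `_maybe_mask_results` implementation itself):

    import numpy as np

    tds = np.array([1, 2, np.timedelta64("NaT")], dtype="m8[ns]")

    as_float = tds.view("i8").astype("float64")  # raw nanosecond counts
    as_float[np.isnat(tds)] = np.nan             # NaT sentinel -> NaN
    print(as_float)                              # [ 1.  2. nan]
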