From 03abbdc71e4137f0aa8f983fdc6c920b8a6787de Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 2 Nov 2020 11:35:38 +0000 Subject: [PATCH 1/2] refactor core-arrays --- pandas/core/arrays/base.py | 12 +++++------ pandas/core/arrays/boolean.py | 22 ++++++++++---------- pandas/core/arrays/categorical.py | 8 +++----- pandas/core/arrays/datetimelike.py | 6 ++---- pandas/core/arrays/masked.py | 7 ++++--- pandas/core/arrays/numpy_.py | 6 ++---- pandas/core/arrays/period.py | 6 ++---- pandas/core/arrays/sparse/array.py | 32 ++++++++++-------------------- pandas/core/arrays/timedeltas.py | 6 ++---- 9 files changed, 41 insertions(+), 64 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 57f8f11d4d04c..3216957e1f188 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -335,8 +335,7 @@ def __iter__(self): # This needs to be implemented so that pandas recognizes extension # arrays as list-like. The default implementation makes successive # calls to ``__getitem__``, which may be slower than necessary. - for i in range(len(self)): - yield self[i] + yield from self def __eq__(self, other: Any) -> ArrayLike: """ @@ -460,7 +459,7 @@ def astype(self, dtype, copy=True): if is_dtype_equal(dtype, self.dtype): if not copy: return self - elif copy: + else: return self.copy() if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -544,14 +543,13 @@ def argsort( ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) values = self._values_for_argsort() - result = nargsort( + return nargsort( values, kind=kind, ascending=ascending, na_position=na_position, mask=np.asarray(self.isna()), ) - return result def argmin(self): """ @@ -780,12 +778,12 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. """ - if not type(self) == type(other): + if type(self) != type(other): return False other = cast(ExtensionArray, other) if not is_dtype_equal(self.dtype, other.dtype): return False - elif not len(self) == len(other): + elif len(self) != len(other): return False else: equal_values = self == other diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 73aa97c832848..e992e7478017e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -170,12 +170,11 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if inferred_dtype in integer_like: - if not np.all( - values[~mask_values].astype(float) - == values_object[~mask_values].astype(float) - ): - raise TypeError("Need to pass bool-like values") + if inferred_dtype in integer_like and not np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ): + raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) @@ -193,9 +192,9 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D list-like") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -395,9 +394,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) # for integer, error if there are missing values - if is_integer_dtype(dtype): - if self._hasna: - raise ValueError("cannot convert NA to integer") + if is_integer_dtype(dtype) and self._hasna: + raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value @@ -576,7 +574,7 @@ def _logical_method(self, other, op): elif isinstance(other, np.bool_): other = other.item() - if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): raise TypeError( "'other' should be pandas.NA or a bool. " f"Got {type(other).__name__} instead." diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 03f66ff82ad75..f77cea73ef6c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1319,8 +1319,7 @@ def isna(self): Categorical.notna : Boolean inverse of Categorical.isna. """ - ret = self._codes == -1 - return ret + return self._codes == -1 isnull = isna @@ -1368,7 +1367,7 @@ def value_counts(self, dropna=True): from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code + ncat, mask = (len(cat), code >= 0) ix, clean = np.arange(ncat), mask.all() if dropna or clean: @@ -1930,8 +1929,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, _result)) - return result + return dict(zip(categories, _result)) # ------------------------------------------------------------------ # Reductions diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f8a609fb0cabe..d82399e00a2aa 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1064,8 +1064,7 @@ def _time_shift(self, periods, freq=None): if isinstance(freq, str): freq = to_offset(freq) offset = periods * freq - result = self + offset - return result + return self + offset if periods == 0 or len(self) == 0: # GH#14811 empty case @@ -1535,10 +1534,9 @@ def _round(self, freq, mode, ambiguous, nonexistent): self = cast("DatetimeArray", self) naive = self.tz_localize(None) result = naive._round(freq, mode, ambiguous, nonexistent) - aware = result.tz_localize( + return result.tz_localize( self.tz, ambiguous=ambiguous, nonexistent=nonexistent ) - return aware values = self.view("i8") result = round_nsint64(values, mode, freq) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9febba0f544ac..b633f268049e5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -84,9 +84,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "mask should be boolean numpy array. Use " "the 'pd.array' function instead" ) - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D array") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D array") if copy: @@ -209,7 +209,8 @@ def to_numpy( dtype = object if self._hasna: if ( - not (is_object_dtype(dtype) or is_string_dtype(dtype)) + not is_object_dtype(dtype) + and not is_string_dtype(dtype) and na_value is libmissing.NA ): raise ValueError( diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index cd48f6cbc8170..e1a424b719a4a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -281,17 +281,15 @@ def all(self, *, axis=None, out=None, keepdims=False, skipna=True): def min(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - result = masked_reductions.min( + return masked_reductions.min( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def max(self, *, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - result = masked_reductions.max( + return masked_reductions.max( values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return result def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b95a7acc19b1f..e2fbf26840c22 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -593,7 +593,7 @@ def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() if is_period_dtype(dtype): return self.asfreq(dtype.freq) @@ -1084,11 +1084,9 @@ def _make_field_arrays(*fields): elif length is None: length = len(x) - arrays = [ + return [ np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) else np.repeat(x, length) for x in fields ] - - return arrays diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4346e02069667..5f4cd4b269a2a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -316,9 +316,8 @@ def __init__( raise Exception("must only pass scalars with an index") if is_scalar(data): - if index is not None: - if data is None: - data = np.nan + if index is not None and data is None: + data = np.nan if index is not None: npoints = len(index) @@ -575,8 +574,7 @@ def density(self): >>> s.density 0.6 """ - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return float(self.sp_index.npoints) / float(self.sp_index.length) @property def npoints(self) -> int: @@ -736,25 +734,17 @@ def value_counts(self, dropna=True): keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps - if fcounts > 0: - if self._null_fill_value and dropna: - pass + if fcounts > 0 and (not self._null_fill_value or not dropna): + mask = isna(keys) if self._null_fill_value else keys == self.fill_value + if mask.any(): + counts[mask] += fcounts else: - if self._null_fill_value: - mask = isna(keys) - else: - mask = keys == self.fill_value - - if mask.any(): - counts[mask] += fcounts - else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) if not isinstance(keys, ABCIndexClass): keys = Index(keys) - result = Series(counts, index=keys) - return result + return Series(counts, index=keys) # -------- # Indexing @@ -1062,7 +1052,7 @@ def astype(self, dtype=None, copy=True): if is_dtype_equal(dtype, self._dtype): if not copy: return self - elif copy: + else: return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 0d9d257810674..806e784799f4e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -227,8 +227,7 @@ def _from_sequence( data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - result = cls._simple_new(data, freq=freq) - return result + return cls._simple_new(data, freq=freq) @classmethod def _from_sequence_not_strict( @@ -338,10 +337,9 @@ def astype(self, dtype, copy: bool = True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results( + return self._maybe_mask_results( result, fill_value=None, convert="float64" ) - return values result = self._data.astype(dtype, copy=copy) return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): From a4c46e78415010a36da8d5acb0f56fbf62d93685 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Nov 2020 07:42:59 +0000 Subject: [PATCH 2/2] add parens, fix failing doctest --- pandas/core/arrays/base.py | 3 ++- pandas/core/arrays/boolean.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3216957e1f188..82d79cc47a4ae 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -335,7 +335,8 @@ def __iter__(self): # This needs to be implemented so that pandas recognizes extension # arrays as list-like. The default implementation makes successive # calls to ``__getitem__``, which may be slower than necessary. - yield from self + for i in range(len(self)): + yield self[i] def __eq__(self, other: Any) -> ArrayLike: """ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e992e7478017e..21306455573b8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -170,9 +170,11 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if inferred_dtype in integer_like and not np.all( - values[~mask_values].astype(float) - == values_object[~mask_values].astype(float) + if (inferred_dtype in integer_like) and not ( + np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ) ): raise TypeError("Need to pass bool-like values")