From e977432f5dda08d5eaf0890161cd0304c4eac209 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Sep 2020 22:00:00 -0500 Subject: [PATCH] Optionally disallow duplicate labels (#28394) --- doc/source/reference/frame.rst | 16 + .../reference/general_utility_functions.rst | 1 + doc/source/reference/series.rst | 15 + doc/source/user_guide/duplicates.rst | 210 ++++++++ doc/source/user_guide/index.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 49 ++ pandas/__init__.py | 1 + pandas/_testing.py | 15 + pandas/core/api.py | 1 + pandas/core/flags.py | 113 +++++ pandas/core/frame.py | 11 +- pandas/core/generic.py | 130 ++++- pandas/core/indexes/base.py | 48 +- pandas/core/series.py | 10 +- pandas/errors/__init__.py | 21 + pandas/tests/api/test_api.py | 1 + pandas/tests/base/test_misc.py | 3 +- pandas/tests/frame/test_api.py | 27 ++ pandas/tests/generic/test_duplicate_labels.py | 450 ++++++++++++++++++ pandas/tests/generic/test_generic.py | 10 + pandas/tests/series/test_api.py | 26 + pandas/tests/test_flags.py | 48 ++ pandas/tests/util/test_assert_frame_equal.py | 15 + pandas/tests/util/test_assert_series_equal.py | 15 + 24 files changed, 1227 insertions(+), 10 deletions(-) create mode 100644 doc/source/user_guide/duplicates.rst create mode 100644 pandas/core/flags.py create mode 100644 pandas/tests/generic/test_duplicate_labels.py create mode 100644 pandas/tests/test_flags.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4d9d18e3d204e..9a1ebc8d670dc 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -37,6 +37,7 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty + DataFrame.set_flags Conversion ~~~~~~~~~~ @@ -276,6 +277,21 @@ Time Series-related DataFrame.tz_convert DataFrame.tz_localize +.. _api.frame.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`DataFrame.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags + + .. _api.frame.metadata: Metadata diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index c1759110b94ad..3cba0a81a7011 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -37,6 +37,7 @@ Exceptions and warnings errors.AccessorRegistrationWarning errors.DtypeWarning + errors.DuplicateLabelError errors.EmptyDataError errors.InvalidIndexError errors.MergeError diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index ae3e121ca8212..5131d35334693 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -39,6 +39,8 @@ Attributes Series.empty Series.dtypes Series.name + Series.flags + Series.set_flags Conversion ---------- @@ -527,6 +529,19 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.from_coo Series.sparse.to_coo +.. _api.series.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`Series.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags ..
_api.series.metadata: diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst new file mode 100644 index 0000000000000..b65822fab2b23 --- /dev/null +++ b/doc/source/user_guide/duplicates.rst @@ -0,0 +1,210 @@ +.. _duplicates: + +**************** +Duplicate Labels +**************** + +:class:`Index` objects are not required to be unique; you can have duplicate row +or column labels. This may be a bit confusing at first. If you're familiar with +SQL, you know that row labels are similar to a primary key on a table, and you +would never want duplicates in a SQL table. But one of pandas' roles is to clean +messy, real-world data before it goes to some downstream system. And real-world +data has duplicates, even in fields that are supposed to be unique. + +This section describes how duplicate labels change the behavior of certain +operations, how to prevent duplicates from arising during operations, and how to +detect them if they do. + +.. ipython:: python + + import pandas as pd + import numpy as np + +Consequences of Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some pandas methods (:meth:`Series.reindex` for example) just don't work with +duplicates present. The output can't be determined, and so pandas raises. + +.. ipython:: python + :okexcept: + + s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b']) + s1.reindex(['a', 'b', 'c']) + +Other methods, like indexing, can give very surprising results. Typically +indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame`` +with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will +return a scalar. But with duplicates, this isn't the case. + +.. ipython:: python + + df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B']) + df1 + +We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series`` + +.. ipython:: python + + df1['B'] # a series + +But slicing ``'A'`` returns a ``DataFrame`` + + +.. ipython:: python + + df1['A'] # a DataFrame + +This applies to row labels as well + +.. ipython:: python + + df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b']) + df2 + df2.loc['b', 'A'] # a scalar + df2.loc['a', 'A'] # a Series + +Duplicate Label Detection +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check whether an :class:`Index` (storing the row or column labels) is +unique with :attr:`Index.is_unique`: + +.. ipython:: python + + df2 + df2.index.is_unique + df2.columns.is_unique + +.. note:: + + Checking whether an index is unique is somewhat expensive for large datasets. + Pandas does cache this result, so re-checking on the same index is very fast. + +:meth:`Index.duplicated` will return a boolean ndarray indicating whether a +label is repeated. + +.. ipython:: python + + df2.index.duplicated() + +Which can be used as a boolean filter to drop duplicate rows. + +.. ipython:: python + + df2.loc[~df2.index.duplicated(), :] + +If you need additional logic to handle duplicate labels, rather than just +dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common +trick. For example, we'll resolve duplicates by taking the average of all rows +with the same label. + +.. ipython:: python + + df2.groupby(level=0).mean() + +.. _duplicates.disallow: + +Disallowing Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.2.0 + +As noted above, handling duplicates is an important feature when reading in raw +data.
That said, you may want to avoid introducing duplicates as part of a data +processing pipeline (from methods like :meth:`pandas.concat`, +:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame` +can *disallow* duplicate labels by calling ``.set_flags(allows_duplicate_labels=False)`` +(the default is to allow them). If there are duplicate labels, an exception +will be raised. + +.. ipython:: python + :okexcept: + + pd.Series( + [0, 1, 2], + index=['a', 'b', 'b'] + ).set_flags(allows_duplicate_labels=False) + +This applies to both row and column labels for a :class:`DataFrame` + +.. ipython:: python + :okexcept: + + pd.DataFrame( + [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "B"], + ).set_flags(allows_duplicate_labels=False) + +This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, +which indicates whether that object can have duplicate labels. + +.. ipython:: python + + df = ( + pd.DataFrame({"A": [0, 1, 2, 3]}, + index=['x', 'y', 'X', 'Y']) + .set_flags(allows_duplicate_labels=False) + ) + df + df.flags.allows_duplicate_labels + +:meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes +like ``allows_duplicate_labels`` set to some value + +.. ipython:: python + + df2 = df.set_flags(allows_duplicate_labels=True) + df2.flags.allows_duplicate_labels + +The new ``DataFrame`` returned is a view on the same data as the old ``DataFrame``. +Alternatively, the property can be set directly on the same object + + +.. ipython:: python + + df2.flags.allows_duplicate_labels = False + df2.flags.allows_duplicate_labels + +When processing raw, messy data you might initially read in the messy data +(which potentially has duplicate labels), deduplicate, and then disallow duplicates +going forward, to ensure that your data pipeline doesn't introduce duplicates. + + +.. code-block:: python + + >>> raw = pd.read_csv("...") + >>> deduplicated = raw.groupby(level=0).first() # remove duplicates + >>> deduplicated.flags.allows_duplicate_labels = False # disallow going forward + +Setting ``allows_duplicate_labels=False`` on a ``Series`` or ``DataFrame`` with duplicate +labels or performing an operation that introduces duplicate labels on a ``Series`` or +``DataFrame`` that disallows duplicates will raise an +:class:`errors.DuplicateLabelError`. + +.. ipython:: python + :okexcept: + + df.rename(str.upper) + +This error message contains the labels that are duplicated, and the numeric positions +of all the duplicates (including the "original") in the ``Series`` or ``DataFrame``. + +Duplicate Label Propagation +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In general, disallowing duplicates is "sticky". It's preserved through +operations. + +.. ipython:: python + :okexcept: + + s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False) + s1 + s1.head().rename({"a": "b"}) + +.. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``.
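+If an operation on such an object does introduce duplicates, the resulting
+:class:`errors.DuplicateLabelError` (a subclass of ``ValueError``) can be caught and
+handled with an explicit deduplication step. The sketch below is illustrative only:
+``combine`` is a hypothetical pipeline helper, not part of pandas.
+
+.. code-block:: python
+
+   def combine(frames):
+       # Concatenate raw inputs, then opt in to the duplicate-label check.
+       combined = pd.concat(frames)
+       try:
+           return combined.set_flags(allows_duplicate_labels=False)
+       except pd.errors.DuplicateLabelError:
+           # Fall back to keeping the first occurrence of each label before
+           # disallowing duplicates again.
+           deduplicated = combined.loc[~combined.index.duplicated(keep="first")]
+           return deduplicated.set_flags(allows_duplicate_labels=False)
+
+Keeping the fallback explicit makes the deduplication policy (here, keeping the
+first occurrence of each label) a visible part of the pipeline rather than an
+accident of duplicates being allowed.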
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 8226e72779588..2fc9e066e6712 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -33,6 +33,7 @@ Further information on any specific method can be obtained in the reshaping text missing_data + duplicates categorical integer_na boolean diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9c8ee10a8a0af..7c083b95b21f3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,53 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.duplicate_labels: + +Optionally disallow duplicate labels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` and :class:`DataFrame` now support an ``allows_duplicate_labels`` flag, set via +:meth:`~DataFrame.set_flags`, to control whether the index or columns can contain duplicate labels +(:issue:`28394`). This can be used to prevent accidental introduction of duplicate labels, which can +affect downstream operations. + +By default, duplicates continue to be allowed + +.. ipython:: python + + pd.Series([1, 2], index=['a', 'a']) + +.. ipython:: python + :okexcept: + + pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) + +Pandas will propagate the ``allows_duplicate_labels`` property through many operations. + +.. ipython:: python + :okexcept: + + a = ( + pd.Series([1, 2], index=['a', 'b']) + .set_flags(allows_duplicate_labels=False) + ) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) + +.. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + +See :ref:`duplicates` for more. + +The ``allows_duplicate_labels`` flag is stored in the new :attr:`DataFrame.flags` +attribute. This stores global attributes that apply to the *pandas object*. This +differs from :attr:`DataFrame.attrs`, which stores information that applies to +the dataset. + Passing arguments to fsspec backends ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -53,6 +100,8 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ + +- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - - diff --git a/pandas/__init__.py b/pandas/__init__.py index 36576da74c75d..2737bcd8f9ccf 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -100,6 +100,7 @@ to_datetime, to_timedelta, # misc + Flags, Grouper, factorize, unique, diff --git a/pandas/_testing.py b/pandas/_testing.py index b402b040d9268..04d36749a3d8c 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1225,6 +1225,7 @@ def assert_series_equal( check_categorical=True, check_category_order=True, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", @@ -1271,6 +1272,11 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. + + .. versionadded:: 1.2.0 + rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False.
@@ -1307,6 +1313,9 @@ def assert_series_equal( msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, @@ -1429,6 +1438,7 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="DataFrame", @@ -1490,6 +1500,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1563,6 +1575,9 @@ def assert_frame_equal( if check_like: left, right = left.reindex_like(right), right + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, diff --git a/pandas/core/api.py b/pandas/core/api.py index b0b65f9d0be34..348e9206d6e19 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -26,6 +26,7 @@ ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array +from pandas.core.flags import Flags from pandas.core.groupby import Grouper, NamedAgg from pandas.core.indexes.api import ( CategoricalIndex, diff --git a/pandas/core/flags.py b/pandas/core/flags.py new file mode 100644 index 0000000000000..15966d8ddce2a --- /dev/null +++ b/pandas/core/flags.py @@ -0,0 +1,113 @@ +import weakref + + +class Flags: + """ + Flags that apply to pandas objects. + + .. versionadded:: 1.2.0 + + Parameters + ---------- + obj : Series or DataFrame + The object these flags are associated with. + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this object. By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` (or columns for DataFrame) is not unique, or any + subsequent operation introduces duplicates. + See :ref:`duplicates.disallow` for more. + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + + Notes + ----- + Attributes can be set in two ways + + >>> df = pd.DataFrame() + >>> df.flags + <Flags(allows_duplicate_labels=True)> + >>> df.flags.allows_duplicate_labels = False + >>> df.flags + <Flags(allows_duplicate_labels=False)> + + >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags + <Flags(allows_duplicate_labels=True)> + """ + + _keys = {"allows_duplicate_labels"} + + def __init__(self, obj, *, allows_duplicate_labels): + self._allows_duplicate_labels = allows_duplicate_labels + self._obj = weakref.ref(obj) + + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object.
+ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + Traceback (most recent call last): + ... + pandas.errors.DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + obj = self._obj() + if obj is None: + raise ValueError("This flag's object has been deleted.") + + if not value: + for ax in obj.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + + def __getitem__(self, key): + if key not in self._keys: + raise KeyError(key) + + return getattr(self, key) + + def __setitem__(self, key, value): + if key not in self._keys: + raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") + setattr(self, key, value) + + def __repr__(self): + return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>" + + def __eq__(self, other): + if isinstance(other, type(self)): + return self.allows_duplicate_labels == other.allows_duplicate_labels + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7832547685567..b4c12b9e52f56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -458,7 +458,9 @@ def __init__( if isinstance(data, BlockManager): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, + ) return mgr = self._init_mgr( @@ -3659,6 +3661,11 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value : int, Series, or array-like allow_duplicates : bool, optional """ + if allow_duplicates and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False."
+ ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) @@ -4559,6 +4566,7 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -4804,6 +4812,1 @@ class max type monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fd924c964c1e1..c9eb4a34683f8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -94,6 +94,7 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.flags import Flags from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex @@ -188,6 +189,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "_metadata", "__array_struct__", "__array_interface__", + "_flags", ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() @@ -217,6 +219,7 @@ def __init__( else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) + object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -237,15 +240,20 @@ def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: return mgr # ---------------------------------------------------------------------- + # attrs and flags @property def attrs(self) -> Dict[Optional[Hashable], Any]: """ - Dictionary of global attributes on this object. + Dictionary of global attributes of this dataset. .. warning:: attrs is experimental and may change without warning. + + See Also + -------- + DataFrame.flags """ if self._attrs is None: self._attrs = {} @@ -255,6 +263,96 @@ def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) + @property + def flags(self) -> Flags: + """ + Get the properties associated with this pandas object. + + The available flags are + + * :attr:`Flags.allows_duplicate_labels` + + See Also + -------- + Flags + DataFrame.attrs + + Notes + ----- + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags + <Flags(allows_duplicate_labels=True)> + + Flags can be get or set using ``.`` + + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + + Or by slicing with a key + + >>> df.flags["allows_duplicate_labels"] + False + >>> df.flags["allows_duplicate_labels"] = True + """ + return self._flags + + def set_flags( + self: FrameOrSeries, + *, + copy: bool = False, + allows_duplicate_labels: Optional[bool] = None, + ) -> FrameOrSeries: + """ + Return a new object with updated flags. + + Parameters + ---------- + allows_duplicate_labels : bool, optional + Whether the returned object allows duplicate labels.
+ + Returns + ------- + Series or DataFrame + The same type as the caller. + + See Also + -------- + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. + + Notes + ----- + This method returns a new object that's a view on the same data + as the input. Mutating the input or the output values will be reflected + in the other. + + This method is intended to be used in method chains. + + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags.allows_duplicate_labels + True + >>> df2 = df.set_flags(allows_duplicate_labels=False) + >>> df2.flags.allows_duplicate_labels + False + """ + df = self.copy(deep=copy) + if allows_duplicate_labels is not None: + df.flags["allows_duplicate_labels"] = allows_duplicate_labels + return df + @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ @@ -557,6 +655,11 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. """ + self._check_inplace_and_allows_duplicate_labels(inplace) + return self._set_axis_nocheck(labels, axis, inplace) + + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool): + # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy. if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -926,6 +1029,7 @@ def rename( else: index = mapper + self._check_inplace_and_allows_duplicate_labels(inplace) result = self if inplace else self.copy(deep=copy) for axis_no, replacements in enumerate((index, columns)): @@ -950,7 +1054,7 @@ def rename( raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level) - result.set_axis(new_index, axis=axis_no, inplace=True) + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: @@ -1828,11 +1932,11 @@ def __getstate__(self) -> Dict[str, Any]: _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, + _flags={k: self.flags[k] for k in self.flags._keys}, **meta, ) def __setstate__(self, state): - if isinstance(state, BlockManager): self._mgr = state elif isinstance(state, dict): @@ -1843,6 +1947,8 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) + flags = state.get("_flags", dict(allows_duplicate_labels=True)) + object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names # to avoid definitional recursion @@ -1850,7 +1956,7 @@ def __setstate__(self, state): # defined meta = set(self._internal_names + self._metadata) for k in list(meta): - if k in state: + if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) @@ -3802,6 +3908,13 @@ def __delitem__(self, key) -> None: # ---------------------------------------------------------------------- # Unsorted + def _check_inplace_and_allows_duplicate_labels(self, inplace): + if inplace and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'inplace=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). 
@@ -5163,10 +5276,19 @@ def __finalize__( if isinstance(other, NDFrame): for name in other.attrs: self.attrs[name] = other.attrs[name] + + self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. for name in self._metadata: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) + + if method == "concat": + allows_duplicate_labels = all( + x.flags.allows_duplicate_labels for x in other.objs + ) + self.flags.allows_duplicate_labels = allows_duplicate_labels + return self def __getattr__(self, name: str): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 48b02fc525cc1..65b5dfb6df911 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,7 @@ from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.errors import InvalidIndexError +from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes import concat as _concat @@ -488,6 +488,52 @@ def _simple_new(cls, values, name: Label = None): def _constructor(self): return type(self) + def _maybe_check_unique(self): + """ + Check that an Index has no duplicates. + + This is typically only called via + `NDFrame.flags.allows_duplicate_labels.setter` when it's set to + False (duplicates aren't allowed). + + Raises + ------ + DuplicateLabelError + When the index is not unique. + """ + if not self.is_unique: + msg = """Index has duplicates.""" + duplicates = self._format_duplicate_message() + msg += "\n{}".format(duplicates) + + raise DuplicateLabelError(msg) + + def _format_duplicate_message(self): + """ + Construct the DataFrame for a DuplicateLabelError. + + This returns a DataFrame indicating the labels and positions + of duplicates in an index. This should only be called when it's + already known that duplicates are present.
+ + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx._format_duplicate_message() + positions + label + a [0, 2] + """ + from pandas import Series + + duplicates = self[self.duplicated(keep="first")].unique() + assert len(duplicates) + + out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + if self.nlevels == 1: + out = out.rename_axis("label") + return out.to_frame(name="positions") + # -------------------------------------------------------------------- # Index Internals Methods diff --git a/pandas/core/series.py b/pandas/core/series.py index a8a2d300fa168..9d84ce4b9ab2e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -201,7 +201,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Constructors def __init__( - self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, ): if ( @@ -211,7 +211,9 @@ def __init__( and copy is False ): # GH#33357 called with just the SingleBlockManager - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, + ) self.name = name return @@ -330,7 +332,9 @@ def __init__( data = SingleBlockManager.from_array(data, index) - generic.NDFrame.__init__(self, data) + generic.NDFrame.__init__( + self, data, + ) self.name = name self._set_axis(0, index, fastpath=True) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 6ac3004d29996..15389ca2c3e61 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -202,6 +202,27 @@ class NumbaUtilError(Exception): """ +class DuplicateLabelError(ValueError): + """ + Error raised when an operation would introduce duplicate labels. + + .. versionadded:: 1.2.0 + + Examples + -------- + >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( + ... allows_duplicate_labels=False + ... ) + >>> s.reindex(['a', 'a', 'b']) + Traceback (most recent call last): + ... + DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + + class InvalidIndexError(Exception): """ Exception raised when attemping to use an invalid index key. 
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 1d25336cd3b70..54da13c3c620b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -61,6 +61,7 @@ class TestPDApi(Base): "ExcelFile", "ExcelWriter", "Float64Index", + "Flags", "Grouper", "HDFStore", "Index", diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 78a830c7f43d8..9523fba953ad0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -99,7 +99,7 @@ def test_ndarray_compat_properties(index_or_series_obj): assert getattr(obj, p, None) is not None # deprecated properties - for p in ["flags", "strides", "itemsize", "base", "data"]: + for p in ["strides", "itemsize", "base", "data"]: assert not hasattr(obj, p) msg = "can only convert an array of size 1 to a Python scalar" @@ -116,6 +116,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") def test_memory_usage(index_or_series_obj): obj = index_or_series_obj + res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 0716cf5e27119..b1c31a6f90133 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -553,6 +553,33 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.DataFrame({"A": [1, 2]}) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not provided + assert result.flags.allows_duplicate_labels is True + else: + assert result.flags.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + + # We didn't mutate df + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + + # Now we do copy. 
+ result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0, 0] = 10 + assert df.iloc[0, 0] == 0 + def test_cache_on_copy(self): # GH 31784 _item_cache not cleared on copy causes incorrect reads after updates df = DataFrame({"a": [1]}) diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py new file mode 100644 index 0000000000000..97468e1f10a8b --- /dev/null +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -0,0 +1,450 @@ +"""Tests dealing with the NDFrame.allows_duplicates.""" +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +not_implemented = pytest.mark.xfail(reason="Not implemented.") + +# ---------------------------------------------------------------------------- +# Preservation + + +class TestPreserves: + @pytest.mark.parametrize( + "cls, data", + [ + (pd.Series, np.array([])), + (pd.Series, [1, 2]), + (pd.DataFrame, {}), + (pd.DataFrame, {"A": [1, 2]}), + ], + ) + def test_construction_ok(self, cls, data): + result = cls(data) + assert result.flags.allows_duplicate_labels is True + + result = cls(data).set_flags(allows_duplicate_labels=False) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "func", + [ + operator.itemgetter(["a"]), + operator.methodcaller("add", 1), + operator.methodcaller("rename", str.upper), + operator.methodcaller("rename", "name"), + pytest.param(operator.methodcaller("abs"), marks=not_implemented), + # TODO: test np.abs + ], + ) + def test_preserved_series(self, func): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + assert func(s).flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] + ) + # TODO: frame + @not_implemented + def test_align(self, other): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + a, b = s.align(other) + assert a.flags.allows_duplicate_labels is False + assert b.flags.allows_duplicate_labels is False + + def test_preserved_frame(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + assert df.loc[["a"]].flags.allows_duplicate_labels is False + assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False + + @not_implemented + def test_to_frame(self): + s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) + assert s.to_frame().flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("func", ["add", "sub"]) + @pytest.mark.parametrize( + "frame", [False, pytest.param(True, marks=not_implemented)] + ) + @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) + def test_binops(self, func, other, frame): + df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if frame: + df = df.to_frame() + if isinstance(other, pd.Series) and frame: + other = other.to_frame() + func = operator.methodcaller(func, other) + assert df.flags.allows_duplicate_labels is False + assert func(df).flags.allows_duplicate_labels is False + + @not_implemented + def test_preserve_getitem(self): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + assert df[["A"]].flags.allows_duplicate_labels is False + assert df["A"].flags.allows_duplicate_labels is False + assert df.loc[0].flags.allows_duplicate_labels is False + assert 
df.loc[[0]].flags.allows_duplicate_labels is False + assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False + + @pytest.mark.xfail(reason="Unclear behavior.") + def test_ndframe_getitem_caching_issue(self): + # NDFrame.__getitem__ will cache the first df['A']. May need to + # invalidate that cache? Update the cached entries? + df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) + assert df["A"].flags.allows_duplicate_labels is False + df.flags.allows_duplicate_labels = True + assert df["A"].flags.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "objs, kwargs", + [ + # Series + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["c", "d"]).set_flags( + allows_duplicate_labels=False + ), + ], + {}, + ), + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"ignore_index": True}, + ), + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ), + # Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags( + allows_duplicate_labels=False + ), + ], + {}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"ignore_index": True}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ), + # Series / Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series([1, 2], index=["a", "b"], name="B",).set_flags( + allows_duplicate_labels=False, + ), + ], + {"axis": 1}, + ), + ], + ) + def test_concat(self, objs, kwargs): + result = pd.concat(objs, **kwargs) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "left, right, kwargs, expected", + [ + # false false false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags( + allows_duplicate_labels=False + ), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # false true false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # true true true + ( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + True, + ), + ], + ) + def test_merge(self, left, right, kwargs, expected): + result = pd.merge(left, right, **kwargs) + assert result.flags.allows_duplicate_labels is expected + + @not_implemented + def test_groupby(self): + # XXX: This is under tested + # TODO: + # - apply + # - transform + # - Should passing a grouper that disallows duplicates propagate? 
+ df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False) + result = df.groupby([0, 0, 1]).agg("count") + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("frame", [True, False]) + @not_implemented + def test_window(self, frame): + df = pd.Series( + 1, + index=pd.date_range("2000", periods=12), + name="A", + ).set_flags(allows_duplicate_labels=False) + if frame: + df = df.to_frame() + assert df.rolling(3).mean().flags.allows_duplicate_labels is False + assert df.ewm(3).mean().flags.allows_duplicate_labels is False + assert df.expanding(3).mean().flags.allows_duplicate_labels is False + + +# ---------------------------------------------------------------------------- +# Raises + + +class TestRaises: + @pytest.mark.parametrize( + "cls, axes", + [ + (pd.Series, {"index": ["a", "a"], "dtype": float}), + (pd.DataFrame, {"index": ["a", "a"]}), + (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}), + (pd.DataFrame, {"columns": ["b", "b"]}), + ], + ) + def test_set_flags_with_duplicates(self, cls, axes): + result = cls(**axes) + assert result.flags.allows_duplicate_labels is True + + with pytest.raises(pd.errors.DuplicateLabelError): + cls(**axes).set_flags(allows_duplicate_labels=False) + + @pytest.mark.parametrize( + "data", + [ + pd.Series(index=[0, 0], dtype=float), + pd.DataFrame(index=[0, 0]), + pd.DataFrame(columns=[0, 0]), + ], + ) + def test_setting_allows_duplicate_labels_raises(self, data): + with pytest.raises(pd.errors.DuplicateLabelError): + data.flags.allows_duplicate_labels = False + + assert data.flags.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))] + ) + def test_series_raises(self, func): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + with pytest.raises(pd.errors.DuplicateLabelError): + func(s) + + @pytest.mark.parametrize( + "getter, target", + [ + (operator.itemgetter(["A", "A"]), None), + # loc + (operator.itemgetter(["a", "a"]), "loc"), + pytest.param( + operator.itemgetter(("a", ["A", "A"])), "loc", marks=not_implemented + ), + pytest.param( + operator.itemgetter((["a", "a"], "A")), "loc", marks=not_implemented + ), + # iloc + (operator.itemgetter([0, 0]), "iloc"), + pytest.param( + operator.itemgetter((0, [0, 0])), "iloc", marks=not_implemented + ), + pytest.param( + operator.itemgetter(([0, 0], 0)), "iloc", marks=not_implemented + ), + ], + ) + def test_getitem_raises(self, getter, target): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if target: + # df, df.loc, or df.iloc + target = getattr(df, target) + else: + target = df + + with pytest.raises(pd.errors.DuplicateLabelError): + getter(target) + + @pytest.mark.parametrize( + "objs, kwargs", + [ + ( + [ + pd.Series(1, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ) + ], + ) + def test_concat_raises(self, objs, kwargs): + with pytest.raises(pd.errors.DuplicateLabelError): + pd.concat(objs, **kwargs) + + @not_implemented + def test_merge_raises(self): + a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( + allows_duplicate_labels=False + ) + b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) + with pytest.raises(pd.errors.DuplicateLabelError): + pd.merge(a, b, left_index=True, right_index=True) + +
+@pytest.mark.parametrize( + "idx", + [ + pd.Index([1, 1]), + pd.Index(["a", "a"]), + pd.Index([1.1, 1.1]), + pd.PeriodIndex([pd.Period("2000", "D")] * 2), + pd.DatetimeIndex([pd.Timestamp("2000")] * 2), + pd.TimedeltaIndex([pd.Timedelta("1D")] * 2), + pd.CategoricalIndex(["a", "a"]), + pd.IntervalIndex([pd.Interval(0, 1)] * 2), + pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]), + ], + ids=lambda x: type(x).__name__, +) +def test_raises_basic(idx): + with pytest.raises(pd.errors.DuplicateLabelError): + pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) + + +def test_format_duplicate_labels_message(): + idx = pd.Index(["a", "b", "a", "b", "c"]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label") + ) + tm.assert_frame_equal(result, expected) + + +def test_format_duplicate_labels_message_multi(): + idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, + index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_insert_raises(): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + with pytest.raises(ValueError, match="Cannot specify"): + df.insert(0, "A", [3, 4], allow_duplicates=True) + + +@pytest.mark.parametrize( + "method, frame_only", + [ + (operator.methodcaller("set_index", "A", inplace=True), True), + (operator.methodcaller("set_axis", ["A", "B"], inplace=True), False), + (operator.methodcaller("reset_index", inplace=True), True), + (operator.methodcaller("rename", lambda x: x, inplace=True), False), + ], +) +def test_inplace_raises(method, frame_only): + df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags( + allows_duplicate_labels=False + ) + s = df["A"] + s.flags.allows_duplicate_labels = False + msg = "Cannot specify" + + with pytest.raises(ValueError, match=msg): + method(df) + if not frame_only: + with pytest.raises(ValueError, match=msg): + method(s) + + +def test_pickle(): + a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_series_equal(a, b) + + a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_frame_equal(a, b) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 5e66925a38ec6..23bb673586768 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -887,3 +887,13 @@ def test_axis_numbers_deprecated(self, box): obj = box(dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NUMBERS + + @pytest.mark.parametrize("as_frame", [True, False]) + def test_flags_identity(self, as_frame): + s = pd.Series([1, 2]) + if as_frame: + s = s.to_frame() + + assert s.flags is s.flags + s2 = s.copy() + assert s2.flags is not s.flags diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d81e8a4f82ffb..a69c0ee75eaba 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -524,6 +524,32 @@ def test_attrs(self): result = s + 
1 assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.Series([1, 2]) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not provided + assert result.flags.allows_duplicate_labels is True + else: + assert result.flags.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + # We didn't mutate df + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0] = 0 + assert df.iloc[0] == 0 + + # Now we do copy. + result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0] = 10 + assert df.iloc[0] == 0 + class TestCategoricalSeries: @pytest.mark.parametrize( diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py new file mode 100644 index 0000000000000..f6e3ae4980afb --- /dev/null +++ b/pandas/tests/test_flags.py @@ -0,0 +1,48 @@ +import pytest + +import pandas as pd + + +class TestFlags: + def test_equality(self): + a = pd.DataFrame().set_flags(allows_duplicate_labels=True).flags + b = pd.DataFrame().set_flags(allows_duplicate_labels=False).flags + + assert a == a + assert b == b + assert a != b + assert a != 2 + + def test_set(self): + df = pd.DataFrame().set_flags(allows_duplicate_labels=True) + a = df.flags + a.allows_duplicate_labels = False + assert a.allows_duplicate_labels is False + a["allows_duplicate_labels"] = True + assert a.allows_duplicate_labels is True + + def test_repr(self): + a = repr(pd.DataFrame({"A": [1]}).set_flags(allows_duplicate_labels=True).flags) + assert a == "<Flags(allows_duplicate_labels=True)>" + a = repr(pd.DataFrame({"A": [1]}).set_flags(allows_duplicate_labels=False).flags) + assert a == "<Flags(allows_duplicate_labels=False)>" + + def test_obj_ref(self): + df = pd.DataFrame() + flags = df.flags + del df + with pytest.raises(ValueError, match="object has been deleted"): + flags.allows_duplicate_labels = True + + def test_getitem(self): + df = pd.DataFrame() + flags = df.flags + assert flags["allows_duplicate_labels"] is True + flags["allows_duplicate_labels"] = False + assert flags["allows_duplicate_labels"] is False + + with pytest.raises(KeyError): + flags["a"] + + with pytest.raises(ValueError): + flags["a"] = 10 diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3aa3c64923b14..5174ff005b5fb 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -268,3 +268,18 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) tm.assert_frame_equal(left, right, check_dtype=False) + + +def test_allows_duplicate_labels(): + left = pd.DataFrame() + right = pd.DataFrame().set_flags(allows_duplicate_labels=False) + tm.assert_frame_equal(left, left) + tm.assert_frame_equal(right, right) + tm.assert_frame_equal(left, right, check_flags=False) + tm.assert_frame_equal(right, left, check_flags=False) + + with pytest.raises(AssertionError, match="