Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: add 'level' kwarg to 'Index.isin' method #7892

Merged
merged 1 commit into from
Aug 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ and :ref:`Advanced Indexing <indexing.advanced>` you may select along more than
.. _indexing.basics.indexing_isin:

Indexing with isin
~~~~~~~~~~~~~~~~~~
------------------

Consider the ``isin`` method of Series, which returns a boolean vector that is
true wherever the Series elements exist in the passed list. This allows you to
Expand All @@ -591,13 +591,30 @@ select rows where one or more columns have values you want:
.. ipython:: python
s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
s
s.isin([2, 4, 6])
s[s.isin([2, 4, 6])]
The same method is available for ``Index`` objects and is useful for the cases
when you don't know which of the sought labels are in fact present:

s.isin([2, 4])
.. ipython:: python
s[s.index.isin([2, 4, 6])]
s[s.isin([2, 4])]
# compare it to the following
s[[2, 4, 6]]
In addition to that, ``MultiIndex`` allows selecting a separate level to use
in the membership check:

.. ipython:: python
s_mi = Series(np.arange(6),
index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of
values as either an array or dict. If values is an array, ``isin`` returns
Expand Down Expand Up @@ -1622,12 +1639,6 @@ with duplicates dropped.
idx1.sym_diff(idx2)
idx1 ^ idx2
The ``isin`` method of Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

One additional operation is the ``isin`` method that works analogously to the
``Series.isin`` method found :ref:`here <indexing.boolean>`.

.. _indexing.hierarchical:

Hierarchical indexing (MultiIndex)
Expand Down
13 changes: 13 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,19 @@ API changes
strings must contain 244 or fewer characters. Attempting to write Stata
dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)

- ``Index.isin`` now supports a ``level`` argument to specify which index level
to use for membership tests (:issue:`7892`, :issue:`7890`)

.. code-block:: python

In [1]: idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])

In [2]: idx.values
Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object)

In [3]: idx.isin(['a', 'c', 'e'], level=1)
Out[3]: array([ True, False, True, True, False, True], dtype=bool)


.. _whatsnew_0150.cat:

Expand Down
80 changes: 58 additions & 22 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pandas.index as _index
from pandas.lib import Timestamp, is_datetime_array
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
from pandas.util.decorators import cache_readonly, deprecate
from pandas.util.decorators import cache_readonly, deprecate, Appender
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import (_values_from_object, is_float, is_integer,
Expand Down Expand Up @@ -687,13 +687,29 @@ def _engine(self):
# property, for now, slow to look up
return self._engine_type(lambda: self.values, len(self))

def _validate_index_level(self, level):
"""
Validate index level.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This already exists as _get_level_number (oh, I see you are extending it), OK then.

For single-level Index getting level number is a no-op, but some
verification must be done like in MultiIndex.
"""
if isinstance(level, int):
if level < 0 and level != -1:
raise IndexError("Too many levels: Index has only 1 level,"
" %d is not a valid level number" % (level,))
elif level > 0:
raise IndexError("Too many levels:"
" Index has only 1 level, not %d" %
(level + 1))
elif level != self.name:
raise KeyError('Level %s must be same as name (%s)'
% (level, self.name))

def _get_level_number(self, level):
if not isinstance(level, int):
if level != self.name:
raise AssertionError('Level %s must be same as name (%s)'
% (level, self.name))
level = 0
return level
self._validate_index_level(level)
return 0

@cache_readonly
def inferred_type(self):
Expand Down Expand Up @@ -1271,7 +1287,7 @@ def get_level_values(self, level):
values : ndarray
"""
# checks that level number is actually just 1
self._get_level_number(level)
self._validate_index_level(level)
return self

def get_indexer(self, target, method=None, limit=None):
Expand Down Expand Up @@ -1370,20 +1386,34 @@ def groupby(self, to_groupby):
def map(self, mapper):
return self._arrmap(self.values, mapper)

def isin(self, values):
def isin(self, values, level=None):
"""
Compute boolean array of whether each index value is found in the
passed set of values
Parameters
----------
values : set or sequence of values
Sought values.
level : str or int, optional
Name or position of the index level to use (if the index is a
MultiIndex).
Notes
-----
If `level` is specified:
- if it is the name of one *and only one* index level, use that level;
- otherwise it should be a number indicating level position.
Returns
-------
is_contained : ndarray (boolean dtype)
"""
value_set = set(values)
if level is not None:
self._validate_index_level(level)
return lib.ismember(self._array_values(), value_set)

def _array_values(self):
Expand Down Expand Up @@ -2149,20 +2179,11 @@ def hasnans(self):
def is_unique(self):
return super(Float64Index, self).is_unique and self._nan_idxs.size < 2

def isin(self, values):
"""
Compute boolean array of whether each index value is found in the
passed set of values
Parameters
----------
values : set or sequence of values
Returns
-------
is_contained : ndarray (boolean dtype)
"""
@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
value_set = set(values)
if level is not None:
self._validate_index_level(level)
return lib.ismember_nans(self._array_values(), value_set,
isnull(list(value_set)).any())

Expand Down Expand Up @@ -4052,6 +4073,21 @@ def _wrap_joined_index(self, joined, other):
names = self.names if self.names == other.names else None
return MultiIndex.from_tuples(joined, names=names)

@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
    # Without a level, membership is tested against the index's own
    # array values as a whole.
    if level is None:
        return lib.ismember(self._array_values(), set(values))

    # Resolve the level name/position to a level number; an invalid
    # ``level`` is rejected by ``_get_level_number``.
    num = self._get_level_number(level)
    levs = self.levels[num]
    labs = self.labels[num]

    # An empty level cannot contain any sought value: answer all-False
    # right away instead of computing labels that would be discarded.
    if levs.size == 0:
        return np.zeros(len(labs), dtype=np.bool_)

    # Positions inside the level whose values are sought; an entry of the
    # index matches when its label is one of those positions.
    sought_labels = levs.isin(values).nonzero()[0]

    # Use the public ``np.in1d`` entry point rather than reaching into the
    # internal ``np.lib.arraysetops`` module.
    return np.in1d(labs, sought_labels)


# For utility purposes

Expand Down
94 changes: 93 additions & 1 deletion pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ def test_get_set_value(self):
self.assertEqual(values[67], 10)

def test_isin(self):
values = ['foo', 'bar']
values = ['foo', 'bar', 'quux']

idx = Index(['qux', 'baz', 'foo', 'bar'])
result = idx.isin(values)
Expand All @@ -853,6 +853,49 @@ def test_isin(self):
self.assertEqual(len(result), 0)
self.assertEqual(result.dtype, np.bool_)

def test_isin_nan(self):
    same = self.assert_numpy_array_equal

    # (index under test, sought values, expected membership mask)
    object_cases = [
        (Index(['a', np.nan]), [np.nan], [False, True]),
        (Index(['a', pd.NaT]), [pd.NaT], [False, True]),
        (Index(['a', np.nan]), [float('nan')], [False, False]),
        (Index(['a', np.nan]), [pd.NaT], [False, False]),
    ]
    for index, sought, mask in object_cases:
        same(index.isin(sought), mask)

    # Float64Index overrides isin, so must be checked separately
    for missing in (np.nan, float('nan'), pd.NaT):
        same(Float64Index([1.0, np.nan]).isin([missing]), [False, True])

def test_isin_level_kwarg(self):
    # On a single-level index only position 0 / -1 or the index name are
    # accepted as ``level``; everything else must raise.
    def check_idx(idx):
        sought = idx.tolist()[-2:] + ['nonexisting']
        mask = np.array([False, False, True, True])

        for position in (0, -1):
            self.assert_numpy_array_equal(mask,
                                          idx.isin(sought, level=position))

        for position in (1, 10, -2):
            self.assertRaises(IndexError, idx.isin, sought, level=position)

        # the index has no name yet, so no label can match
        for label in (1.0, 'foobar'):
            self.assertRaises(KeyError, idx.isin, sought, level=label)

        idx.name = 'foobar'
        self.assert_numpy_array_equal(mask,
                                      idx.isin(sought, level='foobar'))

        for label in ('xyzzy', np.nan):
            self.assertRaises(KeyError, idx.isin, sought, level=label)

    # Float64Index overrides isin, so must be checked separately
    for index in (Index(['qux', 'baz', 'foo', 'bar']),
                  Float64Index([1.0, 2.0, 3.0, 4.0])):
        check_idx(index)

def test_boolean_cmp(self):
values = [1, 2, 3, 4]

Expand Down Expand Up @@ -2948,6 +2991,55 @@ def test_level_setting_resets_attributes(self):
# if this fails, probably didn't reset the cache correctly.
assert not ind.is_monotonic

def test_isin(self):
    sought = [('foo', 2), ('bar', 3), ('quux', 4)]

    mi = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
                                 np.arange(4)])
    self.assert_numpy_array_equal(mi.isin(sought),
                                  np.array([False, False, True, True]))

    # empty, return dtype bool
    empty_mi = MultiIndex.from_arrays([[], []])
    mask = empty_mi.isin(sought)
    self.assertEqual(len(mask), 0)
    self.assertEqual(mask.dtype, np.bool_)

def test_isin_nan(self):
    mi = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])

    # neither the shared np.nan object nor a fresh NaN matches a tuple
    for nan_value in (np.nan, float('nan')):
        self.assert_numpy_array_equal(mi.isin([('bar', nan_value)]),
                                      [False, False])

def test_isin_level_kwarg(self):
    mi = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
                                 np.arange(4)])
    first_vals = ['foo', 'bar', 'quux']
    second_vals = [2, 3, 10]
    mask = np.array([False, False, True, True])

    # each level can be addressed by its positive or negative position
    for sought, position in ((first_vals, 0), (first_vals, -2),
                             (second_vals, 1), (second_vals, -1)):
        self.assert_numpy_array_equal(mask, mi.isin(sought, level=position))

    for position in (5, -5):
        self.assertRaises(IndexError, mi.isin, first_vals, level=position)

    # floats are not level positions, and 'A' is not a level name yet
    for sought, label in ((first_vals, 1.0), (second_vals, -1.0),
                          (second_vals, 'A')):
        self.assertRaises(KeyError, mi.isin, sought, level=label)

    mi.names = ['A', 'B']
    for sought, label in ((first_vals, 'A'), (second_vals, 'B')):
        self.assert_numpy_array_equal(mask, mi.isin(sought, level=label))

    self.assertRaises(KeyError, mi.isin, second_vals, level='C')


def test_get_combined_index():
from pandas.core.index import _get_combined_index
Expand Down