Skip to content

Commit

Permalink
Merge pull request #7892 from immerrr/add-level-kwarg-for-index-isin
Browse files Browse the repository at this point in the history
API: add 'level' kwarg to 'Index.isin' method
  • Loading branch information
jreback committed Aug 4, 2014
2 parents 50f0959 + 7708590 commit 0646ad5
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 33 deletions.
31 changes: 21 additions & 10 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ and :ref:`Advanced Indexing <indexing.advanced>` you may select along more than
.. _indexing.basics.indexing_isin:

Indexing with isin
~~~~~~~~~~~~~~~~~~
------------------

Consider the ``isin`` method of Series, which returns a boolean vector that is
true wherever the Series elements exist in the passed list. This allows you to
Expand All @@ -591,13 +591,30 @@ select rows where one or more columns have values you want:
.. ipython:: python
s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
s
s.isin([2, 4, 6])
s[s.isin([2, 4, 6])]
The same method is available for ``Index`` objects and is useful for the cases
when you don't know which of the sought labels are in fact present:

s.isin([2, 4])
.. ipython:: python
s[s.index.isin([2, 4, 6])]
s[s.isin([2, 4])]
# compare it to the following
s[[2, 4, 6]]
In addition to that, ``MultiIndex`` allows selecting a separate level to use
in the membership check:

.. ipython:: python
s_mi = Series(np.arange(6),
index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of
values as either an array or dict. If values is an array, ``isin`` returns
Expand Down Expand Up @@ -1622,12 +1639,6 @@ with duplicates dropped.
idx1.sym_diff(idx2)
idx1 ^ idx2
The ``isin`` method of Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

One additional operation is the ``isin`` method that works analogously to the
``Series.isin`` method found :ref:`here <indexing.boolean>`.

.. _indexing.hierarchical:

Hierarchical indexing (MultiIndex)
Expand Down
13 changes: 13 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,19 @@ API changes
strings must contain 244 or fewer characters. Attempting to write Stata
dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)

- ``Index.isin`` now supports a ``level`` argument to specify which index level
to use for membership tests (:issue:`7892`, :issue:`7890`)

.. code-block:: python

In [1]: idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])

In [2]: idx.values
Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object)

In [3]: idx.isin(['a', 'c', 'e'], level=1)
Out[3]: array([ True, False, True, True, False, True], dtype=bool)


.. _whatsnew_0150.cat:

Expand Down
80 changes: 58 additions & 22 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pandas.index as _index
from pandas.lib import Timestamp, is_datetime_array
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
from pandas.util.decorators import cache_readonly, deprecate
from pandas.util.decorators import cache_readonly, deprecate, Appender
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import (_values_from_object, is_float, is_integer,
Expand Down Expand Up @@ -687,13 +687,29 @@ def _engine(self):
# property, for now, slow to look up
return self._engine_type(lambda: self.values, len(self))

def _validate_index_level(self, level):
    """
    Check that ``level`` refers to the only level of this flat Index.

    A single-level Index has nothing to look up, so resolving a level is
    a no-op; this method only rejects references that cannot denote that
    one level.

    Parameters
    ----------
    level : int or object
        Either a position (only ``0`` and ``-1`` are accepted) or a
        label, which must compare equal to ``self.name``.

    Raises
    ------
    IndexError
        If ``level`` is an int other than ``0`` or ``-1``.
    KeyError
        If ``level`` is not an int and is not equal to ``self.name``.
    """
    if isinstance(level, int):
        # Positional reference: -1 and 0 both denote the single level.
        if level < 0 and level != -1:
            raise IndexError("Too many levels: Index has only 1 level,"
                             " %d is not a valid level number" % (level,))
        elif level > 0:
            # Report how many levels the caller implicitly asked for.
            raise IndexError("Too many levels:"
                             " Index has only 1 level, not %d" %
                             (level + 1))
    elif level != self.name:
        # Anything that is not an int is treated as a level name.
        raise KeyError('Level %s must be same as name (%s)'
                       % (level, self.name))

def _get_level_number(self, level):
if not isinstance(level, int):
if level != self.name:
raise AssertionError('Level %s must be same as name (%s)'
% (level, self.name))
level = 0
return level
self._validate_index_level(level)
return 0

@cache_readonly
def inferred_type(self):
Expand Down Expand Up @@ -1271,7 +1287,7 @@ def get_level_values(self, level):
values : ndarray
"""
# checks that level number is actually just 1
self._get_level_number(level)
self._validate_index_level(level)
return self

def get_indexer(self, target, method=None, limit=None):
Expand Down Expand Up @@ -1370,20 +1386,34 @@ def groupby(self, to_groupby):
def map(self, mapper):
    """Return ``self._arrmap`` applied to this index's values and ``mapper``."""
    arrmap = self._arrmap
    return arrmap(self.values, mapper)

def isin(self, values):
def isin(self, values, level=None):
    """
    Compute boolean array of whether each index value is found in the
    passed set of values

    Parameters
    ----------
    values : set or sequence of values
        Sought values.
    level : str or int, optional
        Name or position of the index level to use (if the index is a
        MultiIndex).

    Returns
    -------
    is_contained : ndarray (boolean dtype)

    Raises
    ------
    IndexError
        If `level` is an integer that is not a valid level position.
    KeyError
        If `level` is not an integer and does not match a level name.

    Notes
    -----
    If `level` is specified:

    - if it is the name of one *and only one* index level, use that level;
    - otherwise it should be a number indicating level position.
    """
    value_set = set(values)
    if level is not None:
        # A flat index has a single level; only check that `level` names it.
        self._validate_index_level(level)
    return lib.ismember(self._array_values(), value_set)

def _array_values(self):
Expand Down Expand Up @@ -2149,20 +2179,11 @@ def hasnans(self):
def is_unique(self):
return super(Float64Index, self).is_unique and self._nan_idxs.size < 2

def isin(self, values):
"""
Compute boolean array of whether each index value is found in the
passed set of values
Parameters
----------
values : set or sequence of values
Returns
-------
is_contained : ndarray (boolean dtype)
"""
@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
    # Deliberately no docstring of its own: the decorator is handed
    # Index.isin.__doc__.
    sought = set(values)
    if level is not None:
        self._validate_index_level(level)
    own_values = self._array_values()
    # Last argument: whether any of the sought values is null.
    sought_has_null = isnull(list(sought)).any()
    return lib.ismember_nans(own_values, sought, sought_has_null)

Expand Down Expand Up @@ -4052,6 +4073,21 @@ def _wrap_joined_index(self, joined, other):
names = self.names if self.names == other.names else None
return MultiIndex.from_tuples(joined, names=names)

@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
    # Deliberately no docstring of its own: the decorator is handed
    # Index.isin.__doc__.
    if level is None:
        # No level requested: test the index entries themselves.
        return lib.ismember(self._array_values(), set(values))

    num = self._get_level_number(level)
    levs = self.levels[num]
    labs = self.labels[num]

    # Positions within the level's values that match `values`; an entry
    # matches when its label for this level is one of those positions.
    sought_labels = levs.isin(values).nonzero()[0]
    if levs.size == 0:
        # Empty level: nothing can match, but the result must stay boolean.
        return np.zeros(len(labs), dtype=np.bool_)
    # Use the public NumPy entry point rather than the internal
    # ``np.lib.arraysetops`` module path.
    return np.in1d(labs, sought_labels)


# For utility purposes

Expand Down
94 changes: 93 additions & 1 deletion pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ def test_get_set_value(self):
self.assertEqual(values[67], 10)

def test_isin(self):
values = ['foo', 'bar']
values = ['foo', 'bar', 'quux']

idx = Index(['qux', 'baz', 'foo', 'bar'])
result = idx.isin(values)
Expand All @@ -853,6 +853,49 @@ def test_isin(self):
self.assertEqual(len(result), 0)
self.assertEqual(result.dtype, np.bool_)

def test_isin_nan(self):
    # Each case is (index, sought values, expected mask); checked in order.
    cases = [
        (Index(['a', np.nan]), [np.nan], [False, True]),
        (Index(['a', pd.NaT]), [pd.NaT], [False, True]),
        (Index(['a', np.nan]), [float('nan')], [False, False]),
        (Index(['a', np.nan]), [pd.NaT], [False, False]),
        # Float64Index overrides isin, so must be checked separately
        (Float64Index([1.0, np.nan]), [np.nan], [False, True]),
        (Float64Index([1.0, np.nan]), [float('nan')], [False, True]),
        (Float64Index([1.0, np.nan]), [pd.NaT], [False, True]),
    ]
    for idx, sought, expected in cases:
        self.assert_numpy_array_equal(idx.isin(sought), expected)

def test_isin_level_kwarg(self):
    def check_idx(idx):
        # Last two labels of the index plus one value that is absent.
        values = idx.tolist()[-2:] + ['nonexisting']
        expected = np.array([False, False, True, True])

        # 0 and -1 both address the only level of a flat index.
        for valid_level in (0, -1):
            self.assert_numpy_array_equal(
                expected, idx.isin(values, level=valid_level))

        # Any other integer position is out of range.
        for bad_position in (1, 10, -2):
            self.assertRaises(IndexError, idx.isin, values,
                              level=bad_position)

        # Non-integer levels are looked up as names; none is set yet.
        for bad_name in (1.0, 'foobar'):
            self.assertRaises(KeyError, idx.isin, values, level=bad_name)

        idx.name = 'foobar'
        self.assert_numpy_array_equal(expected,
                                      idx.isin(values, level='foobar'))

        for bad_name in ('xyzzy', np.nan):
            self.assertRaises(KeyError, idx.isin, values, level=bad_name)

    # Float64Index overrides isin, so must be checked separately
    for idx in (Index(['qux', 'baz', 'foo', 'bar']),
                Float64Index([1.0, 2.0, 3.0, 4.0])):
        check_idx(idx)

def test_boolean_cmp(self):
values = [1, 2, 3, 4]

Expand Down Expand Up @@ -2948,6 +2991,55 @@ def test_level_setting_resets_attributes(self):
# if this fails, probably didn't reset the cache correctly.
assert not ind.is_monotonic

def test_isin(self):
    sought = [('foo', 2), ('bar', 3), ('quux', 4)]

    # Two of the four entries are among the sought tuples.
    mi = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
                                 np.arange(4)])
    mask = mi.isin(sought)
    self.assert_numpy_array_equal(mask,
                                  np.array([False, False, True, True]))

    # empty, return dtype bool
    empty_mask = MultiIndex.from_arrays([[], []]).isin(sought)
    self.assertEqual(len(empty_mask), 0)
    self.assertEqual(empty_mask.dtype, np.bool_)

def test_isin_nan(self):
    idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
    # Neither spelling of NaN inside a sought tuple is expected to match.
    for nan_value in (np.nan, float('nan')):
        self.assert_numpy_array_equal(idx.isin([('bar', nan_value)]),
                                      [False, False])

def test_isin_level_kwarg(self):
    idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
                                  np.arange(4)])

    vals_0 = ['foo', 'bar', 'quux']
    vals_1 = [2, 3, 10]
    expected = np.array([False, False, True, True])

    # Each level is addressed by its positive and its negative position.
    for vals, position in [(vals_0, 0), (vals_0, -2),
                           (vals_1, 1), (vals_1, -1)]:
        self.assert_numpy_array_equal(expected,
                                      idx.isin(vals, level=position))

    for bad_position in (5, -5):
        self.assertRaises(IndexError, idx.isin, vals_0,
                          level=bad_position)

    # Non-integer levels are treated as names; the index is unnamed here.
    for vals, bad_name in [(vals_0, 1.0), (vals_1, -1.0), (vals_1, 'A')]:
        self.assertRaises(KeyError, idx.isin, vals, level=bad_name)

    idx.names = ['A', 'B']
    for vals, name in [(vals_0, 'A'), (vals_1, 'B')]:
        self.assert_numpy_array_equal(expected,
                                      idx.isin(vals, level=name))

    self.assertRaises(KeyError, idx.isin, vals_1, level='C')


def test_get_combined_index():
from pandas.core.index import _get_combined_index
Expand Down

0 comments on commit 0646ad5

Please sign in to comment.