
Cythonized GroupBy Quantile #20405

Merged
merged 65 commits into from
Feb 28, 2019
Changes from 12 commits
Commits
618ec99
Reorganized existing describe test
WillAyd Mar 15, 2018
74871d8
Added quantile tests and impl
WillAyd Mar 15, 2018
7b6ca68
Broken impl and doc updates
WillAyd Mar 15, 2018
31aff03
Working impl with non-missing; more tests
WillAyd Mar 16, 2018
4a43815
DOC: update the Index.isin docstring (#20249)
noemielteto Mar 18, 2018
eb18823
Working impl with NA data
WillAyd Mar 18, 2018
813da81
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Mar 18, 2018
e152dd5
Added check_names arg to failing tests
WillAyd Mar 18, 2018
7a8fefb
Added tests for dt, object raises
WillAyd Mar 18, 2018
b4938ba
Added interpolation keyword support
WillAyd Mar 19, 2018
3f7d0a9
LINT fix
WillAyd Mar 19, 2018
d7aec3f
Updated benchmarks
WillAyd Mar 19, 2018
e712946
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Aug 7, 2018
72cd30e
Removed errant git diff
WillAyd Aug 7, 2018
a3c4b11
Removed errant pd file
WillAyd Aug 7, 2018
ac96526
Fixed broken function tests
WillAyd Aug 7, 2018
7d439d8
Added check_names=False to tests
WillAyd Aug 7, 2018
3047eed
Py27 compat
WillAyd Aug 7, 2018
70bf89a
LINT fixup
WillAyd Aug 7, 2018
02eb336
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Aug 7, 2018
7c3c349
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Aug 13, 2018
3b9c7c4
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Nov 13, 2018
ad8b184
Replaced double with float64
WillAyd Nov 13, 2018
b846bc2
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Nov 15, 2018
93b122c
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Nov 19, 2018
09308d4
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Nov 24, 2018
1a718f2
Fixed segfault on all NA group
WillAyd Nov 24, 2018
ff062bd
Stylistic and idiomatic test updates
WillAyd Nov 24, 2018
bdb5089
LINT fixup
WillAyd Nov 24, 2018
9b55fb5
Added cast to remove build warning
WillAyd Nov 24, 2018
31e66fc
Used memoryview.shape instead of len
WillAyd Nov 24, 2018
41a734f
Use pytest.raises
WillAyd Nov 24, 2018
67e0f00
Better Cython types
WillAyd Nov 24, 2018
07b0c00
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Nov 26, 2018
86aeb4a
Loosened test expectation on Windows
WillAyd Nov 26, 2018
86b9d8d
Used api types
WillAyd Nov 26, 2018
cfa1b45
Removed test hacks
WillAyd Nov 26, 2018
00085d0
Used is_object_dtype
WillAyd Nov 27, 2018
1f02532
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Dec 25, 2018
3c64c1f
Removed loosened check on agg_result
WillAyd Dec 25, 2018
09695f5
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Jan 9, 2019
68cfed9
isort fixup
WillAyd Jan 9, 2019
4ce1448
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Jan 11, 2019
5e840da
Removed nonlocal variable usage
WillAyd Jan 11, 2019
7969fb6
Updated documentation
WillAyd Jan 11, 2019
f9a8317
LINT fixup
WillAyd Jan 11, 2019
464a831
Reverted errant whatsnew
WillAyd Jan 11, 2019
4b3f9be
Refactor processor signatures
WillAyd Jan 11, 2019
b996e1d
Documentation updates
WillAyd Jan 11, 2019
cdd8985
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Jan 22, 2019
64f46a3
Added empty assignment for variable
WillAyd Jan 22, 2019
4d88e8a
Docstring fixup
WillAyd Jan 22, 2019
1cd93dd
Updated README
WillAyd Jan 22, 2019
9ae23c1
Pytest arg deprecation fix
WillAyd Jan 26, 2019
eb99f07
Removed test_describe test
WillAyd Jan 31, 2019
94d4892
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Jan 31, 2019
0512f37
Moved whatsnew
WillAyd Jan 31, 2019
2370129
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 2, 2019
a018570
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 12, 2019
f41cd05
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 20, 2019
21691bb
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 27, 2019
082aea3
LINT fixup
WillAyd Feb 27, 2019
dc5877a
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 27, 2019
7496a9b
Merge remote-tracking branch 'upstream/master' into grp-desc-perf
WillAyd Feb 28, 2019
ec013bf
LINT fixup
WillAyd Feb 28, 2019
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@
method_blacklist = {
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
'var', 'mad', 'describe', 'std'},
'var', 'mad', 'describe', 'std', 'quantile'},
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
'std'}
@@ -343,8 +343,8 @@ class GroupByMethods(object):
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift', 'size',
'skew', 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
['direct', 'transformation']]

def setup(self, dtype, method, application):
6 changes: 6 additions & 0 deletions pandas/_libs/groupby.pxd
@@ -0,0 +1,6 @@
cdef enum InterpolationEnumType:
INTERPOLATION_LINEAR,
INTERPOLATION_LOWER,
INTERPOLATION_HIGHER,
INTERPOLATION_NEAREST,
INTERPOLATION_MIDPOINT
98 changes: 98 additions & 0 deletions pandas/_libs/groupby.pyx
@@ -379,5 +379,103 @@ def group_any_all(ndarray[uint8_t] out,
out[lab] = flag_val


@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
ndarray[int64_t] labels,
numeric[:] values,
ndarray[uint8_t] mask,
double_t q,
object interpolation):
"""
Calculate the quantile per group.

Parameters
----------
out : ndarray
Array of aggregated values that will be written to.
labels : ndarray
Array containing the unique group labels.
values : ndarray
Array containing the values to apply the function against.
q : double
The quantile value to search for.
mask : ndarray[uint8_t]
Array flagging which entries of `values` are missing (NA); these are
excluded from the quantile calculation.
interpolation : str
Method to use when the desired quantile lies between two points.

Notes
-----
Rather than explicitly returning a value, this function modifies the
provided `out` parameter.
"""
cdef:
Py_ssize_t i, N=len(labels)
int64_t lab, ngroups, grp_sz, non_na_sz, grp_start=0, idx=0
uint8_t interp, offset
numeric val, next_val
double_t q_idx, frac
ndarray[int64_t] counts, non_na_counts
ndarray[int64_t] sort_arr

inter_methods = {
'linear': INTERPOLATION_LINEAR,
'lower': INTERPOLATION_LOWER,
'higher': INTERPOLATION_HIGHER,
'nearest': INTERPOLATION_NEAREST,
'midpoint': INTERPOLATION_MIDPOINT,
}
interp = inter_methods[interpolation]

counts = np.zeros_like(out, dtype=np.int64)
non_na_counts = np.zeros_like(out, dtype=np.int64)
ngroups = len(counts)

# First figure out the size of every group
with nogil:
for i in range(N):
lab = labels[i]
counts[lab] += 1
if not mask[i]:
non_na_counts[lab] += 1

# Get an index of values sorted by labels and then values
assert len(values) == len(labels)
order = (values, labels)
sort_arr = np.lexsort(order).astype(np.int64, copy=False)

with nogil:
for i in range(ngroups):
# Figure out how many group elements there are
grp_sz = counts[i]
non_na_sz = non_na_counts[i]

# Calculate where to retrieve the desired value
# Casting to int will intentionally truncate result
idx = grp_start + <int64_t>(q * <double_t>(non_na_sz - 1))

val = values[sort_arr[idx]]
# If requested quantile falls evenly on a particular index
# then write that index's value out. Otherwise interpolate
q_idx = q * (non_na_sz - 1)
frac = q_idx % 1

if frac == 0.0 or interp == INTERPOLATION_LOWER:
out[i] = val
else:
next_val = values[sort_arr[idx + 1]]
if interp == INTERPOLATION_LINEAR:
out[i] = val + (next_val - val) * frac
elif interp == INTERPOLATION_HIGHER:
out[i] = next_val
elif interp == INTERPOLATION_MIDPOINT:
out[i] = (val + next_val) / 2.0
elif interp == INTERPOLATION_NEAREST:
if frac > .5 or (frac == .5 and q > .5): # Always safe?
out[i] = next_val
else:
out[i] = val

# Increment the index reference in sorted_arr for the next group
grp_start += grp_sz


# generated from template
include "groupby_helper.pxi"
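For readers following the algorithm, the count/lexsort/scan strategy of `group_quantile` above can be modeled in plain NumPy. This is a simplified sketch only — linear interpolation, no NA mask — and `group_quantile_sketch` is a hypothetical name for illustration, not part of pandas:

```python
import numpy as np

def group_quantile_sketch(values, labels, q):
    """Linear-interpolation quantile per group, mirroring the
    count -> lexsort -> per-group scan of the Cython kernel."""
    values = np.asarray(values, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.int64)
    ngroups = labels.max() + 1
    # First figure out the size of every group
    counts = np.bincount(labels, minlength=ngroups)
    # Index that sorts by label first, then by value within each label
    sort_arr = np.lexsort((values, labels))
    out = np.empty(ngroups, dtype=np.float64)
    grp_start = 0
    for i in range(ngroups):
        grp_sz = counts[i]
        q_idx = q * (grp_sz - 1)
        idx = grp_start + int(q_idx)   # int() truncates, like the Cython cast
        frac = q_idx % 1
        val = values[sort_arr[idx]]
        if frac == 0.0:
            out[i] = val
        else:
            next_val = values[sort_arr[idx + 1]]
            out[i] = val + (next_val - val) * frac
        grp_start += grp_sz
    return out
```

With `values = [1, 2, 3, 4, 5, 5, 4, 3, 2, 1]` and labels splitting the halves into two groups, `q=0.5` yields the per-group medians `[3.0, 3.0]`, matching `np.percentile` applied groupwise.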
103 changes: 85 additions & 18 deletions pandas/core/groupby.py
@@ -1732,6 +1732,70 @@ def nth(self, n, dropna=None):

return result

def quantile(self, q=0.5, interpolation='linear'):
"""
Return group values at the given quantile, a la numpy.percentile.

Parameters
----------
q : float or array-like, default 0.5 (50% quantile)
Review comment: checking this should share code with Series.quantile. (or be in cython is ok)

0 <= q <= 1, the quantile(s) to compute
interpolation : str
Review comment: can you list the methods here

Method to use when the desired quantile falls between two points:
'linear', 'lower', 'higher', 'nearest' or 'midpoint'.

Returns
-------
Series or DataFrame
Return type determined by caller of GroupBy object.

See Also
--------
Series.quantile : Similar method for Series
DataFrame.quantile : Similar method for DataFrame
Review comment: can you add numpy.percentile


Examples
--------
>>> df = pd.DataFrame(
... [['foo'] * 5 + ['bar'] * 5,
... [1, 2, 3, 4, 5, 5, 4, 3, 2, 1]],
... columns=['key', 'val'])
>>> df
"""

is_dt = False
is_int = False

def pre_processor(vals):
if vals.dtype == np.object:
Review comment: we really really need to clean this up and simply put this in a class. I would be really happy to do this before this PR.

raise TypeError("'quantile' cannot be performed against "
"'object' dtypes!")
elif vals.dtype == np.int:
nonlocal is_int
is_int = True
elif vals.dtype == 'datetime64[ns]':
vals = vals.astype(np.float)
nonlocal is_dt
is_dt = True

return vals

def post_processor(vals):
if is_dt:
vals = vals.astype('datetime64[ns]')
elif is_int and interpolation in ['lower', 'higher', 'nearest']:
vals = vals.astype(np.int)

return vals

return self._get_cythonized_result('group_quantile', self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q, interpolation=interpolation)
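The `interpolation` keyword selects one of five branches in the Cython kernel. A pure-Python model of that branch logic — `quantile_interp` is a hypothetical helper for illustration, operating on an already-sorted sequence:

```python
def quantile_interp(sorted_vals, q, interpolation='linear'):
    """Quantile of a sorted sequence, mirroring group_quantile's branches."""
    n = len(sorted_vals)
    q_idx = q * (n - 1)
    idx = int(q_idx)          # truncates, like the <int64_t> cast in Cython
    frac = q_idx % 1
    val = sorted_vals[idx]
    # Quantile falls exactly on an index, or caller wants the lower value
    if frac == 0.0 or interpolation == 'lower':
        return val
    next_val = sorted_vals[idx + 1]
    if interpolation == 'linear':
        return val + (next_val - val) * frac
    if interpolation == 'higher':
        return next_val
    if interpolation == 'midpoint':
        return (val + next_val) / 2.0
    if interpolation == 'nearest':
        return next_val if frac > .5 or (frac == .5 and q > .5) else val
    raise ValueError(interpolation)
```

For `[1, 2, 3, 4]` at `q=0.5` the index lands halfway between 2 and 3, so 'linear' and 'midpoint' give 2.5, 'lower' gives 2, 'higher' gives 3, and 'nearest' resolves the tie toward the lower value since `q` is not above 0.5.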

@Substitution(name='groupby')
def ngroup(self, ascending=True):
"""
@@ -1928,43 +1992,46 @@ def cummax(self, axis=0, **kwargs):
def _get_cythonized_result(self, how, grouper, aggregate=False,
cython_dtype=None, needs_values=False,
needs_mask=False, needs_ngroups=False,
result_is_index=False,
pre_processing=None, post_processing=None,
**kwargs):
"""Get result for Cythonized functions
result_is_index=False, pre_processing=None,
post_processing=None, **kwargs):
"""
Get result for Cythonized functions.

Parameters
----------
how : str, Cythonized function name to be called
grouper : Grouper object containing pertinent group info
how : str
Cythonized function name to be called.
grouper : pandas.Grouper
Grouper object containing pertinent group info.
aggregate : bool, default False
Whether the result should be aggregated to match the number of
groups
groups.
cython_dtype : default None
Type of the array that will be modified by the Cython call. If
`None`, the type will be inferred from the values of each slice
`None`, the type will be inferred from the values of each slice.
needs_values : bool, default False
Whether the values should be a part of the Cython call
signature
signature.
needs_mask : bool, default False
Whether boolean mask needs to be part of the Cython call
signature
signature.
needs_ngroups : bool, default False
Whether number of groups is part of the Cython call signature
Whether number of groups is part of the Cython call signature.
result_is_index : bool, default False
Whether the result of the Cython operation is an index of
values to be retrieved, instead of the actual values themselves
values to be retrieved, instead of the actual values themselves.
pre_processing : function, default None
Function to be applied to `values` prior to passing to Cython
Raises if `needs_values` is False
Function to be applied to `values` prior to passing to Cython.
Raises if `needs_values` is False.
post_processing : function, default None
Function to be applied to result of Cython function
**kwargs : dict
Extra arguments to be passed back to Cython funcs
Function to be applied to result of Cython function.
**kwargs
Extra arguments to be passed back to Cython funcs.

Returns
-------
`Series` or `DataFrame` with filled values
`Series` or `DataFrame`
Object type determined by caller of the ``GroupBy`` object.
"""
if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both "
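The `pre_processing`/`post_processing` hooks used by `quantile` rely on datetime64 values surviving a round trip through float64 (the kernel only operates on numeric data). A quick sketch of that assumption — note that day- and second-aligned timestamps are exactly representable in float64, while arbitrary nanosecond values can lose precision:

```python
import numpy as np

# Sketch of the datetime pre/post-processing round trip: cast
# datetime64[ns] to float64 for the Cython kernel, then back.
dates = np.array(['2000-03-11', '2000-03-12'], dtype='datetime64[ns]')
as_float = dates.astype(np.float64)            # ns since epoch, as float
round_trip = as_float.astype('datetime64[ns]')
assert (round_trip == dates).all()
```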
66 changes: 60 additions & 6 deletions pandas/core/indexes/base.py
@@ -3396,8 +3396,11 @@ def map(self, mapper, na_action=None):

def isin(self, values, level=None):
"""
Return a boolean array where the index values are in `values`.

Compute boolean array of whether each index value is found in the
passed set of values.
passed set of values. The length of the returned boolean array matches
the length of the index.

Parameters
----------
@@ -3406,23 +3409,74 @@ def isin(self, values, level=None):

.. versionadded:: 0.18.1

Support for values as a set
Support for values as a set.

level : str or int, optional
Name or position of the index level to use (if the index is a
MultiIndex).
`MultiIndex`).

Returns
-------
is_contained : ndarray
NumPy array of boolean values.

See also
--------
Series.isin : Same for Series.
DataFrame.isin : Same method for DataFrames.

Notes
-----
In the case of `MultiIndex` you must either specify `values` as a
list-like object containing tuples that are the same length as the
number of levels, or specify `level`. Otherwise it will raise a
``ValueError``.

If `level` is specified:

- if it is the name of one *and only one* index level, use that level;
- otherwise it should be a number indicating level position.

Returns
-------
is_contained : ndarray (boolean dtype)
Examples
--------
>>> idx = pd.Index([1,2,3])
>>> idx
Int64Index([1, 2, 3], dtype='int64')

Check whether each index value is in a list of values.

>>> idx.isin([1, 4])
array([ True, False, False])

>>> midx = pd.MultiIndex.from_arrays([[1,2,3],
... ['red', 'blue', 'green']],
... names=('number', 'color'))
>>> midx
MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']],
labels=[[0, 1, 2], [2, 0, 1]],
names=['number', 'color'])

Check whether the strings in the 'color' level of the MultiIndex
are in a list of colors.

>>> midx.isin(['red', 'orange', 'yellow'], level='color')
array([ True, False, False])

To check across the levels of a MultiIndex, pass a list of tuples:

>>> midx.isin([(1, 'red'), (3, 'red')])
array([ True, False, False])

For a DatetimeIndex, string values in `values` are converted to
Timestamps.

>>> dates = ['2000-03-11', '2000-03-12', '2000-03-13']
>>> dti = pd.to_datetime(dates)
>>> dti
DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'],
dtype='datetime64[ns]', freq=None)

>>> dti.isin(['2000-03-11'])
array([ True, False, False])
"""
if level is not None:
self._validate_index_level(level)
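On a flat index, the semantics documented above match NumPy's `np.isin`. A sketch — `index_isin_sketch` is a hypothetical helper; the real `Index.isin` additionally handles `MultiIndex` levels and datetime string coercion:

```python
import numpy as np

def index_isin_sketch(index_values, values):
    """Boolean array: True where each index entry appears in `values`."""
    return np.isin(np.asarray(index_values), list(values))
```

For example, `index_isin_sketch([1, 2, 3], [1, 4])` returns a boolean array the same length as the index, True only for the first entry.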