pandas-dev · jreback · Feb 28, 2019 · Mar 15, 2018 · Mar 15, 2018 · Mar 15, 2018
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -12,7 +12,7 @@
 method_blacklist = {
     'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
                'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
-               'var', 'mad', 'describe', 'std'},
+               'var', 'mad', 'describe', 'std', 'quantile'},
     'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
                  'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
                  'std'}
@@ -314,8 +314,9 @@ class GroupByMethods(object):
               ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
                'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
                'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
-               'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-               'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+               'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift',
+               'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts',
+               'var'],
               ['direct', 'transformation']]
 
     def setup(self, dtype, method, application):

diff --git a/pandas/_libs/groupby.pxd b/pandas/_libs/groupby.pxd
@@ -0,0 +1,6 @@
+cdef enum InterpolationEnumType:  # modes selected by name in group_quantile
+    INTERPOLATION_LINEAR,    # interpolation='linear'
+    INTERPOLATION_LOWER,     # interpolation='lower'
+    INTERPOLATION_HIGHER,    # interpolation='higher'
+    INTERPOLATION_NEAREST,   # interpolation='nearest'
+    INTERPOLATION_MIDPOINT   # interpolation='midpoint'
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -380,5 +380,106 @@ def group_any_all(uint8_t[:] out,
                 out[lab] = flag_val
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_quantile(ndarray[float64_t] out,
+                   ndarray[int64_t] labels,
+                   numeric[:] values,
+                   ndarray[uint8_t] mask,
+                   float64_t q,
+                   object interpolation):
+    """
+    Calculate the quantile per group.
+
+    Parameters
+    ----------
+    out : ndarray
+        Array of aggregated values that will be written to.
+    labels : ndarray
+        Group label of each value; values with a negative label are ignored.
+    values : ndarray
+        Array containing the values to apply the function against.
+    mask : ndarray
+        Non-zero where the corresponding entry of `values` is missing.
+    q : float
+        The quantile value to search for.
+    interpolation : {'linear', 'lower', 'higher', 'nearest', 'midpoint'}
+        Method to use when the quantile falls between two values.
+
+    Notes
+    -----
+    Rather than explicitly returning a value, this function modifies the
+    provided `out` parameter.
+    """
+    cdef:
+        Py_ssize_t i, N=len(labels)
+        int64_t lab, ngroups, non_na_sz, grp_start=0, idx=0
+        uint8_t interp
+        numeric val, next_val
+        float64_t q_idx, frac
+        ndarray[int64_t] counts, non_na_counts, sort_arr
+
+    assert <Py_ssize_t>len(values) == N
+    inter_methods = {
+        'linear': INTERPOLATION_LINEAR,
+        'lower': INTERPOLATION_LOWER,
+        'higher': INTERPOLATION_HIGHER,
+        'nearest': INTERPOLATION_NEAREST,
+        'midpoint': INTERPOLATION_MIDPOINT,
+    }
+    interp = inter_methods[interpolation]
+
+    counts = np.zeros_like(out, dtype=np.int64)
+    non_na_counts = np.zeros_like(out, dtype=np.int64)
+    ngroups = len(counts)
+
+    # First figure out the size of every group
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                grp_start += 1  # in no group; sorts ahead of group 0 below
+                continue
+            counts[lab] += 1
+            if not mask[i]:
+                non_na_counts[lab] += 1
+
+    # Index sorted by label, then non-missing before missing, then by value
+    sort_arr = np.lexsort((values, mask, labels)).astype(np.int64, copy=False)
+
+    with nogil:
+        for i in range(ngroups):
+            non_na_sz = non_na_counts[i]
+            if non_na_sz == 0:
+                out[i] = NaN
+            else:
+                # Fractional position of q among the group's non-missing
+                # values; the truncating cast selects the lower neighbour
+                q_idx = q * (non_na_sz - 1)
+                idx = grp_start + <int64_t>q_idx
+                frac = q_idx % 1
+                val = values[sort_arr[idx]]
+
+                # Use the value as-is if q lands exactly on it, else interpolate
+                if frac == 0.0 or interp == INTERPOLATION_LOWER:
+                    out[i] = val
+                else:
+                    next_val = values[sort_arr[idx + 1]]
+                    if interp == INTERPOLATION_LINEAR:
+                        out[i] = val + (next_val - val) * frac
+                    elif interp == INTERPOLATION_HIGHER:
+                        out[i] = next_val
+                    elif interp == INTERPOLATION_MIDPOINT:
+                        out[i] = (val + next_val) / 2.0
+                    elif interp == INTERPOLATION_NEAREST:
+                        if frac > .5 or (frac == .5 and q > .5):  # Always OK?
+                            out[i] = next_val
+                        else:
+                            out[i] = val
+
+            # Step past every row of this group, missing values included
+            grp_start += counts[i]
+
+
 # generated from template
 include "groupby_helper.pxi"
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1670,6 +1670,72 @@ def nth(self, n, dropna=None):
 
         return result
 
+    def quantile(self, q=0.5, interpolation='linear'):
+        """
+        Return group values at the given quantile, a la numpy.percentile.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            0 <= q <= 1, the quantile(s) to compute
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            Method to use when the desired quantile falls between two points.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return type determined by caller of GroupBy object.
+
+        See Also
+        --------
+        Series.quantile : Similar method for Series
+        DataFrame.quantile : Similar method for DataFrame
+        numpy.percentile : NumPy method to compute qth percentile
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...    [['foo'] * 5 + ['bar'] * 5,
+        ...     [1, 2, 3, 4, 5, 5, 4, 3, 2, 1]],
+        ...    columns=['key', 'val'])
+        >>> df
+        """
+
+        inferences = {  # TODO (py27): replace with nonlocal
+            'is_dt': False,
+            'is_int': False
+        }
+
+        def pre_processor(vals):
+            if vals.dtype == np.object:
+                raise TypeError("'quantile' cannot be performed against "
+                                "'object' dtypes!")
+            elif vals.dtype == np.int:
+                inferences['is_int'] = True
+            elif vals.dtype == 'datetime64[ns]':
+                vals = vals.astype(np.float)
+                inferences['is_dt'] = True
+
+            return vals
+
+        def post_processor(vals):
+            if inferences['is_dt']:
+                vals = vals.astype('datetime64[ns]')
+            elif inferences['is_int'] and interpolation in [
+                    'lower', 'higher', 'nearest']:
+                vals = vals.astype(np.int)
+
+            return vals
+
+        return self._get_cythonized_result('group_quantile', self.grouper,
+                                           aggregate=True,
+                                           needs_values=True,
+                                           needs_mask=True,
+                                           cython_dtype=np.float64,
+                                           pre_processing=pre_processor,
+                                           post_processing=post_processor,
+                                           q=q, interpolation=interpolation)
+
     @Substitution(name='groupby')
     def ngroup(self, ascending=True):
         """

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -694,6 +694,26 @@ def test_is_monotonic_decreasing(in_vals, out_vals):
 
 # describe
 # --------------------------------
+def test_describe():
+    df = DataFrame([
+        [1, 2, 'foo'],
+        [1, np.nan, 'bar'],
+        [3, np.nan, 'baz']
+    ], columns=['A', 'B', 'C'])
+    grp = df.groupby('A')
+
+    index = pd.Index([1, 3], name='A')
+    columns = pd.MultiIndex.from_product([
+        ['B'], ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']])
+
+    expected = pd.DataFrame([
+        [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+        [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
+    ], index=index, columns=columns)
+
+    result = grp.describe()
+    tm.assert_frame_equal(result, expected)
+
 
 def test_apply_describe_bug(mframe):
     grouped = mframe.groupby(level='first')
@@ -1055,6 +1075,55 @@ def test_size(df):
     tm.assert_series_equal(df.groupby('A').size(), out)
 
 
+# quantile
+# --------------------------------
+@pytest.mark.parametrize("interpolation", [
+    "linear", "lower", "higher", "nearest", "midpoint"])
+@pytest.mark.parametrize("a_vals,b_vals", [
+    # Ints
+    ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
+    ([1, 2, 3, 4], [4, 3, 2, 1]),
+    ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
+    # Floats
+    ([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]),
+    # Missing data
+    ([1., np.nan, 3., np.nan, 5.], [5., np.nan, 3., np.nan, 1.]),
+    ([np.nan, 4., np.nan, 2., np.nan], [np.nan, 4., np.nan, 2., np.nan]),
+    # Timestamps
+    ([x for x in pd.date_range('1/1/18', freq='D', periods=5)],
+     [x for x in pd.date_range('1/1/18', freq='D', periods=5)][::-1]),
+    # All NA
+    ([np.nan] * 5, [np.nan] * 5),
+])
+@pytest.mark.parametrize('q', [0, .25, .5, .75, 1])
+def test_quantile(interpolation, a_vals, b_vals, q):
+    if interpolation == 'nearest' and q == 0.5 and b_vals == [4, 3, 2, 1]:
+        pytest.skip("Unclear numpy expectation for nearest result with "
+                    "equidistant data")
+
+    a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
+    b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
+
+    df = pd.DataFrame({
+        'key': ['a'] * len(a_vals) + ['b'] * len(b_vals),
+        'val': a_vals + b_vals})
+
+    expected = DataFrame([a_expected, b_expected], columns=['val'],
+                         index=Index(['a', 'b'], name='key'))
+    result = df.groupby('key').quantile(q, interpolation=interpolation)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_quantile_raises():
+    df = pd.DataFrame([
+        ['foo', 'a'], ['foo', 'b'], ['foo', 'c']], columns=['key', 'val'])
+
+    with tm.assert_raises_regex(TypeError, "cannot be performed against "
+                                "'object' dtypes"):
+        df.groupby('key').quantile()
+
+
 # pipe
 # --------------------------------
 

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -222,13 +222,13 @@ def f(x, q=None, axis=0):
     agg_result = df_grouped.agg(np.percentile, 80, axis=0)
     apply_result = df_grouped.apply(DataFrame.quantile, .8)
     expected = df_grouped.quantile(.8)
-    assert_frame_equal(apply_result, expected)
+    assert_frame_equal(apply_result, expected, check_names=False)
     assert_frame_equal(agg_result, expected, check_names=False)
 
     agg_result = df_grouped.agg(f, q=80)
     apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
     assert_frame_equal(agg_result, expected, check_names=False)
-    assert_frame_equal(apply_result, expected)
+    assert_frame_equal(apply_result, expected, check_names=False)
 
 
 def test_len():