diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e799d308c1f4a..bfbfb97353fa4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -119,16 +119,18 @@ def value_counts(values, sort=True, ascending=False): return result -def rank(values, axis=0, method='average', na_option='keep'): +def rank(values, axis=0, method='average', na_option='keep', + ascending=True): """ """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) - ranks = f(values, ties_method=method) + ranks = f(values, ties_method=method, ascending=ascending) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) - ranks = f(values, axis=axis, ties_method=method) + ranks = f(values, axis=axis, ties_method=method, + ascending=ascending) return ranks diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a14cd65cd9667..b1345a7b9090d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3731,7 +3731,7 @@ def clip_lower(self, threshold): return self.apply(lambda x: x.clip_lower(threshold)) def rank(self, axis=0, numeric_only=None, method='average', - na_option='keep'): + na_option='keep', ascending=True): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values @@ -3749,6 +3749,8 @@ def rank(self, axis=0, numeric_only=None, method='average', first: ranks assigned in order they appear in the array na_option : {'keep'} keep: leave NA values where they are + ascending : boolean, default True + False for ranks by high (1) to low (N) Returns ------- @@ -3758,7 +3760,8 @@ def rank(self, axis=0, numeric_only=None, method='average', if numeric_only is None: try: - ranks = rank(self.values, axis=axis, method=method) + ranks = rank(self.values, axis=axis, method=method, + ascending=ascending) return DataFrame(ranks, index=self.index, columns=self.columns) except TypeError: numeric_only = True @@ -3767,7 +3770,8 @@ def rank(self, axis=0, numeric_only=None, method='average', data = self._get_numeric_data() else: data = self - ranks = rank(data.values, axis=axis, method=method) + ranks = rank(data.values, axis=axis, method=method, + ascending=ascending) return DataFrame(ranks, index=data.index, columns=data.columns) #---------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index aedcab20c196e..ea2dd044ec29b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1576,7 +1576,7 @@ def argsort(self, axis=0, kind='quicksort', order=None): return Series(np.argsort(values, kind=kind), index=self.index, name=self.name) - def rank(self, method='average', na_option='keep'): + def rank(self, method='average', na_option='keep', ascending=True): """ Compute data ranks (1 through n). Equal values are assigned a rank that is the average of the ranks of those values @@ -1590,13 +1590,16 @@ def rank(self, method='average', na_option='keep'): first: ranks assigned in order they appear in the array na_option : {'keep'} keep: leave NA values where they are + ascending : boolean, default True + False for ranks by high (1) to low (N) Returns ------- ranks : Series """ from pandas.core.algorithms import rank - ranks = rank(self.values, method=method, na_option=na_option) + ranks = rank(self.values, method=method, na_option=na_option, + ascending=ascending) return Series(ranks, index=self.index, name=self.name) def order(self, na_last=True, ascending=True, kind='mergesort'): diff --git a/pandas/src/sandbox.pyx b/pandas/src/sandbox.pyx index 5846ea04fc2ef..df4e2a0b7f0fb 100644 --- a/pandas/src/sandbox.pyx +++ b/pandas/src/sandbox.pyx @@ -269,3 +269,18 @@ cdef extern from "math.h": double fabs(double) cdef float64_t FP_ERR = 1e-13 + +cimport util + +cdef: + int TIEBREAK_AVERAGE = 0 + int TIEBREAK_MIN = 1 + int TIEBREAK_MAX = 2 + int TIEBREAK_FIRST = 3 + +tiebreakers = { + 'average' : TIEBREAK_AVERAGE, + 'min' : TIEBREAK_MIN, + 'max' : TIEBREAK_MAX, + 'first' : TIEBREAK_FIRST +} diff --git a/pandas/src/stats.pyx b/pandas/src/stats.pyx index 521eea51347ab..f4d87f411a97e 100644 --- a/pandas/src/stats.pyx +++ b/pandas/src/stats.pyx @@ -7,6 +7,7 @@ cdef: int TIEBREAK_MIN = 1 int TIEBREAK_MAX = 2 int TIEBREAK_FIRST = 3 + int TIEBREAK_FIRST_DESCENDING = 4 tiebreakers = { 'average' : TIEBREAK_AVERAGE, @@ -15,7 +16,61 @@ tiebreakers = { 'first' : TIEBREAK_FIRST } -def rank_1d_float64(object in_arr, ties_method='average'): + +# ctypedef fused pvalue_t: +# float64_t +# int64_t +# object + +# from cython cimport floating, integral + +cdef _take_2d_float64(ndarray[float64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[float64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_int64(ndarray[int64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[int64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_object(ndarray[object, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[object, ndim=2] result + object val + + N, K = ( values).shape + result = values.copy() + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + + +def rank_1d_float64(object in_arr, ties_method='average', ascending=True): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -24,7 +79,6 @@ def rank_1d_float64(object in_arr, ties_method='average'): Py_ssize_t i, j, n, dups = 0 ndarray[float64_t] sorted_data, ranks, values ndarray[int64_t] argsorted - int32_t idx float64_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 @@ -32,7 +86,10 @@ def rank_1d_float64(object in_arr, ties_method='average'): values = np.asarray(in_arr).copy() - nan_value = np.inf + if ascending: + nan_value = np.inf + else: + nan_value = -np.inf mask = np.isnan(values) np.putmask(values, mask, nan_value) @@ -40,7 +97,17 @@ def rank_1d_float64(object in_arr, ties_method='average'): ranks = np.empty(n, dtype='f8') # py2.5/win32 hack, can't pass i8 - _as = values.argsort() + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + sorted_data = values.take(_as) argsorted = _as.astype('i8') @@ -64,10 +131,14 @@ def rank_1d_float64(object in_arr, ties_method='average'): elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 sum_ranks = dups = 0 return ranks -def rank_1d_int64(object in_arr, ties_method='average'): + +def rank_1d_int64(object in_arr, ties_method='average', ascending=True): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -77,7 +148,6 @@ def rank_1d_int64(object in_arr, ties_method='average'): ndarray[int64_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted - int32_t idx int64_t val float64_t sum_ranks = 0 int tiebreak = 0 @@ -89,7 +159,17 @@ def rank_1d_int64(object in_arr, ties_method='average'): ranks = np.empty(n, dtype='f8') # py2.5/win32 hack, can't pass i8 - _as = values.argsort() + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + sorted_data = values.take(_as) argsorted = _as.astype('i8') @@ -97,7 +177,7 @@ def rank_1d_int64(object in_arr, ties_method='average'): sum_ranks += i + 1 dups += 1 val = sorted_data[i] - if i == n - 1 or sorted_data[i + 1] - val: + if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -110,10 +190,15 @@ def rank_1d_int64(object in_arr, ties_method='average'): elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 sum_ranks = dups = 0 return ranks -def rank_2d_float64(object in_arr, axis=0, ties_method='average'): + +def rank_2d_float64(object in_arr, axis=0, ties_method='average', + ascending=True): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -122,7 +207,6 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average'): Py_ssize_t i, j, z, k, n, dups = 0 ndarray[float64_t, ndim=2] ranks, values ndarray[int64_t, ndim=2] argsorted - int32_t idx float64_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 @@ -135,13 +219,29 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average'): else: values = in_arr.copy() - nan_value = np.inf + if ascending: + nan_value = np.inf + else: + nan_value = -np.inf + np.putmask(values, np.isnan(values), nan_value) n, k = ( values).shape ranks = np.empty((n, k), dtype='f8') - argsorted = values.argsort(1).astype('i8') - values.sort(axis=1) + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_float64(values, _as) + argsorted = _as.astype('i8') for i in range(n): dups = sum_ranks = 0 @@ -165,6 +265,77 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average'): elif tiebreak == TIEBREAK_FIRST: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + sum_ranks = dups = 0 + + if axis == 0: + return ranks.T + else: + return ranks + + +def rank_2d_int64(object in_arr, axis=0, ties_method='average', + ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0 + ndarray[float64_t, ndim=2] ranks + ndarray[int64_t, ndim=2] argsorted + ndarray[int64_t, ndim=2, cast=True] values + int64_t val + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + if axis == 0: + values = np.asarray(in_arr).T + else: + values = np.asarray(in_arr) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_int64(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = 0 + for j in range(k): + sum_ranks += j + 1 + dups += 1 + val = values[i, j] + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 sum_ranks = dups = 0 if axis == 0: @@ -172,7 +343,9 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average'): else: return ranks -def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'): + +def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', + ascending=True): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -182,7 +355,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'): ndarray[float64_t] ranks ndarray sorted_data, values ndarray[int64_t] argsorted - int32_t idx object val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 @@ -193,7 +365,12 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'): if values.dtype != np.object_: values = values.astype('O') - nan_value = Infinity() # always greater than everything + if ascending: + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + mask = isnullobj(values) np.putmask(values, mask, nan_value) @@ -208,10 +385,15 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'): raise valid_locs = (-mask).nonzero()[0] - ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0)) + ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0, + ties_method=ties_method, + ascending=ascending)) np.putmask(ranks, mask, np.nan) return ranks + if not ascending: + _as = _as[::-1] + sorted_data = values.take(_as) argsorted = _as.astype('i8') @@ -235,8 +417,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') - # for j in range(i - dups + 1, i + 1): - # ranks[argsorted[j]] = j + 1 sum_ranks = dups = 0 return ranks @@ -246,19 +426,31 @@ cdef inline are_diff(object left, object right): except TypeError: return left != right +_return_false = lambda self, other: False +_return_true = lambda self, other: True + class Infinity(object): - return_false = lambda self, other: False - return_true = lambda self, other: True - __lt__ = return_false - __le__ = return_false - __eq__ = return_false - __ne__ = return_true - __gt__ = return_true - __ge__ = return_true - __cmp__ = return_false - -def rank_2d_generic(object in_arr, axis=0, ties_method='average'): + __lt__ = _return_false + __le__ = _return_false + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_true + __ge__ = _return_true + __cmp__ = _return_false + +class NegInfinity(object): + + __lt__ = _return_true + __le__ = _return_true + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_false + __ge__ = _return_false + __cmp__ = _return_true + +def rank_2d_generic(object in_arr, axis=0, ties_method='average', + ascending=True): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -268,7 +460,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average'): ndarray[float64_t, ndim=2] ranks ndarray[object, ndim=2] values ndarray[int64_t, ndim=2] argsorted - int32_t idx object val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 @@ -284,7 +475,12 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average'): if values.dtype != np.object_: values = values.astype('O') - nan_value = Infinity() # always greater than everything + if ascending: + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + mask = isnullobj2d(values) np.putmask(values, mask, nan_value) @@ -292,17 +488,23 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average'): ranks = np.empty((n, k), dtype='f8') try: - argsorted = values.argsort(1).astype('i8') + _as = values.argsort(1) except TypeError: values = in_arr for i in range(len(values)): - ranks[i] = rank_1d_generic(in_arr[i]) + ranks[i] = rank_1d_generic(in_arr[i], + ties_method=ties_method, + ascending=ascending) if axis == 0: return ranks.T else: return ranks - values.sort(axis=1) + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_object(values, _as) + argsorted = _as.astype('i8') for i in range(n): dups = sum_ranks = infs = 0 @@ -325,12 +527,24 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average'): for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for non-numeric data') - # for z in range(j - dups + 1, j + 1): - # ranks[i, argsorted[i, z]] = z + 1 + raise ValueError('first not supported for ' + 'non-numeric data') sum_ranks = dups = 0 if axis == 0: return ranks.T else: return ranks + +# def _take_indexer_2d(ndarray[float64_t, ndim=2] values, +# ndarray[Py_ssize_t, ndim=2, cast=True] indexer): +# cdef: +# Py_ssize_t i, j, N, K +# ndarray[float64_t, ndim=2] result + +# N, K = ( values).shape +# result = np.empty_like(values) +# for i in range(N): +# for j in range(K): +# result[i, j] = values[i, indexer[i, j]] +# return result diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 63a1121517b40..820cba29bdf15 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -2,33 +2,38 @@ import unittest from numpy import nan +import numpy as np -from pandas import * +from pandas import Series, DataFrame from pandas.util.compat import product from pandas.util.testing import (assert_frame_equal, + assert_series_equal, assert_almost_equal) -class TestStats(unittest.TestCase): +class TestRank(unittest.TestCase): + + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + df = DataFrame({'A': s, 'B': s}) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) + } def test_rank_tie_methods(self): - s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + s = self.s def _check(s, expected, method='average'): result = s.rank(method=method) assert_almost_equal(result, expected) - results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, - 3.5, 1.5, 8.0, nan, 5.5]), - 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) - } - dtypes = [None, object] - disabled = set([(object, 'first')]) + results = self.results for method, dtype in product(results, dtypes): if (dtype, method) in disabled: @@ -36,9 +41,46 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + s = self.s.dropna() + df = self.df.dropna() + else: + s = self.s.astype(dtype) + df = self.df.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + assert_series_equal(res, expected) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + assert_series_equal(res2, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) + def test_rank_2d_tie_methods(self): - s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) - df = DataFrame({'A': s, 'B': s}) + s = self.s + df = self.df def _check2d(df, expected, method='average', axis=0): exp_df = DataFrame({'A': expected, 'B': expected}) @@ -50,17 +92,9 @@ def _check2d(df, expected, method='average', axis=0): result = df.rank(method=method, axis=axis) assert_frame_equal(result, exp_df) - results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, - 3.5, 1.5, 8.0, nan, 5.5]), - 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) - } - dtypes = [None, object] - disabled = set([(object, 'first')]) + results = self.results for method, axis, dtype in product(results, [0, 1], dtypes): if (dtype, method) in disabled: @@ -68,6 +102,15 @@ def _check2d(df, expected, method='average', axis=0): frame = df if dtype is None else df.astype(dtype) _check2d(frame, results[method], method=method, axis=axis) + def test_rank_int(self): + s = self.s.dropna().astype('i8') + + for method, res in self.results.iteritems(): + result = s.rank(method=method) + expected = Series(res).dropna() + expected.index = result.index + assert_series_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False)