Skip to content

Commit

Permalink
ENH: add other tie-breaking methods to rank Cython routines; still some testing needed in generic case, GH #874
Browse files: browse the repository at this point in the history
  • Loading branch information
wesm committed Mar 11, 2012
1 parent a262f30 commit 1627631
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 19 deletions.
9 changes: 6 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3730,7 +3730,7 @@ def clip_lower(self, threshold):
"""
return self.apply(lambda x: x.clip_lower(threshold))

def rank(self, axis=0, numeric_only=None):
def rank(self, axis=0, numeric_only=None, method='average'):
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
assigned a rank that is the average of the ranks of those values
Expand All @@ -3750,9 +3750,12 @@ def rank(self, axis=0, numeric_only=None):
try:
values = self.values
if issubclass(values.dtype.type, np.floating):
ranks = lib.rank_2d_float64(values, axis=axis)
ranks = lib.rank_2d_float64(values, axis=axis,
ties_method=method)
else:
ranks = lib.rank_2d_generic(values, axis=axis)
ranks = lib.rank_2d_generic(values, axis=axis,
ties_method=method)
return DataFrame(ranks, index=self.index, columns=self.columns)
except TypeError:
numeric_only = True

Expand Down
11 changes: 8 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,19 +1576,24 @@ def argsort(self, axis=0, kind='quicksort', order=None):
return Series(np.argsort(values, kind=kind), index=self.index,
name=self.name)

def rank(self, method='average', na_option='keep'):
    """
    Compute data ranks (1 through n). Groups of equal values are ranked
    according to `method`.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'first'}
        How tied values are ranked:

        * average: every tied value gets the average of the ranks of
          those values
        * min: every tied value gets the lowest rank of the group
        * max: every tied value gets the highest rank of the group
        * first: each tied value gets its own rank, assigned in sort
          order
    na_option : {'keep'}
        Only 'keep' is supported; any other value raises ValueError.

    Returns
    -------
    ranks : Series
        Same index and name as the caller.

    Raises
    ------
    ValueError
        If `na_option` is not 'keep'.
    """
    # na_option is not forwarded to the ranking routines, so reject values
    # we cannot honour instead of silently ignoring them.  Done before the
    # try block so the broad except below cannot swallow it.
    if na_option != 'keep':
        raise ValueError("na_option must be 'keep', got %r" % (na_option,))

    try:
        ranks = lib.rank_1d_float64(self.values, ties_method=method)
    except Exception:
        # Fall back to the generic (object) routine when the float64 fast
        # path cannot handle the values.
        ranks = lib.rank_1d_generic(self.values, ties_method=method)
    return Series(ranks, index=self.index, name=self.name)

def order(self, na_last=True, ascending=True, kind='mergesort'):
Expand Down
86 changes: 73 additions & 13 deletions pandas/src/stats.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,20 @@ cdef float64_t FP_ERR = 1e-13

cimport util

def rank_1d_float64(object in_arr):
# Integer codes for the supported tie-breaking strategies.  The rank_*
# routines compare their local `tiebreak` value against these constants.
cdef:
int TIEBREAK_AVERAGE = 0
int TIEBREAK_MIN = 1
int TIEBREAK_MAX = 2
int TIEBREAK_FIRST = 3

# Maps the `ties_method` string given by callers to its integer code.  The
# rank_* routines index this dict directly (tiebreakers[ties_method]), so an
# unrecognised method name surfaces as a KeyError.
tiebreakers = {
'average' : TIEBREAK_AVERAGE,
'min' : TIEBREAK_MIN,
'max' : TIEBREAK_MAX,
'first' : TIEBREAK_FIRST
}

def rank_1d_float64(object in_arr, ties_method='average'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -14,6 +27,8 @@ def rank_1d_float64(object in_arr):
int32_t idx
float64_t val, nan_value
float64_t sum_ranks = 0
int tiebreak = 0
tiebreak = tiebreakers[ties_method]

values = np.asarray(in_arr).copy()

Expand All @@ -27,7 +42,6 @@ def rank_1d_float64(object in_arr):
# py2.5/win32 hack, can't pass i8
_as = values.argsort()
sorted_data = values.take(_as)

argsorted = _as.astype('i8')

for i in range(n):
Expand All @@ -38,12 +52,22 @@ def rank_1d_float64(object in_arr):
ranks[argsorted[i]] = nan
continue
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
elif tiebreak == TIEBREAK_MIN:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = i - dups + 2
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = i + 1
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = j + 1
sum_ranks = dups = 0
return ranks

def rank_2d_float64(object in_arr, axis=0):
def rank_2d_float64(object in_arr, axis=0, ties_method='average'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -55,6 +79,8 @@ def rank_2d_float64(object in_arr, axis=0):
int32_t idx
float64_t val, nan_value
float64_t sum_ranks = 0
int tiebreak = 0
tiebreak = tiebreakers[ties_method]

in_arr = np.asarray(in_arr)

Expand All @@ -81,16 +107,26 @@ def rank_2d_float64(object in_arr, axis=0):
ranks[i, argsorted[i, j]] = nan
continue
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = sum_ranks / dups
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = sum_ranks / dups
elif tiebreak == TIEBREAK_MIN:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = j - dups + 2
elif tiebreak == TIEBREAK_MAX:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = j + 1
elif tiebreak == TIEBREAK_FIRST:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = z + 1
sum_ranks = dups = 0

if axis == 0:
return ranks.T
else:
return ranks

def rank_1d_generic(object in_arr, bint retry=1):
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -103,6 +139,8 @@ def rank_1d_generic(object in_arr, bint retry=1):
int32_t idx
object val, nan_value
float64_t sum_ranks = 0
int tiebreak = 0
tiebreak = tiebreakers[ties_method]

values = np.array(in_arr, copy=True)

Expand Down Expand Up @@ -140,8 +178,18 @@ def rank_1d_generic(object in_arr, bint retry=1):
continue
if (i == n - 1 or
are_diff(util.get_value_at(sorted_data, i + 1), val)):
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
elif tiebreak == TIEBREAK_MIN:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = i - dups + 2
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = i + 1
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = j + 1
sum_ranks = dups = 0
return ranks

Expand All @@ -163,7 +211,7 @@ class Infinity(object):
__ge__ = return_true
__cmp__ = return_false

def rank_2d_generic(object in_arr, axis=0):
def rank_2d_generic(object in_arr, axis=0, ties_method='average'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -176,6 +224,8 @@ def rank_2d_generic(object in_arr, axis=0):
int32_t idx
object val, nan_value
float64_t sum_ranks = 0
int tiebreak = 0
tiebreak = tiebreakers[ties_method]

in_arr = np.asarray(in_arr)

Expand Down Expand Up @@ -218,8 +268,18 @@ def rank_2d_generic(object in_arr, axis=0):
sum_ranks += (j - infs) + 1
dups += 1
if j == k - 1 or are_diff(values[i, j + 1], val):
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = sum_ranks / dups
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = sum_ranks / dups
elif tiebreak == TIEBREAK_MIN:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = j - dups + 2
elif tiebreak == TIEBREAK_MAX:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = j + 1
elif tiebreak == TIEBREAK_FIRST:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = z + 1
sum_ranks = dups = 0

if axis == 0:
Expand Down
67 changes: 67 additions & 0 deletions pandas/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import nose
import unittest

from numpy import nan

from pandas import *

from pandas.util.compat import product
from pandas.util.testing import (assert_frame_equal,
assert_almost_equal)

class TestStats(unittest.TestCase):
    """Tie-breaking behaviour of ``rank`` on Series and DataFrame."""

    def test_rank_tie_methods(self):
        """Each ``method`` gives the expected ranks; NaN inputs stay NaN."""
        # R equivalent of the input: c(1, 3, 4, 2, NA, 2, 1, 5, NA, 3)
        series = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])

        # Default method is 'average'.
        averaged = np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5])
        assert_almost_equal(series.rank(), averaged)

        lowest = np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5])
        assert_almost_equal(series.rank(method='min'), lowest)

        highest = np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6])
        assert_almost_equal(series.rank(method='max'), highest)

        in_order = np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6])
        assert_almost_equal(series.rank(method='first'), in_order)

    def test_rank_2d_tie_methods(self):
        """DataFrame.rank matches the 1-d expectations along both axes."""
        column = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
        frame = DataFrame({'A': column, 'B': column})

        def _check2d(df, expected, method='average', axis=0):
            wanted = DataFrame({'A': expected, 'B': expected})

            # Ranking along axis 1 is checked on the transposed frame so
            # the same per-column expectations apply.
            if axis == 1:
                df, wanted = df.T, wanted.T

            assert_frame_equal(df.rank(method=method, axis=axis), wanted)

        results = {
            'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                                 3.5, 1.5, 8.0, nan, 5.5]),
            'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
            'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
            'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6])
        }

        # Only the frame's native dtype is exercised for now (None means
        # "do not cast").
        dtypes = [None]

        for method, axis, dtype in product(results, [0, 1], dtypes):
            if dtype is None:
                candidate = frame
            else:
                candidate = frame.astype(dtype)
            _check2d(candidate, results[method], method=method, axis=axis)

if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script:
    # verbose and uncaptured output, stop at the first failure, and open
    # pdb on errors and failures.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)

22 changes: 22 additions & 0 deletions vb_suite/stat_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,25 @@
Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1,
start_date=datetime(2011, 11, 15))

#----------------------------------------------------------------------
# rank

# 1-d case: a 300000-element Series built from two identical integer ranges
# (so every integer value occurs twice, i.e. is tied) around a block of
# random normals.
setup = common_setup + """
values = np.concatenate([np.arange(100000),
np.random.randn(100000),
np.arange(100000)])
s = Series(values)
"""

# Times Series.rank with its default arguments.
stats_rank_average = Benchmark('s.rank()', setup,
start_date=datetime(2011, 12, 12))

# 2-d case: a 5000 x 50 DataFrame of random normals, shared by the two
# benchmarks below.
setup = common_setup + """
df = DataFrame(np.random.randn(5000, 50))
"""

# Times DataFrame.rank along axis 1.
stats_rank2d_axis1_average = Benchmark('df.rank(1)', setup,
start_date=datetime(2011, 12, 12))

# Times DataFrame.rank with its default axis.
stats_rank2d_axis0_average = Benchmark('df.rank()', setup,
start_date=datetime(2011, 12, 12))

0 comments on commit 1627631

Please sign in to comment.