Skip to content

Commit

Permalink
ENH: refactoring for #874, fast int64 rank
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Mar 12, 2012
1 parent 9c0dc26 commit 24d7b02
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 23 deletions.
42 changes: 35 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
Returns
-------
"""
hash_klass, values = _get_hash_table_and_cast(values)
hash_klass, values = _get_data_algo(values, _hashtables)

uniques = []
table = hash_klass(len(values))
Expand Down Expand Up @@ -119,19 +119,47 @@ def value_counts(values, sort=True, ascending=False):

return result

def rank(values, axis=0, method='average', na_option='keep'):
    """
    Rank the entries of a 1- or 2-dimensional array, dispatching to the
    dtype-specific ranking routine selected by ``_get_data_algo``.

    Parameters
    ----------
    values : array-like exposing ``ndim``
        Data to rank; must be 1- or 2-dimensional
    axis : int, default 0
        Forwarded to the 2-d ranking routine; not used for 1-d input
    method : str, default 'average'
        Forwarded to the ranking routine as ``ties_method``
    na_option : str, default 'keep'
        Accepted for signature compatibility; not used by this function

    Returns
    -------
    ranks : whatever the selected ranking routine returns

    Raises
    ------
    ValueError
        If ``values`` is neither 1- nor 2-dimensional
    """
    ndim = values.ndim
    if ndim == 1:
        f, values = _get_data_algo(values, _rank1d_functions)
        return f(values, ties_method=method)
    if ndim == 2:
        f, values = _get_data_algo(values, _rank2d_functions)
        return f(values, axis=axis, ties_method=method)
    # Previously fell through to ``return ranks`` with ``ranks`` unbound,
    # surfacing as an opaque UnboundLocalError.
    raise ValueError('rank only supports 1- or 2-dimensional input, '
                     'got ndim=%d' % ndim)

def _get_hash_table_and_cast(values):

def _get_data_algo(values, func_map):
# Select the entry of ``func_map`` matching the dtype family of ``values``
# ('float64', 'int64' or 'generic') and cast ``values`` with the matching
# ``com._ensure_*`` helper; the final statement returns ``(f, values)``.
# NOTE(review): this listing is a flattened diff without +/- markers. The
# ``klass = ...`` assignments and ``return klass, values`` look like the
# removed lines of the old ``_get_hash_table_and_cast`` -- confirm against
# the repository before relying on them.
if com.is_float_dtype(values):
klass = lib.Float64HashTable
f = func_map['float64']
values = com._ensure_float64(values)
elif com.is_integer_dtype(values):
klass = lib.Int64HashTable
f = func_map['int64']
values = com._ensure_int64(values)
else:
klass = lib.PyObjectHashTable
f = func_map['generic']
values = com._ensure_object(values)
return klass, values

return f, values

# 1-d ranking routines keyed by the dtype family used for dispatch
_rank1d_functions = {
    'float64': lib.rank_1d_float64,
    'int64': lib.rank_1d_int64,
    'generic': lib.rank_1d_generic,
}

# 2-d ranking routines keyed by the dtype family used for dispatch.
# There is no dedicated int64 2-d routine here, so integer input is routed
# to the generic routine instead of failing the 'int64' lookup with a
# KeyError.
# NOTE(review): assumes lib.rank_2d_generic accepts int64 arrays -- confirm.
_rank2d_functions = {
    'float64': lib.rank_2d_float64,
    'int64': lib.rank_2d_generic,
    'generic': lib.rank_2d_generic,
}

# Hash table classes keyed by the dtype family used for dispatch
_hashtables = {
    'float64': lib.Float64HashTable,
    'int64': lib.Int64HashTable,
    'generic': lib.PyObjectHashTable,
}

def unique(values):
"""
Expand Down
19 changes: 7 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3730,7 +3730,8 @@ def clip_lower(self, threshold):
"""
return self.apply(lambda x: x.clip_lower(threshold))

def rank(self, axis=0, numeric_only=None, method='average', na_option='keep'):
def rank(self, axis=0, numeric_only=None, method='average',
na_option='keep'):
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
assigned a rank that is the average of the ranks of those values
Expand All @@ -3753,27 +3754,21 @@ def rank(self, axis=0, numeric_only=None, method='average', na_option='keep'):
-------
ranks : DataFrame
"""
from pandas.core.algorithms import rank

if numeric_only is None:
try:
values = self.values
if issubclass(values.dtype.type, np.floating):
ranks = lib.rank_2d_float64(values, axis=axis,
ties_method=method)
else:
ranks = lib.rank_2d_generic(values, axis=axis,
ties_method=method)
ranks = rank(self.values, axis=axis, method=method)
return DataFrame(ranks, index=self.index, columns=self.columns)
except TypeError:
numeric_only = True

if numeric_only:
data = self._get_numeric_data()
ranks = lib.rank_2d_float64(data.values.astype('f8'), axis=axis)
return DataFrame(ranks, index=data.index, columns=data.columns)
else:
data = self
ranks = lib.rank_2d_generic(data.values.astype('O'), axis=axis)
return DataFrame(ranks, index=data.index, columns=data.columns)
ranks = rank(data.values, axis=axis, method=method)
return DataFrame(ranks, index=data.index, columns=data.columns)

#----------------------------------------------------------------------
# Plotting
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1595,10 +1595,8 @@ def rank(self, method='average', na_option='keep'):
-------
ranks : Series
"""
try:
ranks = lib.rank_1d_float64(self.values, ties_method=method)
except Exception:
ranks = lib.rank_1d_generic(self.values, ties_method=method)
from pandas.core.algorithms import rank
ranks = rank(self.values, method=method, na_option=na_option)
return Series(ranks, index=self.index, name=self.name)

def order(self, na_last=True, ascending=True, kind='mergesort'):
Expand Down
46 changes: 46 additions & 0 deletions pandas/src/stats.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,52 @@ def rank_1d_float64(object in_arr, ties_method='average'):
sum_ranks = dups = 0
return ranks

def rank_1d_int64(object in_arr, ties_method='average'):
    """
    Fast version of scipy.stats.rankdata for 1-d int64 data

    Parameters
    ----------
    in_arr : array-like convertible to an int64 ndarray
    ties_method : key into ``tiebreakers``, default 'average'
        Selects how runs of equal values are ranked: the mean of their
        positions (average), the lowest position (min), the highest
        position (max), or their order in the sorted data (first)

    Returns
    -------
    ranks : float64 ndarray of 1-based ranks, same length as the input
    """

    cdef:
        Py_ssize_t i, j, n, dups = 0
        ndarray[int64_t] sorted_data, values
        ndarray[float64_t] ranks
        ndarray[int64_t] argsorted
        int64_t val
        float64_t sum_ranks = 0
        int tiebreak = 0
    tiebreak = tiebreakers[ties_method]

    values = np.asarray(in_arr)

    n = len(values)
    ranks = np.empty(n, dtype='f8')

    # py2.5/win32 hack, can't pass i8
    if tiebreak == TIEBREAK_FIRST:
        # 'first' ranks ties by order of appearance, which is only
        # guaranteed when the sort is stable
        _as = values.argsort(kind='mergesort')
    else:
        _as = values.argsort()
    sorted_data = values.take(_as)
    argsorted = _as.astype('i8')

    for i in range(n):
        # accumulate the 1-based positions of the current run of equal values
        sum_ranks += i + 1
        dups += 1
        val = sorted_data[i]
        # end of a run: last element, or the next sorted value differs
        if i == n - 1 or sorted_data[i + 1] != val:
            if tiebreak == TIEBREAK_AVERAGE:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = sum_ranks / dups
            elif tiebreak == TIEBREAK_MIN:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = i - dups + 2
            elif tiebreak == TIEBREAK_MAX:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = i + 1
            elif tiebreak == TIEBREAK_FIRST:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = j + 1
            sum_ranks = dups = 0
    return ranks

def rank_2d_float64(object in_arr, axis=0, ties_method='average'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
Expand Down
8 changes: 8 additions & 0 deletions vb_suite/stat_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@
stats_rank_average = Benchmark('s.rank()', setup,
start_date=datetime(2011, 12, 12))

# Benchmark s.rank() on a Series built from random integers
setup = common_setup + """
values = np.random.randint(0, 100000, size=200000)
s = Series(values)
"""

stats_rank_average_int = Benchmark('s.rank()', setup,
start_date=datetime(2011, 12, 12))

setup = common_setup + """
df = DataFrame(np.random.randn(5000, 50))
"""
Expand Down

0 comments on commit 24d7b02

Please sign in to comment.