Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: rank na_options top and bottom #1508 #2159

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,11 +191,12 @@ def rank(values, axis=0, method='average', na_option='keep',
"""
if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
ranks = f(values, ties_method=method, ascending=ascending)
ranks = f(values, ties_method=method, ascending=ascending,
na_option=na_option)
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(values, axis=axis, ties_method=method,
ascending=ascending)
ascending=ascending, na_option=na_option)
return ranks


Expand Down
8 changes: 5 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4704,8 +4704,10 @@ def rank(self, axis=0, numeric_only=None, method='average',
min: lowest rank in group
max: highest rank in group
first: ranks assigned in order they appear in the array
na_option : {'keep'}
na_option : {'keep', 'top', 'bottom'}
keep: leave NA values where they are
top: NA values receive the smallest ranks (ascending or descending)
bottom: NA values receive the largest ranks (ascending or descending)
ascending : boolean, default True
False for ranks by high (1) to low (N)

Expand All @@ -4716,7 +4718,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
if numeric_only is None:
try:
ranks = algos.rank(self.values, axis=axis, method=method,
ascending=ascending)
ascending=ascending, na_option=na_option)
return DataFrame(ranks, index=self.index, columns=self.columns)
except TypeError:
numeric_only = True
Expand All @@ -4726,7 +4728,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
else:
data = self
ranks = algos.rank(data.values, axis=axis, method=method,
ascending=ascending)
ascending=ascending, na_option=na_option)
return DataFrame(ranks, index=data.index, columns=data.columns)

def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
Expand Down
30 changes: 16 additions & 14 deletions pandas/src/stats.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
return result


def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -86,7 +87,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):

values = np.asarray(in_arr).copy()

if ascending:
if ascending ^ (na_option == 'top'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice w/ the xor

nan_value = np.inf
else:
nan_value = -np.inf
Expand Down Expand Up @@ -115,7 +116,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
sum_ranks += i + 1
dups += 1
val = sorted_data[i]
if val == nan_value:
if (val == nan_value) and (na_option == 'keep'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will dog performance pretty badly. i'll merge and then tweak this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it's pretty minor (only about a 10% penalty in a 1mm-length Series that's 50% NA)

ranks[argsorted[i]] = nan
continue
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
Expand All @@ -138,7 +139,8 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
return ranks


def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand Down Expand Up @@ -198,7 +200,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True):


def rank_2d_float64(object in_arr, axis=0, ties_method='average',
ascending=True):
ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -219,7 +221,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
else:
values = in_arr.copy()

if ascending:
if ascending ^ (na_option == 'top'):
nan_value = np.inf
else:
nan_value = -np.inf
Expand Down Expand Up @@ -249,7 +251,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
sum_ranks += j + 1
dups += 1
val = values[i, j]
if val == nan_value:
if val == nan_value and na_option == 'keep':
ranks[i, argsorted[i, j]] = nan
continue
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
Expand Down Expand Up @@ -277,7 +279,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',


def rank_2d_int64(object in_arr, axis=0, ties_method='average',
ascending=True):
ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand Down Expand Up @@ -345,7 +347,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',


def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
ascending=True):
ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -365,7 +367,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
if values.dtype != np.object_:
values = values.astype('O')

if ascending:
if ascending ^ (na_option == 'top'):
# always greater than everything
nan_value = Infinity()
else:
Expand Down Expand Up @@ -401,7 +403,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
sum_ranks += i + 1
dups += 1
val = util.get_value_at(sorted_data, i)
if val is nan_value:
if val is nan_value and na_option=='keep':
ranks[argsorted[i]] = nan
continue
if (i == n - 1 or
Expand Down Expand Up @@ -450,7 +452,7 @@ class NegInfinity(object):
__cmp__ = _return_true

def rank_2d_generic(object in_arr, axis=0, ties_method='average',
ascending=True):
ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
Expand All @@ -475,7 +477,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
if values.dtype != np.object_:
values = values.astype('O')

if ascending:
if ascending ^ (na_option == 'top'):
# always greater than everything
nan_value = Infinity()
else:
Expand Down Expand Up @@ -510,7 +512,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
dups = sum_ranks = infs = 0
for j in range(k):
val = values[i, j]
if val is nan_value:
if val is nan_value and na_option == 'keep':
ranks[i, argsorted[i, j]] = nan
infs += 1
continue
Expand Down
67 changes: 67 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6444,6 +6444,73 @@ def test_rank2(self):
expected = self.mixed_frame.rank(1, numeric_only=True)
assert_frame_equal(result, expected)

def test_rank_na_option(self):
    # Compare DataFrame.rank(na_option='top'/'bottom'), ascending and
    # descending, along both axes, against rankdata applied to a copy of
    # the frame whose NaNs were replaced by a sentinel value.
    from pandas.compat.scipy import rankdata

    # Punch NaN holes into each column at a different stride.
    for column, stride in (('A', 2), ('B', 3), ('C', 4), ('D', 5)):
        self.frame[column][::stride] = np.nan

    def nan_as_inf():
        # NaN -> +inf; the same array serves as reference for both axes.
        filled = self.frame.fillna(np.inf).values
        return filled, filled

    def nan_below_min():
        # NaN -> (minimum - 1), taken per column for the axis-0 reference
        # and per row (via a transpose round-trip) for the axis-1 one.
        by_column = self.frame.fillna(
            (self.frame.min() - 1).to_dict()).values
        flipped = self.frame.T
        by_row = flipped.fillna((flipped.min() - 1).to_dict()).T
        # Whatever is still NaN after the per-row fill becomes +inf
        # (presumably rows with no valid minimum -- TODO confirm).
        by_row = by_row.fillna(np.inf).values
        return by_column, by_row

    # (rank keyword arguments, reference builder, negate the reference?)
    # A negated reference makes rankdata order values from high to low,
    # which is how the descending ranks are modelled.
    scenarios = (
        # ascending, 'bottom': reference has NaN as +inf
        ({'na_option': 'bottom'}, nan_as_inf, False),
        # ascending, 'top': reference has NaN below the minimum
        ({'na_option': 'top'}, nan_below_min, False),
        # descending, 'top': reference has NaN as -inf after negation
        ({'na_option': 'top', 'ascending': False}, nan_as_inf, True),
        # descending, 'bottom': reference has NaN above the negated values
        ({'na_option': 'bottom', 'ascending': False}, nan_below_min, True),
    )

    for rank_kwargs, build_reference, negate in scenarios:
        got_axis0 = self.frame.rank(**rank_kwargs)
        got_axis1 = self.frame.rank(1, **rank_kwargs)

        ref_axis0, ref_axis1 = build_reference()
        if negate:
            ref_axis0 = -ref_axis0
            ref_axis1 = -ref_axis1

        want_axis0 = np.apply_along_axis(rankdata, 0, ref_axis0)
        want_axis1 = np.apply_along_axis(rankdata, 1, ref_axis1)

        assert_almost_equal(got_axis0.values, want_axis0)
        assert_almost_equal(got_axis1.values, want_axis1)


def test_describe(self):
desc = self.tsframe.describe()
desc = self.mixed_frame.describe()
Expand Down