ENH: refactoring for #874, fast int64 rank

pandas-dev · Mar 12, 2012 · 24d7b02 · 24d7b02
1 parent 9c0dc26
commit 24d7b02
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 23 deletions.
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -71,7 +71,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     Returns
     -------
     """
-    hash_klass, values = _get_hash_table_and_cast(values)
+    hash_klass, values = _get_data_algo(values, _hashtables)
 
     uniques = []
     table = hash_klass(len(values))
@@ -119,19 +119,47 @@ def value_counts(values, sort=True, ascending=False):
 
     return result
 
+def rank(values, axis=0, method='average', na_option='keep'):
+    """
+    Compute ranks of the entries of a 1-D or 2-D array, choosing the ranking
+    routine from _rank1d_functions / _rank2d_functions according to the
+    dtype of values
+
+    Parameters
+    ----------
+    values : ndarray
+        One- or two-dimensional array to rank
+    axis : int, default 0
+        Axis passed to the 2-D ranking routine; not used for 1-D input
+    method : string, default 'average'
+        Passed to the ranking routine as ``ties_method``
+    na_option : string, default 'keep'
+        Accepted so the signature lines up with Series.rank and
+        DataFrame.rank; it is not forwarded to the ranking routines here
+
+    Returns
+    -------
+    ranks : result of the selected ranking routine
+
+    Raises
+    ------
+    ValueError
+        If values is neither 1- nor 2-dimensional
+    """
+    if values.ndim == 1:
+        f, values = _get_data_algo(values, _rank1d_functions)
+        ranks = f(values, ties_method=method)
+    elif values.ndim == 2:
+        f, values = _get_data_algo(values, _rank2d_functions)
+        ranks = f(values, axis=axis, ties_method=method)
+    else:
+        # without this branch ``ranks`` would be unbound at the return below
+        raise ValueError('rank only supports 1-d and 2-d input, got %d-d'
+                         % values.ndim)
+    return ranks
 
-def _get_hash_table_and_cast(values):
+
+def _get_data_algo(values, func_map):
+    """
+    Select the entry of func_map that matches the dtype of values and cast
+    values with the corresponding com._ensure_* helper
+
+    Parameters
+    ----------
+    values : array-like
+    func_map : dict
+        Maps 'float64', 'int64' and 'generic' to a function or class. The
+        'int64' entry is optional: integer data is routed to 'generic'
+        when it is absent
+
+    Returns
+    -------
+    (f, values) : tuple
+        The selected entry and values after com._ensure_float64,
+        com._ensure_int64 or com._ensure_object respectively
+    """
     if com.is_float_dtype(values):
-        klass = lib.Float64HashTable
+        f = func_map['float64']
         values = com._ensure_float64(values)
-    elif com.is_integer_dtype(values):
-        klass = lib.Int64HashTable
+    elif com.is_integer_dtype(values) and 'int64' in func_map:
+        f = func_map['int64']
         values = com._ensure_int64(values)
     else:
-        klass = lib.PyObjectHashTable
+        # also reached for integer data when func_map has no 'int64' entry,
+        # so such a map does not fail with a KeyError
+        f = func_map['generic']
         values = com._ensure_object(values)
-    return klass, values
-
+    return f, values
+
+# Lookup tables keyed by dtype name ('float64', 'int64', 'generic'); they are
+# handed to _get_data_algo, which picks the entry matching the input dtype.
+
+# 1-D ranking routines
+_rank1d_functions = dict(
+    float64=lib.rank_1d_float64,
+    int64=lib.rank_1d_int64,
+    generic=lib.rank_1d_generic
+)
+
+# 2-D ranking routines; no integer-specialized routine is registered here
+_rank2d_functions = dict(
+    float64=lib.rank_2d_float64,
+    generic=lib.rank_2d_generic
+)
+
+# hash table classes
+_hashtables = dict(
+    float64=lib.Float64HashTable,
+    int64=lib.Int64HashTable,
+    generic=lib.PyObjectHashTable
+)
 
 def unique(values):
     """

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3730,7 +3730,8 @@ def clip_lower(self, threshold):
         """
         return self.apply(lambda x: x.clip_lower(threshold))
 
-    def rank(self, axis=0, numeric_only=None, method='average', na_option='keep'):
+    def rank(self, axis=0, numeric_only=None, method='average',
+             na_option='keep'):
         """
         Compute numerical data ranks (1 through n) along axis. Equal values are
         assigned a rank that is the average of the ranks of those values
@@ -3753,27 +3754,21 @@ def rank(self, axis=0, numeric_only=None, method='average', na_option='keep'):
         -------
         ranks : DataFrame
         """
+        from pandas.core.algorithms import rank
+
         if numeric_only is None:
             try:
-                values = self.values
-                if issubclass(values.dtype.type, np.floating):
-                    ranks = lib.rank_2d_float64(values, axis=axis,
-                                                ties_method=method)
-                else:
-                    ranks = lib.rank_2d_generic(values, axis=axis,
-                                                ties_method=method)
+                ranks = rank(self.values, axis=axis, method=method)
                 return DataFrame(ranks, index=self.index, columns=self.columns)
             except TypeError:
                 numeric_only = True
 
         if numeric_only:
             data = self._get_numeric_data()
-            ranks = lib.rank_2d_float64(data.values.astype('f8'), axis=axis)
-            return DataFrame(ranks, index=data.index, columns=data.columns)
         else:
             data = self
-            ranks = lib.rank_2d_generic(data.values.astype('O'), axis=axis)
-            return DataFrame(ranks, index=data.index, columns=data.columns)
+        ranks = rank(data.values, axis=axis, method=method)
+        return DataFrame(ranks, index=data.index, columns=data.columns)
 
     #----------------------------------------------------------------------
     # Plotting

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1595,10 +1595,8 @@ def rank(self, method='average', na_option='keep'):
         -------
         ranks : Series
         """
-        try:
-            ranks = lib.rank_1d_float64(self.values, ties_method=method)
-        except Exception:
-            ranks = lib.rank_1d_generic(self.values, ties_method=method)
+        from pandas.core.algorithms import rank
+        ranks = rank(self.values, method=method, na_option=na_option)
         return Series(ranks, index=self.index, name=self.name)
 
     def order(self, na_last=True, ascending=True, kind='mergesort'):

diff --git a/pandas/src/stats.pyx b/pandas/src/stats.pyx
@@ -67,6 +67,52 @@ def rank_1d_float64(object in_arr, ties_method='average'):
             sum_ranks = dups = 0
     return ranks
 
+def rank_1d_int64(object in_arr, ties_method='average'):
+    """
+    Fast version of scipy.stats.rankdata for int64 data
+
+    Parameters
+    ----------
+    in_arr : array-like
+        Converted with np.asarray and bound to an int64 buffer
+    ties_method : string, default 'average'
+        Key looked up in the module-level ``tiebreakers`` mapping; selects
+        how a run of equal values is ranked (average / min / max / first)
+
+    Returns
+    -------
+    ranks : ndarray of float64
+        1-based rank for each input position
+    """
+
+    cdef:
+        Py_ssize_t i, j, n, dups = 0
+        ndarray[int64_t] sorted_data, values
+        ndarray[float64_t] ranks
+        ndarray[int64_t] argsorted
+        int64_t val
+        float64_t sum_ranks = 0
+        int tiebreak = 0
+    tiebreak = tiebreakers[ties_method]
+
+    values = np.asarray(in_arr)
+
+    n = len(values)
+    ranks = np.empty(n, dtype='f8')
+
+    # py2.5/win32 hack, can't pass i8
+    if tiebreak == TIEBREAK_FIRST:
+        # ranks within a tie follow the sort order, so the sort must be
+        # stable for equal values to keep their order of appearance
+        _as = values.argsort(kind='mergesort')
+    else:
+        _as = values.argsort()
+    sorted_data = values.take(_as)
+    argsorted = _as.astype('i8')
+
+    for i in range(n):
+        # sum_ranks / dups accumulate the 1-based positions and the length
+        # of the current run of equal values
+        sum_ranks += i + 1
+        dups += 1
+        val = sorted_data[i]
+        # end of a run: last element, or the next sorted value differs
+        # (compared directly rather than by subtraction, which can overflow)
+        if i == n - 1 or sorted_data[i + 1] != val:
+            if tiebreak == TIEBREAK_AVERAGE:
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = sum_ranks / dups
+            elif tiebreak == TIEBREAK_MIN:
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = i - dups + 2
+            elif tiebreak == TIEBREAK_MAX:
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = i + 1
+            elif tiebreak == TIEBREAK_FIRST:
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = j + 1
+            sum_ranks = dups = 0
+    return ranks
+
 def rank_2d_float64(object in_arr, axis=0, ties_method='average'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata

diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py
@@ -56,6 +56,14 @@
 stats_rank_average = Benchmark('s.rank()', setup,
                                start_date=datetime(2011, 12, 12))
 
+# Times s.rank() on a Series built from 200000 random integers drawn from
+# [0, 100000)
+setup = common_setup + """
+values = np.random.randint(0, 100000, size=200000)
+s = Series(values)
+"""
+
+stats_rank_average_int = Benchmark('s.rank()', setup,
+                                   start_date=datetime(2011, 12, 12))
+
 setup = common_setup + """
 df = DataFrame(np.random.randn(5000, 50))
 """