BUG: Prevent uint64 overflow in Series.unique

Introduces a UInt64HashTable class to hash uint64 elements and prevent overflow in functions like Series.unique. Closes pandas-dev/pandas#14721.
forking-repos · Dec 20, 2016 · 380c580 · 380c580
1 parent 3ccb501
commit 380c580
Show file tree

Hide file tree

Showing 10 changed files with 59 additions and 7 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -258,4 +258,6 @@ Bug Fixes
 
 
 
+
+- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -25,6 +25,7 @@
                                  _ensure_platform_int,
                                  _ensure_object,
                                  _ensure_float64,
+                                 _ensure_uint64,
                                  _ensure_int64,
                                  is_list_like)
 from pandas.compat.numpy import _np_version_under1p10
@@ -129,9 +130,12 @@ def unique1d(values):
         table = htable.Int64HashTable(len(values))
         uniques = table.unique(_ensure_int64(values))
         uniques = uniques.view('m8[ns]')
-    elif np.issubdtype(values.dtype, np.integer):
+    elif np.issubdtype(values.dtype, np.signedinteger):
         table = htable.Int64HashTable(len(values))
         uniques = table.unique(_ensure_int64(values))
+    elif np.issubdtype(values.dtype, np.unsignedinteger):
+        table = htable.UInt64HashTable(len(values))
+        uniques = table.unique(_ensure_uint64(values))
     else:
 
         # its cheaper to use a String Hash Table than Object

diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd
@@ -1,10 +1,17 @@
-from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
+from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
+                    kh_str_t, uint64_t, int64_t, float64_t)
 
 # prototypes for sharing
 
 cdef class HashTable:
     pass
 
+cdef class UInt64HashTable(HashTable):
+    cdef kh_uint64_t *table
+
+    cpdef get_item(self, uint64_t val)
+    cpdef set_item(self, uint64_t key, Py_ssize_t val)
+
 cdef class Int64HashTable(HashTable):
     cdef kh_int64_t *table
 

diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
@@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
           ('int16', 'INT16', 'int16'),
           ('int32', 'INT32', 'int32'),
           ('int64', 'INT64', 'int64'),
+          ('uint64', 'UINT64', 'uint64'),
           # ('platform_int', 'INT', 'int_'),
           # ('object', 'OBJECT', 'object_'),
 ]

diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in
@@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 dtypes = [('Float64', 'float64', 'float64_t'),
           ('Int64', 'int64', 'int64_t'),
-          ('String', 'string', 'char *')]
+          ('String', 'string', 'char *'),
+          ('UInt64', 'uint64', 'uint64_t')]
 }}
 
 {{for name, dtype, arg in dtypes}}
@@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
 
 ctypedef fused vector_data:
     Int64VectorData
+    UInt64VectorData
     Float64VectorData
     StringVectorData
 
@@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil:
 
 # name, dtype, arg, idtype
 dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
+          ('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
           ('Int64', 'int64', 'int64_t', 'np.int64')]
 
 }}
@@ -201,6 +204,7 @@ cdef class HashTable:
 
 # name, dtype, null_condition, float_group
 dtypes = [('Float64', 'float64', 'val != val', True),
+          ('UInt64', 'uint64', 'val == 0', False),
           ('Int64', 'int64', 'val == iNaT', False)]
 
 }}

diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in
@@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 {{py:
 
 # name
-dtypes = ['float64', 'int64']
+dtypes = ['float64', 'int64', 'uint64']
 
 }}
 

diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd
@@ -1,5 +1,5 @@
 from cpython cimport PyObject
-from numpy cimport int64_t, int32_t, uint32_t, float64_t
+from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t
 
 cdef extern from "khash_python.h":
     ctypedef uint32_t khint_t
@@ -55,7 +55,6 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_str(kh_str_t*, khiter_t) nogil
 
-
     ctypedef struct kh_int64_t:
         khint_t n_buckets, size, n_occupied, upper_bound
         uint32_t *flags
@@ -72,6 +71,24 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
 
+    ctypedef uint64_t khuint64_t
+
+    ctypedef struct kh_uint64_t:
+        khint_t n_buckets, size, n_occupied, upper_bound
+        uint32_t *flags
+        khuint64_t *keys
+        size_t *vals
+
+    inline kh_uint64_t* kh_init_uint64() nogil
+    inline void kh_destroy_uint64(kh_uint64_t*) nogil
+    inline void kh_clear_uint64(kh_uint64_t*) nogil
+    inline khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil
+    inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
+    inline khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil
+    inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil
+
+    bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
+
     ctypedef struct kh_float64_t:
         khint_t n_buckets, size, n_occupied, upper_bound
         uint32_t *flags

diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h
@@ -567,12 +567,14 @@ typedef const char *kh_cstr_t;
 
 #define kh_exist_str(h, k) (kh_exist(h, k))
 #define kh_exist_float64(h, k) (kh_exist(h, k))
+#define kh_exist_uint64(h, k) (kh_exist(h, k))
 #define kh_exist_int64(h, k) (kh_exist(h, k))
 #define kh_exist_int32(h, k) (kh_exist(h, k))
 
 KHASH_MAP_INIT_STR(str, size_t)
 KHASH_MAP_INIT_INT(int32, size_t)
 KHASH_MAP_INIT_INT64(int64, size_t)
+KHASH_MAP_INIT_UINT64(uint64, size_t)
 
 
 #endif /* __AC_KHASH_H */
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -899,6 +899,18 @@ def test_lookup_nan(self):
         self.assert_numpy_array_equal(m.lookup(xs),
                                       np.arange(len(xs), dtype=np.int64))
 
+    def test_lookup_overflow(self):
+        xs = np.array([1, 2, 2**63], dtype=np.uint64)
+        m = hashtable.UInt64HashTable()
+        m.map_locations(xs)
+        self.assert_numpy_array_equal(m.lookup(xs),
+                                      np.arange(len(xs), dtype=np.int64))
+
+    def test_get_unique(self):
+        s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+        exp = np.array([1, 2, 2**63], dtype=np.uint64)
+        self.assert_numpy_array_equal(s.unique(), exp)
+
     def test_vector_resize(self):
         # Test for memory errors after internal vector
         # reallocations (pull request #7157)
@@ -915,7 +927,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals):
             (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
             (hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
             (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
-            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
+            (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
+            (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
 
         for (tbl, vect, dtype) in test_cases:
             # resizing to empty is a special case

diff --git a/pandas/types/common.py b/pandas/types/common.py
@@ -32,6 +32,8 @@ def _ensure_float(arr):
         arr = arr.astype(float)
     return arr
 
+
+_ensure_uint64 = algos.ensure_uint64
 _ensure_int64 = algos.ensure_int64
 _ensure_int32 = algos.ensure_int32
 _ensure_int16 = algos.ensure_int16
Original file line number	Diff line number	Diff line change
Expand Up		@@ -258,4 +258,6 @@ Bug Fixes




		- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
		- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
-Original file line number
+Diff line change
@@ Expand Up @@
     {{py:
     # name
-    dtypes = ['float64', 'int64']
+    dtypes = ['float64', 'int64', 'uint64']
     }}
@@ Expand Down @@