From 380c58041a2cf327b1cc2de71586707dbdf16129 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 18 Dec 2016 23:28:36 -0500 Subject: [PATCH] BUG: Prevent uint64 overflow in Series.unique Introduces a UInt64HashTable class to hash uint64 elements and prevent overflow in functions like Series.unique. Closes gh-14721. --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/core/algorithms.py | 6 +++++- pandas/hashtable.pxd | 9 ++++++++- pandas/src/algos_common_helper.pxi.in | 1 + pandas/src/hashtable_class_helper.pxi.in | 6 +++++- pandas/src/hashtable_func_helper.pxi.in | 2 +- pandas/src/khash.pxd | 21 +++++++++++++++++++-- pandas/src/klib/khash.h | 2 ++ pandas/tests/test_algos.py | 15 ++++++++++++++- pandas/types/common.py | 2 ++ 10 files changed, 59 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 83a70aa34fccf..a8421535636f9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -258,4 +258,6 @@ Bug Fixes + +- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bcd3776867b6..e51774ce4d9b4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -25,6 +25,7 @@ _ensure_platform_int, _ensure_object, _ensure_float64, + _ensure_uint64, _ensure_int64, is_list_like) from pandas.compat.numpy import _np_version_under1p10 @@ -129,9 +130,12 @@ def unique1d(values): table = htable.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) uniques = uniques.view('m8[ns]') - elif np.issubdtype(values.dtype, np.integer): + elif np.issubdtype(values.dtype, np.signedinteger): table = htable.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) + elif np.issubdtype(values.dtype, np.unsignedinteger): + table = htable.UInt64HashTable(len(values)) + uniques = table.unique(_ensure_uint64(values)) else: # its cheaper to use a String Hash Table than Object diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index f3ea7ad792160..cd06b938310a8 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -1,10 +1,17 @@ -from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t +from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, + kh_str_t, uint64_t, int64_t, float64_t) # prototypes for sharing cdef class HashTable: pass +cdef class UInt64HashTable(HashTable): + cdef kh_uint64_t *table + + cpdef get_item(self, uint64_t val) + cpdef set_item(self, uint64_t key, Py_ssize_t val) + cdef class Int64HashTable(HashTable): cdef kh_int64_t *table diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index c52c734f727e9..c1c190704b4c7 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('uint64', 'UINT64', 'uint64'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 22714e6305677..55c840b20c78b 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), - ('String', 'string', 'char *')] + ('String', 'string', 'char *'), + ('UInt64', 'uint64', 'uint64_t')] }} {{for name, dtype, arg in dtypes}} @@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + UInt64VectorData Float64VectorData StringVectorData @@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, arg, idtype dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), + ('UInt64', 'uint64', 'uint64_t', 'np.uint64'), ('Int64', 'int64', 'int64_t', 'np.int64')] }} @@ -201,6 +204,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True), + ('UInt64', 'uint64', 'val == 0', False), ('Int64', 'int64', 'val == iNaT', False)] }} diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in index 1840b914f3328..f3e16cfd32963 100644 --- a/pandas/src/hashtable_func_helper.pxi.in +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name -dtypes = ['float64', 'int64'] +dtypes = ['float64', 'int64', 'uint64'] }} diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd index b28f43eecfac7..adb0fe285dbb8 100644 --- a/pandas/src/khash.pxd +++ b/pandas/src/khash.pxd @@ -1,5 +1,5 @@ from cpython cimport PyObject -from numpy cimport int64_t, int32_t, uint32_t, float64_t +from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t cdef extern from "khash_python.h": ctypedef uint32_t khint_t @@ -55,7 +55,6 @@ cdef extern from "khash_python.h": bint kh_exist_str(kh_str_t*, khiter_t) nogil - ctypedef struct kh_int64_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags @@ -72,6 +71,24 @@ cdef extern from "khash_python.h": bint kh_exist_int64(kh_int64_t*, khiter_t) nogil + ctypedef uint64_t khuint64_t + + ctypedef struct kh_uint64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + khuint64_t *keys + size_t *vals + + inline kh_uint64_t* kh_init_uint64() nogil + inline void kh_destroy_uint64(kh_uint64_t*) nogil + inline void kh_clear_uint64(kh_uint64_t*) nogil + inline khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil + inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil + inline khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil + inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil + + bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil + ctypedef struct kh_float64_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h index dc004a0e1770b..869607a44c001 100644 --- a/pandas/src/klib/khash.h +++ b/pandas/src/klib/khash.h @@ -567,12 +567,14 @@ typedef const char *kh_cstr_t; #define kh_exist_str(h, k) (kh_exist(h, k)) #define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) +KHASH_MAP_INIT_UINT64(uint64, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 92a9184ad30fc..7f1745edbb816 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -899,6 +899,18 @@ def test_lookup_nan(self): self.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + def test_lookup_overflow(self): + xs = np.array([1, 2, 2**63], dtype=np.uint64) + m = hashtable.UInt64HashTable() + m.map_locations(xs) + self.assert_numpy_array_equal(m.lookup(xs), + np.arange(len(xs), dtype=np.int64)) + + def test_get_unique(self): + s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64) + exp = np.array([1, 2, 2**63], dtype=np.uint64) + self.assert_numpy_array_equal(s.unique(), exp) + def test_vector_resize(self): # Test for memory errors after internal vector # reallocations (pull request #7157) @@ -915,7 +927,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals): (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')] + (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), + (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] for (tbl, vect, dtype) in test_cases: # resizing to empty is a special case diff --git a/pandas/types/common.py b/pandas/types/common.py index a7ba96f95e31b..06c8ef6e35cd7 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -32,6 +32,8 @@ def _ensure_float(arr): arr = arr.astype(float) return arr + +_ensure_uint64 = algos.ensure_uint64 _ensure_int64 = algos.ensure_int64 _ensure_int32 = algos.ensure_int32 _ensure_int16 = algos.ensure_int16