Skip to content

Commit

Permalink
BUG: Prevent uint64 overflow in Series.unique
Browse files Browse the repository at this point in the history
Introduces a UInt64HashTable class to hash
uint64 elements and prevent overflow in
functions like Series.unique.

Closes gh-14721 (pandas-dev/pandas).
  • Loading branch information
gfyoung committed Dec 20, 2016
1 parent 3ccb501 commit 380c580
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 7 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -258,4 +258,6 @@ Bug Fixes




- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
6 changes: 5 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ensure_platform_int,
_ensure_object,
_ensure_float64,
_ensure_uint64,
_ensure_int64,
is_list_like)
from pandas.compat.numpy import _np_version_under1p10
Expand Down Expand Up @@ -129,9 +130,12 @@ def unique1d(values):
table = htable.Int64HashTable(len(values))
uniques = table.unique(_ensure_int64(values))
uniques = uniques.view('m8[ns]')
elif np.issubdtype(values.dtype, np.integer):
elif np.issubdtype(values.dtype, np.signedinteger):
table = htable.Int64HashTable(len(values))
uniques = table.unique(_ensure_int64(values))
elif np.issubdtype(values.dtype, np.unsignedinteger):
table = htable.UInt64HashTable(len(values))
uniques = table.unique(_ensure_uint64(values))
else:

# its cheaper to use a String Hash Table than Object
Expand Down
9 changes: 8 additions & 1 deletion pandas/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
kh_str_t, uint64_t, int64_t, float64_t)

# prototypes for sharing

cdef class HashTable:
pass

cdef class UInt64HashTable(HashTable):
cdef kh_uint64_t *table

cpdef get_item(self, uint64_t val)
cpdef set_item(self, uint64_t key, Py_ssize_t val)

cdef class Int64HashTable(HashTable):
cdef kh_int64_t *table

Expand Down
1 change: 1 addition & 0 deletions pandas/src/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
]
Expand Down
6 changes: 5 additions & 1 deletion pandas/src/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

dtypes = [('Float64', 'float64', 'float64_t'),
('Int64', 'int64', 'int64_t'),
('String', 'string', 'char *')]
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t')]
}}

{{for name, dtype, arg in dtypes}}
Expand All @@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,

ctypedef fused vector_data:
Int64VectorData
UInt64VectorData
Float64VectorData
StringVectorData

Expand All @@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil:

# name, dtype, arg, idtype
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
('Int64', 'int64', 'int64_t', 'np.int64')]

}}
Expand Down Expand Up @@ -201,6 +204,7 @@ cdef class HashTable:

# name, dtype, null_condition, float_group
# uint64 has no missing-value sentinel: 0 is ordinary data, not a null
# marker, so the null condition for UInt64 is the constant False.
dtypes = [('Float64', 'float64', 'val != val', True),
          ('UInt64', 'uint64', 'False', False),
          ('Int64', 'int64', 'val == iNaT', False)]

}}
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
{{py:

# name
dtypes = ['float64', 'int64']
dtypes = ['float64', 'int64', 'uint64']

}}

Expand Down
21 changes: 19 additions & 2 deletions pandas/src/khash.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from cpython cimport PyObject
from numpy cimport int64_t, int32_t, uint32_t, float64_t
from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t

cdef extern from "khash_python.h":
ctypedef uint32_t khint_t
Expand Down Expand Up @@ -55,7 +55,6 @@ cdef extern from "khash_python.h":

bint kh_exist_str(kh_str_t*, khiter_t) nogil


ctypedef struct kh_int64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
Expand All @@ -72,6 +71,24 @@ cdef extern from "khash_python.h":

bint kh_exist_int64(kh_int64_t*, khiter_t) nogil

ctypedef uint64_t khuint64_t

ctypedef struct kh_uint64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
khuint64_t *keys
size_t *vals

# Key parameters use the unsigned key type, matching the `keys` field of
# kh_uint64_t. Declaring them as signed int64_t misstates the signature and
# relies on an implicit signed/unsigned conversion for keys >= 2**63.
inline kh_uint64_t* kh_init_uint64() nogil
inline void kh_destroy_uint64(kh_uint64_t*) nogil
inline void kh_clear_uint64(kh_uint64_t*) nogil
inline khint_t kh_get_uint64(kh_uint64_t*, khuint64_t) nogil
inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
inline khint_t kh_put_uint64(kh_uint64_t*, khuint64_t, int*) nogil
inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil

bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil

ctypedef struct kh_float64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -567,12 +567,14 @@ typedef const char *kh_cstr_t;

#define kh_exist_str(h, k) (kh_exist(h, k))
#define kh_exist_float64(h, k) (kh_exist(h, k))
#define kh_exist_uint64(h, k) (kh_exist(h, k))
#define kh_exist_int64(h, k) (kh_exist(h, k))
#define kh_exist_int32(h, k) (kh_exist(h, k))

KHASH_MAP_INIT_STR(str, size_t)
KHASH_MAP_INIT_INT(int32, size_t)
KHASH_MAP_INIT_INT64(int64, size_t)
KHASH_MAP_INIT_UINT64(uint64, size_t)


#endif /* __AC_KHASH_H */
15 changes: 14 additions & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,18 @@ def test_lookup_nan(self):
self.assert_numpy_array_equal(m.lookup(xs),
np.arange(len(xs), dtype=np.int64))

def test_lookup_overflow(self):
# see gh-14721: 2**63 is one past the int64 maximum, so it can only be
# represented as an unsigned 64-bit key
xs = np.array([1, 2, 2**63], dtype=np.uint64)
m = hashtable.UInt64HashTable()
m.map_locations(xs)
# the expected result asserts that every key looks up to its own position
self.assert_numpy_array_equal(m.lookup(xs),
np.arange(len(xs), dtype=np.int64))

def test_get_unique(self):
# see gh-14721: the duplicated 2**63 (one past the int64 maximum) is
# expected to collapse to a single entry with the uint64 dtype preserved
s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
exp = np.array([1, 2, 2**63], dtype=np.uint64)
self.assert_numpy_array_equal(s.unique(), exp)

def test_vector_resize(self):
# Test for memory errors after internal vector
# reallocations (pull request #7157)
Expand All @@ -915,7 +927,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals):
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]

for (tbl, vect, dtype) in test_cases:
# resizing to empty is a special case
Expand Down
2 changes: 2 additions & 0 deletions pandas/types/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def _ensure_float(arr):
arr = arr.astype(float)
return arr


_ensure_uint64 = algos.ensure_uint64
_ensure_int64 = algos.ensure_int64
_ensure_int32 = algos.ensure_int32
_ensure_int16 = algos.ensure_int16
Expand Down

0 comments on commit 380c580

Please sign in to comment.