From 06d9edfeeefb669e14763749cceade783e0f0658 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Wed, 2 Jun 2021 17:38:18 -0700 Subject: [PATCH 01/18] Adds reference code, tests, and peformance python code for array_to_duplicated_hashable --- performance/__main__.py | 63 ++++++++++++++++++++++++++++- performance/reference/util.py | 59 +++++++++++++++++++++++++++ test/test_util.py | 75 +++++++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 1 deletion(-) diff --git a/performance/__main__.py b/performance/__main__.py index 9e31dc4d..e6d595a8 100644 --- a/performance/__main__.py +++ b/performance/__main__.py @@ -1,7 +1,9 @@ +import argparse import collections import datetime +import functools +import itertools import timeit -import argparse import numpy as np @@ -17,6 +19,7 @@ from performance.reference.util import dtype_from_element as dtype_from_element_ref from performance.reference.util import array_deepcopy as array_deepcopy_ref from performance.reference.util import isna_element as isna_element_ref +from performance.reference.util import _array_to_duplicated_hashable as array_to_duplicated_hashable_ref from performance.reference.array_go import ArrayGO as ArrayGOREF @@ -32,6 +35,7 @@ from arraykit import dtype_from_element as dtype_from_element_ak from arraykit import array_deepcopy as array_deepcopy_ak from arraykit import isna_element as isna_element_ak +from performance.reference.util import _array_to_duplicated_hashable as array_to_duplicated_hashable_ak from arraykit import ArrayGO as ArrayGOAK @@ -359,6 +363,63 @@ class IsNaElementPerfREF(IsNaElementPerf): entry = staticmethod(isna_element_ref) +#------------------------------------------------------------------------------- +class ArrayToDuplicatedHashablePerf(Perf): + NUMBER = 1 + FUNCTIONS = ('array_1d', 'array_2d') + + def __init__(self): + self.arrays_1d_small = [ + np.array([0,0,1,0,None,None,0,1,None], dtype=object), + np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object), + np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), + np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + ] + self.arrays_1d_large = [ + np.arange(100_000).astype(object), + np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), + ] + + self.arrays_2d_small = [ + np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object), + np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object), + np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object), + ] + self.arrays_2d_large = [ + np.arange(100_000).reshape(10_000, 10).astype(object), + np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object), + ] + + def array_1d(self): + prd = functools.partial(itertools.product, (True, False), (True, False)) + + for _ in range(1000): + for exclude_first, exclude_last, arr in prd(self.arrays_1d_small): + self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) + + for _ in range(5): + for exclude_first, exclude_last, arr in prd(self.arrays_1d_large): + self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) + + def array_2d(self): + prd = functools.partial(itertools.product, (0, 1), (True, False), (True, False)) + + for _ in range(1000): + for axis, exclude_first, exclude_last, arr in prd(self.arrays_2d_small): + self.entry(arr, axis, exclude_first, exclude_last) + + for _ in range(5): + for axis, exclude_first, exclude_last, arr in prd(self.arrays_2d_large): + self.entry(arr, axis, exclude_first, exclude_last) + + +class ArrayToDuplicatedHashablePerfAK(ArrayToDuplicatedHashablePerf): + entry = staticmethod(array_to_duplicated_hashable_ak) + +class ArrayToDuplicatedHashablePerfREF(ArrayToDuplicatedHashablePerf): + entry = staticmethod(array_to_duplicated_hashable_ref) + + #------------------------------------------------------------------------------- def get_arg_parser(): diff --git a/performance/reference/util.py b/performance/reference/util.py index 0f2d0efc..20bf65ba 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -216,3 +216,62 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype: # NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls return np.array(value).dtype + +#------------------------------------------------------------------------------- +# tools for handling duplicates + +def _array_to_duplicated_hashable( + array: np.ndarray, + axis: int = 0, + exclude_first: bool = False, + exclude_last: bool = False) -> np.ndarray: + ''' + Algorithm for finding duplicates in unsortable arrays for hashables. This will always be an object array. + ''' + # np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to set drectly. + len_axis = array.shape[axis] + + if array.ndim == 1: + value_source = array + to_hashable = None + else: + if axis == 0: + value_source = array # will iterate rows + else: + value_source = (array[:, i] for i in range(len_axis)) + # values will be arrays; must convert to tuples to make hashable + to_hashable = tuple + + is_dupe = np.full(len_axis, False) + + # could exit early with a set, but would have to hash all array twice to go to set and dictionary + # creating a list for each entry and tracking indices would be very expensive + + unique_to_first: tp.Dict[tp.Hashable, int] = {} # value to first occurence + dupe_to_first: tp.Dict[tp.Hashable, int] = {} + dupe_to_last: tp.Dict[tp.Hashable, int] = {} + + for idx, v in enumerate(value_source): + + if to_hashable: + v = to_hashable(v) + + if v not in unique_to_first: + unique_to_first[v] = idx + else: + # v has been seen before; upate Boolean array + is_dupe[idx] = True + + # if no entry in dupe to first, no update with value in unique to first, which is the index this values was first seen + if v not in dupe_to_first: + dupe_to_first[v] = unique_to_first[v] + # always update last + dupe_to_last[v] = idx + + if exclude_last: # overwrite with False + is_dupe[list(dupe_to_last.values())] = False + + if not exclude_first: # add in first values + is_dupe[list(dupe_to_first.values())] = True + + return is_dupe diff --git a/test/test_util.py b/test/test_util.py index dcdc1c24..7824753a 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -16,6 +16,7 @@ from arraykit import array_deepcopy from arraykit import isna_element from arraykit import dtype_from_element +from performance.reference.util import _array_to_duplicated_hashable from performance.reference.util import mloc as mloc_ref @@ -368,6 +369,80 @@ def test_dtype_from_element_str_and_bytes_dtypes(self) -> None: self.assertEqual(np.dtype(f'|S{size}'), dtype_from_element(bytes(size))) self.assertEqual(np.dtype(f' None: + a = _array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + exclude_first=False, exclude_last=False) + assert a.tolist() == [False, True, True, True, True, True, True, False, True, True, True, False] + + a = _array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + exclude_first=True, exclude_last=False) + assert a.tolist() == [False, False, False, True, True, False, False, False, True, True, True, False] + + def test_array_to_duplicated_hashable_b(self) -> None: + a = np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object) + # find duplicate rows + post = _array_to_duplicated_hashable(a, axis=0) + assert post.tolist() == [False, False] + + post = _array_to_duplicated_hashable(a, axis=1) + assert post.tolist() == [True, True, False, True, True] + + post = _array_to_duplicated_hashable(a, axis=1, exclude_first=True) + assert post.tolist() == [False, True, False, False, True] + + def test_array_to_duplicated_hashable_c(self) -> None: + c = _array_to_duplicated_hashable(np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), + exclude_first=False, exclude_last=False) + assert c.tolist() == [True, True, True, False, True, True] + + def test_array_to_duplicated_hashable_d(self) -> None: + # NOTE: these cases fail with hetergenous types as we cannot sort + a1 = np.array([0,0,1,0,None,None,0,1,None], dtype=object) + a2 = np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object) + + for array in (a1, a2): + post1 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) + assert post1.tolist() == [True, True, True, True, True, True, True, True, True] + + post2 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) + assert post2.tolist() == [False, True, False, True, False, True, True, True, True] + + post3 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) + assert post3.tolist() == [True, True, True, True, True, True, False, False, False] + + post4 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=True) + assert post4.tolist() == [False, True, False, True, False, True, False, False, False] + + def test_array_to_duplicated_hashable_e(self) -> None: + array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object) + + post1 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) + assert post1.tolist() == [False, True, True] + + post2 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) + assert post2.tolist() == [False, False, True] + + post3 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) + assert post3.tolist() == [False, True, False] + + post4 = _array_to_duplicated_hashable( array, exclude_first=True, exclude_last=True) + assert post4.tolist() == [False, False, False] + + def test_array_to_duplicated_hashable_f(self) -> None: + array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object) + + post1 = _array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=False) + assert post1.tolist() == [True, True, True, False, True, True] + + post2 = _array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=False) + assert post2.tolist() == [False, True, True, False, False, True] + + post3 = _array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=True) + assert post3.tolist() == [True, True, False, False, True, False] + + post4 = _array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=True) + assert post4.tolist() == [False, True, False, False, False, False] + if __name__ == '__main__': unittest.main() From 0aac1dbbbac49f9505a9657fc51b76e2a8f64c97 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Wed, 2 Jun 2021 17:44:43 -0700 Subject: [PATCH 02/18] Removes leading underscore from name. Adds initial C scaffolding. --- performance/__main__.py | 4 +-- performance/reference/util.py | 2 +- src/__init__.py | 1 + src/__init__.pyi | 7 ++++- src/_arraykit.c | 23 ++++++++++++++++ test/test_util.py | 50 +++++++++++++++++------------------ 6 files changed, 58 insertions(+), 29 deletions(-) diff --git a/performance/__main__.py b/performance/__main__.py index e6d595a8..03efa0eb 100644 --- a/performance/__main__.py +++ b/performance/__main__.py @@ -19,7 +19,7 @@ from performance.reference.util import dtype_from_element as dtype_from_element_ref from performance.reference.util import array_deepcopy as array_deepcopy_ref from performance.reference.util import isna_element as isna_element_ref -from performance.reference.util import _array_to_duplicated_hashable as array_to_duplicated_hashable_ref +from performance.reference.util import array_to_duplicated_hashable as array_to_duplicated_hashable_ref from performance.reference.array_go import ArrayGO as ArrayGOREF @@ -35,7 +35,7 @@ from arraykit import dtype_from_element as dtype_from_element_ak from arraykit import array_deepcopy as array_deepcopy_ak from arraykit import isna_element as isna_element_ak -from performance.reference.util import _array_to_duplicated_hashable as array_to_duplicated_hashable_ak +from performance.reference.util import array_to_duplicated_hashable as array_to_duplicated_hashable_ak from arraykit import ArrayGO as ArrayGOAK diff --git a/performance/reference/util.py b/performance/reference/util.py index 20bf65ba..0852de73 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -220,7 +220,7 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype: #------------------------------------------------------------------------------- # tools for handling duplicates -def _array_to_duplicated_hashable( +def array_to_duplicated_hashable( array: np.ndarray, axis: int = 0, exclude_first: bool = False, diff --git a/src/__init__.py b/src/__init__.py index 988ca110..b113f124 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -16,3 +16,4 @@ from ._arraykit import resolve_dtype_iter as resolve_dtype_iter from ._arraykit import isna_element as isna_element from ._arraykit import dtype_from_element as dtype_from_element +from ._arraykit import array_to_duplicated_hashable as array_to_duplicated_hashable diff --git a/src/__init__.pyi b/src/__init__.pyi index 4ff12eb9..b839dc44 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -32,4 +32,9 @@ def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ... def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ... def isna_element(__value: tp.Any) -> bool: ... def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ... - +def array_to_duplicated_hashable( + array: np.ndarray, + axis: int = 0, + exclude_first: bool = False, + exclude_last: bool = False, + ) -> np.ndarray: ... diff --git a/src/_arraykit.c b/src/_arraykit.c index f8906a5c..14f36099 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -490,6 +490,25 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) Py_RETURN_FALSE; } +//------------------------------------------------------------------------------ +// duplication + +static PyObject * +array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) +{ + return NULL; + // PyObject *array; + // PyObject *memo = NULL; + // if (!PyArg_ParseTupleAndKeywords(args, kwargs, + // "O|O!:array_to_duplicated_hashable", array_deepcopy_kwarg_names, + // &array, + // &PyDict_Type, &memo)) { + // return NULL; + // } + // AK_CHECK_NUMPY_ARRAY(array); + // return AK_ArrayDeepCopy((PyArrayObject*)array, memo); +} + //------------------------------------------------------------------------------ // ArrayGO //------------------------------------------------------------------------------ @@ -772,6 +791,10 @@ static PyMethodDef arraykit_methods[] = { {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, {"isna_element", isna_element, METH_O, NULL}, {"dtype_from_element", dtype_from_element, METH_O, NULL}, + {"array_to_duplicated_hashable", + (PyCFunction)array_to_duplicated_hashable, + METH_VARARGS | METH_KEYWORDS, + NULL}, {NULL}, }; diff --git a/test/test_util.py b/test/test_util.py index 7824753a..5ed47c33 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -16,7 +16,7 @@ from arraykit import array_deepcopy from arraykit import isna_element from arraykit import dtype_from_element -from performance.reference.util import _array_to_duplicated_hashable +from performance.reference.util import array_to_duplicated_hashable from performance.reference.util import mloc as mloc_ref @@ -369,78 +369,78 @@ def test_dtype_from_element_str_and_bytes_dtypes(self) -> None: self.assertEqual(np.dtype(f'|S{size}'), dtype_from_element(bytes(size))) self.assertEqual(np.dtype(f' None: - a = _array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + def testarray_to_duplicated_hashable_a(self) -> None: + a = array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), exclude_first=False, exclude_last=False) assert a.tolist() == [False, True, True, True, True, True, True, False, True, True, True, False] - a = _array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + a = array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), exclude_first=True, exclude_last=False) assert a.tolist() == [False, False, False, True, True, False, False, False, True, True, True, False] - def test_array_to_duplicated_hashable_b(self) -> None: + def testarray_to_duplicated_hashable_b(self) -> None: a = np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object) # find duplicate rows - post = _array_to_duplicated_hashable(a, axis=0) + post = array_to_duplicated_hashable(a, axis=0) assert post.tolist() == [False, False] - post = _array_to_duplicated_hashable(a, axis=1) + post = array_to_duplicated_hashable(a, axis=1) assert post.tolist() == [True, True, False, True, True] - post = _array_to_duplicated_hashable(a, axis=1, exclude_first=True) + post = array_to_duplicated_hashable(a, axis=1, exclude_first=True) assert post.tolist() == [False, True, False, False, True] - def test_array_to_duplicated_hashable_c(self) -> None: - c = _array_to_duplicated_hashable(np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), + def testarray_to_duplicated_hashable_c(self) -> None: + c = array_to_duplicated_hashable(np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), exclude_first=False, exclude_last=False) assert c.tolist() == [True, True, True, False, True, True] - def test_array_to_duplicated_hashable_d(self) -> None: + def testarray_to_duplicated_hashable_d(self) -> None: # NOTE: these cases fail with hetergenous types as we cannot sort a1 = np.array([0,0,1,0,None,None,0,1,None], dtype=object) a2 = np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object) for array in (a1, a2): - post1 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) + post1 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) assert post1.tolist() == [True, True, True, True, True, True, True, True, True] - post2 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) + post2 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) assert post2.tolist() == [False, True, False, True, False, True, True, True, True] - post3 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) + post3 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) assert post3.tolist() == [True, True, True, True, True, True, False, False, False] - post4 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=True) + post4 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=True) assert post4.tolist() == [False, True, False, True, False, True, False, False, False] - def test_array_to_duplicated_hashable_e(self) -> None: + def testarray_to_duplicated_hashable_e(self) -> None: array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object) - post1 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) + post1 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False) assert post1.tolist() == [False, True, True] - post2 = _array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) + post2 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False) assert post2.tolist() == [False, False, True] - post3 = _array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) + post3 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True) assert post3.tolist() == [False, True, False] - post4 = _array_to_duplicated_hashable( array, exclude_first=True, exclude_last=True) + post4 = array_to_duplicated_hashable( array, exclude_first=True, exclude_last=True) assert post4.tolist() == [False, False, False] - def test_array_to_duplicated_hashable_f(self) -> None: + def testarray_to_duplicated_hashable_f(self) -> None: array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object) - post1 = _array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=False) + post1 = array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=False) assert post1.tolist() == [True, True, True, False, True, True] - post2 = _array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=False) + post2 = array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=False) assert post2.tolist() == [False, True, True, False, False, True] - post3 = _array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=True) + post3 = array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=True) assert post3.tolist() == [True, True, False, False, True, False] - post4 = _array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=True) + post4 = array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=True) assert post4.tolist() == [False, True, False, False, False, False] From 20e0577c0e535b2115b188cb1de26f5094433ab4 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 3 Jun 2021 12:48:17 -0700 Subject: [PATCH 03/18] Adds working initial approach for 1d iteration with no constraints. --- debug.py | 76 ++++++++++ performance/reference/util.py | 3 +- src/_arraykit.c | 251 ++++++++++++++++++++++++++++++++-- 3 files changed, 318 insertions(+), 12 deletions(-) create mode 100644 debug.py diff --git a/debug.py b/debug.py new file mode 100644 index 00000000..1a75ccef --- /dev/null +++ b/debug.py @@ -0,0 +1,76 @@ +from arraykit import array_to_duplicated_hashable +import numpy as np + +class PO: + def __init__(self, v) -> None: + self.v = v + def __repr__(self) -> str: + return f'PO<{self.v}>' + + +def new( + array: np.ndarray, + axis: int = 0, + exclude_first: bool = False, + exclude_last: bool = False, + ) -> np.ndarray: + ''' + Algorithm for finding duplicates in unsortable arrays for hashables. This will always be an object array. + + Note: + np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to set drectly. + ''' + size = array.shape[axis] + + if array.ndim == 1: + value_source = array + else: + if axis == 0: + value_source = map(tuple, array) + else: + value_source = map(tuple, array.T) + + is_dupe = np.full(size, False) + + if exclude_first and not exclude_last: + # Optimize for route requiring least amount of data structure + + found = set() + + for idx, v in enumerate(value_source): + if v not in found: + found.add(v) + else: + is_dupe[idx] = True + + return is_dupe + + first_unique_locations = {} + last_duplicate_locations = {} + + for idx, v in enumerate(value_source): + if v not in first_unique_locations: + first_unique_locations[v] = idx + else: + is_dupe[idx] = True + + if v not in last_duplicate_locations and not exclude_first: + is_dupe[first_unique_locations[v]] = True + + # always update last + last_duplicate_locations[v] = idx + + if exclude_last: # overwrite with False + is_dupe[list(last_duplicate_locations.values())] = False + + return is_dupe + +arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3]) +#array_to_duplicated_hashable(np.arange(5)) +#array_to_duplicated_hashable(np.arange(5), 213) +#array_to_duplicated_hashable(np.arange(5), 1) +#array_to_duplicated_hashable(np.arange(5), 1, True) +#array_to_duplicated_hashable(np.arange(5), 1, 123) +#array_to_duplicated_hashable(np.arange(5), 1, True) +x = array_to_duplicated_hashable(arr, 1, True, False) +print(x) diff --git a/performance/reference/util.py b/performance/reference/util.py index 0852de73..167de3ba 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -224,7 +224,8 @@ def array_to_duplicated_hashable( array: np.ndarray, axis: int = 0, exclude_first: bool = False, - exclude_last: bool = False) -> np.ndarray: + exclude_last: bool = False, + ) -> np.ndarray: ''' Algorithm for finding duplicates in unsortable arrays for hashables. This will always be an object array. ''' diff --git a/src/_arraykit.c b/src/_arraykit.c index 14f36099..0ff2dd2a 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -493,20 +493,249 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) //------------------------------------------------------------------------------ // duplication +static PyObject * +AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObject *is_dup) +{ + /* + Rougly equivalent Python code + + found = set() + + for idx, v in enumerate(array): + if v not in found: + found.add(v) + else: + is_dupe[idx] = True + + return is_dupe + */ + // This path is optimized to only construct a single set + PyObject *found = PySet_New(NULL); + if (!found) { + Py_DECREF(is_dup); + return NULL; + } + + NpyIter *iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + if (!iter) { + Py_DECREF(is_dup); + Py_DECREF(found); + return NULL; + } + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + do { + char *data = *dataptr; + npy_intp stride = *strideptr; + npy_intp count = *sizeptr; + + PyObject* obj_ref = NULL; + + int i = 0; + while (count--) { + AK_DEBUG_OBJ(found); + // Object arrays contains pointers to PyObjects, so we will only temporarily + // look at the reference here. + memcpy(&obj_ref, data, sizeof(obj_ref)); + + // 5. Assign into result whether or not the element exists in the set + int in_set = PySequence_Contains(found, obj_ref); + if (in_set == -1) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + else if (in_set == 0) { + Py_INCREF(obj_ref); + int add_success = PySet_Add(found, obj_ref); + Py_DECREF(obj_ref); + if (add_success == -1) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + } + else { + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + } + + data += stride; + i += 1; + } + + } while (iternext(iter)); + + Py_DECREF(found); + NpyIter_Deallocate(iter); + + return (PyObject*)is_dup; +} + +static PyObject * +AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayObject *is_dup) +{ + /* + Rougly equivalent Python code + + found = set() + + for idx, v in enumerate(array): + if v not in found: + found.add(v) + else: + is_dupe[idx] = True + + return is_dupe + */ + PyObject *first_unique_locations = PyDict_New(); + if (!first_unique_locations) { + Py_DECREF(is_dup); + return NULL; + } + + PyObject *last_duplicate_locations = PyDict_New(); + if (!last_duplicate_locations) { + Py_DECREF(first_unique_locations); + Py_DECREF(is_dup); + return NULL; + } + + + NpyIter *iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + if (!iter) { + Py_DECREF(is_dup); + Py_DECREF(found); + return NULL; + } + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + do { + char *data = *dataptr; + npy_intp stride = *strideptr; + npy_intp count = *sizeptr; + + PyObject* obj_ref = NULL; + + int i = 0; + while (count--) { + AK_DEBUG_OBJ(found); + // Object arrays contains pointers to PyObjects, so we will only temporarily + // look at the reference here. + memcpy(&obj_ref, data, sizeof(obj_ref)); + + // 5. Assign into result whether or not the element exists in the set + int in_set = PySequence_Contains(found, obj_ref); + if (in_set == -1) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + else if (in_set == 0) { + Py_INCREF(obj_ref); + int add_success = PySet_Add(found, obj_ref); + Py_DECREF(obj_ref); + if (add_success == -1) { + Py_DECREF(is_dup); + Py_DECREF(found); + NpyIter_Deallocate(iter); + return NULL; + } + } + else { + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + } + + data += stride; + i += 1; + } + + } while (iternext(iter)); + + Py_DECREF(found); + NpyIter_Deallocate(iter); + + return (PyObject*)is_dup; +} + static PyObject * array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { - return NULL; - // PyObject *array; - // PyObject *memo = NULL; - // if (!PyArg_ParseTupleAndKeywords(args, kwargs, - // "O|O!:array_to_duplicated_hashable", array_deepcopy_kwarg_names, - // &array, - // &PyDict_Type, &memo)) { - // return NULL; - // } - // AK_CHECK_NUMPY_ARRAY(array); - // return AK_ArrayDeepCopy((PyArrayObject*)array, memo); + PyArrayObject *array = NULL; + int axis = 0; + int exclude_first = 0; + int exclude_last = 0; + + static char *kwarg_list[] = {"array", "axis", "exclude_first", "exclude_last", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "O!|iii:array_to_duplicated_hashable", kwarg_list, + &PyArray_Type, &array, + &axis, + &exclude_first, + &exclude_last)) + { + return NULL; + } + + if (PyArray_DESCR(array)->kind != 'O') { + PyErr_SetString(PyExc_ValueError, "Array must have object dtype"); + return NULL; + } + + int size; + int ndim = PyArray_NDIM(array); + + if (ndim == 1) { + size = PyArray_DIM(array, 0); + } + else { + if (axis > 1) { + return NULL; + } + size = PyArray_DIM(array, size); + } + + npy_intp dims = {size}; + PyArrayObject *is_dup = PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); + + if (exclude_first && !exclude_last) { + return AK_array_to_duplicated_hashable_no_constraints(array, is_dup); + } + + return AK_array_to_duplicated_hashable_with_constraints(array, is_dup); } //------------------------------------------------------------------------------ From 024a7b927ac66b006d813784869897c43a795d45 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 3 Jun 2021 13:22:00 -0700 Subject: [PATCH 04/18] Another iteration to support 1d arrays with constraints. --- debug.py | 12 +++-- src/_arraykit.c | 133 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 119 insertions(+), 26 deletions(-) diff --git a/debug.py b/debug.py index 1a75ccef..8a60ff83 100644 --- a/debug.py +++ b/debug.py @@ -65,12 +65,18 @@ def new( return is_dupe -arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3]) +def test(*args, **kwargs): + assert (new(*args, **kwargs) == array_to_duplicated_hashable(*args, **kwargs)).all(), (args, kwargs) + + +arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3, 2, -1, -233, 'aslkj', 'df', 'df', True, True, None, 1]) #array_to_duplicated_hashable(np.arange(5)) #array_to_duplicated_hashable(np.arange(5), 213) #array_to_duplicated_hashable(np.arange(5), 1) #array_to_duplicated_hashable(np.arange(5), 1, True) #array_to_duplicated_hashable(np.arange(5), 1, 123) #array_to_duplicated_hashable(np.arange(5), 1, True) -x = array_to_duplicated_hashable(arr, 1, True, False) -print(x) +test(arr, 0, True, False) +test(arr, 0, False, False) +test(arr, 0, False, True) +test(arr, 0, True, True) diff --git a/src/_arraykit.c b/src/_arraykit.c index 0ff2dd2a..f6adc298 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -548,13 +548,12 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje int i = 0; while (count--) { - AK_DEBUG_OBJ(found); // Object arrays contains pointers to PyObjects, so we will only temporarily // look at the reference here. memcpy(&obj_ref, data, sizeof(obj_ref)); // 5. Assign into result whether or not the element exists in the set - int in_set = PySequence_Contains(found, obj_ref); + int in_set = PySet_Contains(found, obj_ref); if (in_set == -1) { Py_DECREF(is_dup); Py_DECREF(found); @@ -589,19 +588,29 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje } static PyObject * -AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayObject *is_dup) +AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayObject *is_dup, + int exclude_first, int exclude_last) { /* Rougly equivalent Python code - found = set() + first_unique_locations = {} + last_duplicate_locations = {} for idx, v in enumerate(array): - if v not in found: - found.add(v) + if v not in first_unique_locations: + first_unique_locations[v] = idx else: is_dupe[idx] = True + if v not in last_duplicate_locations and not exclude_first: + is_dupe[first_unique_locations[v]] = True + + last_duplicate_locations[v] = idx + + if exclude_last: # overwrite with False + is_dupe[list(last_duplicate_locations.values())] = False + return is_dupe */ PyObject *first_unique_locations = PyDict_New(); @@ -617,22 +626,23 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb return NULL; } - NpyIter *iter = NpyIter_New(array, NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_KEEPORDER, NPY_NO_CASTING, NULL); if (!iter) { + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); Py_DECREF(is_dup); - Py_DECREF(found); return NULL; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); if (!iternext) { Py_DECREF(is_dup); - Py_DECREF(found); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); NpyIter_Deallocate(iter); return NULL; } @@ -646,36 +656,97 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb npy_intp stride = *strideptr; npy_intp count = *sizeptr; - PyObject* obj_ref = NULL; + PyObject* val = NULL; int i = 0; while (count--) { - AK_DEBUG_OBJ(found); // Object arrays contains pointers to PyObjects, so we will only temporarily // look at the reference here. - memcpy(&obj_ref, data, sizeof(obj_ref)); + memcpy(&val, data, sizeof(val)); + + // AK_DEBUG_OBJ(val); + // AK_DEBUG_OBJ(first_unique_locations); + // AK_DEBUG_OBJ(last_duplicate_locations); // 5. Assign into result whether or not the element exists in the set - int in_set = PySequence_Contains(found, obj_ref); - if (in_set == -1) { + int in_dict = PyDict_Contains(first_unique_locations, val); + if (in_dict == -1) { Py_DECREF(is_dup); - Py_DECREF(found); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); NpyIter_Deallocate(iter); return NULL; } - else if (in_set == 0) { - Py_INCREF(obj_ref); - int add_success = PySet_Add(found, obj_ref); - Py_DECREF(obj_ref); - if (add_success == -1) { + else if (in_dict == 0) { + PyObject *idx = PyLong_FromLong(i); + if (!idx) { Py_DECREF(is_dup); - Py_DECREF(found); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + + Py_INCREF(val); + int set_success = PyDict_SetItem(first_unique_locations, val, idx); + Py_DECREF(val); + Py_DECREF(idx); + if (set_success == -1) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); NpyIter_Deallocate(iter); return NULL; } } else { *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + + in_dict = PyDict_Contains(last_duplicate_locations, val); + if (in_dict == -1) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + else if (in_dict == 0 && !exclude_first) { + PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, val); + if (!first_unique_location) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + + int idx = PyLong_AsLong(first_unique_location); + if (idx == -1) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + + *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_TRUE; + } + + PyObject *idx = PyLong_FromLong(i); + if (!idx) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + + Py_INCREF(val); + int set_success = PyDict_SetItem(last_duplicate_locations, val, idx); + Py_DECREF(val); + Py_DECREF(idx); + if (set_success == -1) { + Py_DECREF(is_dup); + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + return NULL; + } } data += stride; @@ -684,7 +755,23 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb } while (iternext(iter)); - Py_DECREF(found); + if (exclude_last) { + PyObject *value = NULL; + Py_ssize_t pos = 0; + while (PyDict_Next(last_duplicate_locations, &pos, NULL, &value)) { + long idx = PyLong_AsLong(value); + if (idx == -1) { + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); + NpyIter_Deallocate(iter); + } + + *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_FALSE; + } + } + + Py_DECREF(first_unique_locations); + Py_DECREF(last_duplicate_locations); NpyIter_Deallocate(iter); return (PyObject*)is_dup; @@ -735,7 +822,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k return AK_array_to_duplicated_hashable_no_constraints(array, is_dup); } - return AK_array_to_duplicated_hashable_with_constraints(array, is_dup); + return AK_array_to_duplicated_hashable_with_constraints(array, is_dup, exclude_first, exclude_last); } //------------------------------------------------------------------------------ From 27413a1e3735eb82e70e3ca53c6f96e47302f4e6 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 3 Jun 2021 14:56:43 -0700 Subject: [PATCH 05/18] Cleans up error handling and optimizes debug python reference code. --- debug.py | 55 ++++++++++----- src/_arraykit.c | 173 +++++++++++++++--------------------------------- 2 files changed, 94 insertions(+), 134 deletions(-) diff --git a/debug.py b/debug.py index 8a60ff83..10037501 100644 --- a/debug.py +++ b/debug.py @@ -32,35 +32,56 @@ def new( is_dupe = np.full(size, False) - if exclude_first and not exclude_last: + if exclude_first ^ exclude_last: # Optimize for route requiring least amount of data structure found = set() + if exclude_first: + for idx, v in enumerate(value_source): + if v not in found: + found.add(v) + else: + is_dupe[idx] = True + else: + for idx, v in reversed(list(enumerate(value_source))): + if v not in found: + found.add(v) + else: + is_dupe[idx] = True + + return is_dupe + + elif not exclude_first and not exclude_last: + first_unique_locations = {} + duplicates = set() + for idx, v in enumerate(value_source): - if v not in found: - found.add(v) + if v not in first_unique_locations: + first_unique_locations[v] = idx else: is_dupe[idx] = True - return is_dupe + # Second time seeing a duplicate + if v not in duplicates: + is_dupe[first_unique_locations[v]] = True - first_unique_locations = {} - last_duplicate_locations = {} + # always update last + duplicates.add(v) - for idx, v in enumerate(value_source): - if v not in first_unique_locations: - first_unique_locations[v] = idx - else: - is_dupe[idx] = True + else: + seen = set() + last_duplicate_locations = {} - if v not in last_duplicate_locations and not exclude_first: - is_dupe[first_unique_locations[v]] = True + for idx, v in enumerate(value_source): + if v not in seen: + seen.add(v) + else: + is_dupe[idx] = True - # always update last - last_duplicate_locations[v] = idx + # always update last + last_duplicate_locations[v] = idx - if exclude_last: # overwrite with False is_dupe[list(last_duplicate_locations.values())] = False return is_dupe @@ -76,7 +97,9 @@ def test(*args, **kwargs): #array_to_duplicated_hashable(np.arange(5), 1, True) #array_to_duplicated_hashable(np.arange(5), 1, 123) #array_to_duplicated_hashable(np.arange(5), 1, True) + test(arr, 0, True, False) test(arr, 0, False, False) test(arr, 0, False, True) test(arr, 0, True, True) +print('Done') diff --git a/src/_arraykit.c b/src/_arraykit.c index f6adc298..883c847e 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -499,19 +499,19 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje /* Rougly equivalent Python code - found = set() + seen = set() for idx, v in enumerate(array): - if v not in found: - found.add(v) + if v not in seen: + seen.add(v) else: - is_dupe[idx] = True + is_dup[idx] = True - return is_dupe + return is_dup */ // This path is optimized to only construct a single set - PyObject *found = PySet_New(NULL); - if (!found) { + PyObject *seen = PySet_New(NULL); + if (!seen) { Py_DECREF(is_dup); return NULL; } @@ -521,19 +521,10 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje NPY_KEEPORDER, NPY_NO_CASTING, NULL); - if (!iter) { - Py_DECREF(is_dup); - Py_DECREF(found); - return NULL; - } + if (!iter) { goto failure; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - Py_DECREF(is_dup); - Py_DECREF(found); - NpyIter_Deallocate(iter); - return NULL; - } + if (!iternext) { goto failure; } char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); @@ -544,32 +535,21 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje npy_intp stride = *strideptr; npy_intp count = *sizeptr; - PyObject* obj_ref = NULL; + PyObject* value = NULL; int i = 0; while (count--) { // Object arrays contains pointers to PyObjects, so we will only temporarily // look at the reference here. - memcpy(&obj_ref, data, sizeof(obj_ref)); + memcpy(&value, data, sizeof(value)); // 5. Assign into result whether or not the element exists in the set - int in_set = PySet_Contains(found, obj_ref); - if (in_set == -1) { - Py_DECREF(is_dup); - Py_DECREF(found); - NpyIter_Deallocate(iter); - return NULL; - } - else if (in_set == 0) { - Py_INCREF(obj_ref); - int add_success = PySet_Add(found, obj_ref); - Py_DECREF(obj_ref); - if (add_success == -1) { - Py_DECREF(is_dup); - Py_DECREF(found); - NpyIter_Deallocate(iter); - return NULL; - } + int found = PySet_Contains(seen, value); + if (found == -1) { goto failure; } + + else if (found == 0) { + int add_success = PySet_Add(seen, value); + if (add_success == -1) { goto failure; } } else { *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; @@ -581,10 +561,18 @@ AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObje } while (iternext(iter)); - Py_DECREF(found); NpyIter_Deallocate(iter); + Py_DECREF(seen); return (PyObject*)is_dup; + +failure: + if (iter != NULL) { + NpyIter_Deallocate(iter); + } + Py_DECREF(seen); + Py_DECREF(is_dup); + return NULL; } static PyObject * @@ -613,6 +601,7 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb return is_dupe */ + // Contains first location for all unique values. len(first_unique_locations) == len(set(array)) PyObject *first_unique_locations = PyDict_New(); if (!first_unique_locations) { Py_DECREF(is_dup); @@ -631,21 +620,10 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb NPY_KEEPORDER, NPY_NO_CASTING, NULL); - if (!iter) { - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - Py_DECREF(is_dup); - return NULL; - } + if (!iter) { goto failure; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - return NULL; - } + if (!iternext) { goto failure; } char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); @@ -664,89 +642,42 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb // look at the reference here. memcpy(&val, data, sizeof(val)); - // AK_DEBUG_OBJ(val); - // AK_DEBUG_OBJ(first_unique_locations); - // AK_DEBUG_OBJ(last_duplicate_locations); - // 5. Assign into result whether or not the element exists in the set int in_dict = PyDict_Contains(first_unique_locations, val); - if (in_dict == -1) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - return NULL; - } + + if (in_dict == -1) { goto failure; } + else if (in_dict == 0) { PyObject *idx = PyLong_FromLong(i); - if (!idx) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (!idx) { goto failure; } - Py_INCREF(val); int set_success = PyDict_SetItem(first_unique_locations, val, idx); - Py_DECREF(val); Py_DECREF(idx); - if (set_success == -1) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - return NULL; - } + if (set_success == -1) { goto failure; } + } else { *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; in_dict = PyDict_Contains(last_duplicate_locations, val); - if (in_dict == -1) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (in_dict == -1) { goto failure; } + else if (in_dict == 0 && !exclude_first) { PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, val); - if (!first_unique_location) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (!first_unique_location) { goto failure; } int idx = PyLong_AsLong(first_unique_location); - if (idx == -1) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (idx == -1) { goto failure; } *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_TRUE; } PyObject *idx = PyLong_FromLong(i); - if (!idx) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (!idx) { goto failure; } - Py_INCREF(val); int set_success = PyDict_SetItem(last_duplicate_locations, val, idx); - Py_DECREF(val); Py_DECREF(idx); - if (set_success == -1) { - Py_DECREF(is_dup); - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - return NULL; - } + if (set_success == -1) { goto failure; } } data += stride; @@ -756,25 +687,31 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb } while (iternext(iter)); if (exclude_last) { - PyObject *value = NULL; + PyObject *value = NULL; // Borrowed Py_ssize_t pos = 0; + while (PyDict_Next(last_duplicate_locations, &pos, NULL, &value)) { long idx = PyLong_AsLong(value); - if (idx == -1) { - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); - NpyIter_Deallocate(iter); - } + if (idx == -1) { goto failure; } // -1 always means failure since no locations are negative *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_FALSE; } } - Py_DECREF(first_unique_locations); - Py_DECREF(last_duplicate_locations); NpyIter_Deallocate(iter); + Py_DECREF(last_duplicate_locations); + Py_DECREF(first_unique_locations); return (PyObject*)is_dup; + +failure: + if (iter != NULL) { + NpyIter_Deallocate(iter); + } + Py_DECREF(last_duplicate_locations); + Py_DECREF(first_unique_locations); + Py_DECREF(is_dup); + return NULL; } static PyObject * From 6a285c9986b89e2ffac7e676b5484bf34b28b070 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 3 Jun 2021 18:01:15 -0700 Subject: [PATCH 06/18] Prepares code for function delegation to abstract iteration from processing. --- debug.py | 160 ++++++++++++++++++++++++++++++++---------------- src/_arraykit.c | 140 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 53 deletions(-) diff --git a/debug.py b/debug.py index 10037501..852a85e2 100644 --- a/debug.py +++ b/debug.py @@ -7,6 +7,78 @@ def __init__(self, v) -> None: def __repr__(self) -> str: return f'PO<{self.v}>' +def iterate_1d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj): + if reverse: + iterator = reversed(array) + else: + iterator = array + + size = len(array) + + for i, value in enumerate(iterator): + if reverse: + i = size - i - 1 + + process_value_func(i, value, is_dupe, set_obj, dict_obj) + + +def iterate_2d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj): + size = array.shape[axis] + + if axis == 0: + iterator = array + else: + iterator = array.T + + if reverse: + iterator = reversed(iterator) + + for i, value in enumerate(map(tuple, iterator)): + if reverse: + i = size - i - 1 + + process_value_func(i, value, is_dupe, set_obj, dict_obj) + + +def handle_value_one_boundary(i, value, is_dupe, set_obj, dict_obj): + seen = set_obj + assert dict_obj == None + + if value not in seen: + seen.add(value) + else: + is_dupe[i] = True + + +def handle_value_exclude_boundaries(i, value, is_dupe, set_obj, dict_obj): + duplicates = set_obj + first_unique_locations = dict_obj + + if value not in first_unique_locations: + first_unique_locations[value] = i + else: + is_dupe[i] = True + + # Second time seeing a duplicate + if value not in duplicates: + is_dupe[first_unique_locations[value]] = True + + # always update last + duplicates.add(value) + + +def handle_value_include_boundaries(i, value, is_dupe, set_obj, dict_obj): + seen = set_obj + last_duplicate_locations = dict_obj + + if value not in seen: + seen.add(value) + else: + is_dupe[i] = True + + # always update last + last_duplicate_locations[value] = i + def new( array: np.ndarray, @@ -22,70 +94,36 @@ def new( ''' size = array.shape[axis] + reverse = not exclude_first and exclude_last + if array.ndim == 1: - value_source = array + iterate_func = iterate_1d else: - if axis == 0: - value_source = map(tuple, array) - else: - value_source = map(tuple, array.T) + iterate_func = iterate_2d is_dupe = np.full(size, False) + set_obj = set() if exclude_first ^ exclude_last: - # Optimize for route requiring least amount of data structure - - found = set() - - if exclude_first: - for idx, v in enumerate(value_source): - if v not in found: - found.add(v) - else: - is_dupe[idx] = True - else: - for idx, v in reversed(list(enumerate(value_source))): - if v not in found: - found.add(v) - else: - is_dupe[idx] = True - - return is_dupe + dict_obj = None + process_value_func = handle_value_one_boundary elif not exclude_first and not exclude_last: - first_unique_locations = {} - duplicates = set() - - for idx, v in enumerate(value_source): - if v not in first_unique_locations: - first_unique_locations[v] = idx - else: - is_dupe[idx] = True - - # Second time seeing a duplicate - if v not in duplicates: - is_dupe[first_unique_locations[v]] = True - - # always update last - duplicates.add(v) + dict_obj = dict() + process_value_func = handle_value_exclude_boundaries else: - seen = set() - last_duplicate_locations = {} - - for idx, v in enumerate(value_source): - if v not in seen: - seen.add(v) - else: - is_dupe[idx] = True + dict_obj = dict() + process_value_func = handle_value_include_boundaries - # always update last - last_duplicate_locations[v] = idx + iterate_func(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj) - is_dupe[list(last_duplicate_locations.values())] = False + if exclude_first and exclude_last: + is_dupe[list(dict_obj.values())] = False return is_dupe + def test(*args, **kwargs): assert (new(*args, **kwargs) == array_to_duplicated_hashable(*args, **kwargs)).all(), (args, kwargs) @@ -98,8 +136,24 @@ def test(*args, **kwargs): #array_to_duplicated_hashable(np.arange(5), 1, 123) #array_to_duplicated_hashable(np.arange(5), 1, True) -test(arr, 0, True, False) -test(arr, 0, False, False) -test(arr, 0, False, True) -test(arr, 0, True, True) +if False: + test(arr, 0, True, False) + test(arr, 0, False, False) + test(arr, 0, False, True) + test(arr, 0, True, True) + + +array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object), 0) +print() +array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object), 1) +print() +print() + + +array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object).T, 0) +print() +array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object).T, 1) +print() + + print('Done') diff --git a/src/_arraykit.c b/src/_arraykit.c index 883c847e..c3b77e9f 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -714,6 +714,130 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb return NULL; } +static void +AK_func_1(PyObject *value) +{ + AK_DEBUG_OBJ(value); +} + +static void +AK_func_2(PyObject *value) +{ + AK_DEBUG_OBJ(value); +} + +static void +AK_iter_1d_array(PyArrayObject *array) +{ + NpyIter *iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + if (!iter) { goto failure; } + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { goto failure; } + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + do { + char *data = *dataptr; + npy_intp stride = *strideptr; + npy_intp count = *sizeptr; + + PyObject* value = NULL; + + while (count--) { + memcpy(&value, data, sizeof(value)); + AK_DEBUG_OBJ(value); + data += stride; + } + } while (iternext(iter)); + + NpyIter_Deallocate(iter); + return; + +failure: + if (iter != NULL) { + NpyIter_Deallocate(iter); + } +} + +static void +AK_iter_2d_array(PyArrayObject *array, int axis, void (*value_func)(PyObject*)) +{ + int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; + int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; + + NpyIter *iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, + order_flags, + NPY_NO_CASTING, + NULL); + if (!iter) { goto failure; } + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { goto failure; } + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + + npy_intp tuple_size = PyArray_DIM(array, !axis); + npy_intp num_tuples = PyArray_DIM(array, axis); + + do { + char *data = *dataptr; + npy_intp stride = *strideptr; + + PyObject* value = NULL; + + // When the axis doesn't align with the ordering, it means the npy iterator goes one-element at a time. + // Otherwise, it does a strided loop through the non-contiguous axis + if (is_c_order != axis) { + // Do-while is one loop through all elements. + for (int i = 0; i < num_tuples; ++i) { + PyObject *tup = PyTuple_New(tuple_size); + if (!tup) { goto failure; } + + for (int j = 0; j < tuple_size; ++j) { + memcpy(&value, data, sizeof(value)); + Py_INCREF(value); + PyTuple_SET_ITEM(tup, j, value); + data += stride; + } + value_func(tup); + Py_DECREF(tup); + } + } + else { + PyObject *tup = PyTuple_New(tuple_size); + if (!tup) { goto failure; } + + // Each do-while loop strides over another column + for (int i = 0; i < tuple_size; ++i) { + memcpy(&value, data, sizeof(value)); + Py_INCREF(value); + PyTuple_SET_ITEM(tup, i, value); + data += stride; + } + value_func(tup); + Py_DECREF(tup); + } + + } while (iternext(iter)); + + NpyIter_Deallocate(iter); + return; + +failure: + if (iter != NULL) { + NpyIter_Deallocate(iter); + } +} + static PyObject * array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { @@ -739,22 +863,38 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k return NULL; } + void (*func1_ptr)(PyObject*) = AK_func_1; + void (*func2_ptr)(PyObject*) = AK_func_2; + + // AK_iter_1d_array(array); + AK_iter_2d_array(array, axis, AK_func_1); + AK_iter_2d_array(array, axis, AK_func_2); + Py_RETURN_NONE; + int size; int ndim = PyArray_NDIM(array); + void (*iterate_array_func)(PyArrayObject*, PyArrayObject*) = NULL; + if (ndim == 1) { + iterate_array_func = AK_iter_1d_array; size = PyArray_DIM(array, 0); } else { if (axis > 1) { return NULL; } + iterate_array_func = AK_iter_2d_array; size = PyArray_DIM(array, size); } npy_intp dims = {size}; PyArrayObject *is_dup = PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); + void (*process_hashable_func)(PyArrayObject*, PyArrayObject*) = NULL; + + + if (exclude_first && !exclude_last) { return AK_array_to_duplicated_hashable_no_constraints(array, is_dup); } From 4fd0b4319261bd99a82f4ddea7c5be4fde02c12a Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 10:38:18 -0700 Subject: [PATCH 07/18] Updates C to abstract iteration from process. (Untested). --- debug.py | 30 ++++----- src/_arraykit.c | 164 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 143 insertions(+), 51 deletions(-) diff --git a/debug.py b/debug.py index 852a85e2..425a1ab4 100644 --- a/debug.py +++ b/debug.py @@ -7,7 +7,7 @@ def __init__(self, v) -> None: def __repr__(self) -> str: return f'PO<{self.v}>' -def iterate_1d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj): +def iterate_1d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): if reverse: iterator = reversed(array) else: @@ -19,10 +19,10 @@ def iterate_1d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_ if reverse: i = size - i - 1 - process_value_func(i, value, is_dupe, set_obj, dict_obj) + process_value_func(i, value, is_dup, set_obj, dict_obj) -def iterate_2d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj): +def iterate_2d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): size = array.shape[axis] if axis == 0: @@ -37,44 +37,44 @@ def iterate_2d(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_ if reverse: i = size - i - 1 - process_value_func(i, value, is_dupe, set_obj, dict_obj) + process_value_func(i, value, is_dup, set_obj, dict_obj) -def handle_value_one_boundary(i, value, is_dupe, set_obj, dict_obj): +def handle_value_one_boundary(i, value, is_dup, set_obj, dict_obj): seen = set_obj assert dict_obj == None if value not in seen: seen.add(value) else: - is_dupe[i] = True + is_dup[i] = True -def handle_value_exclude_boundaries(i, value, is_dupe, set_obj, dict_obj): +def handle_value_exclude_boundaries(i, value, is_dup, set_obj, dict_obj): duplicates = set_obj first_unique_locations = dict_obj if value not in first_unique_locations: first_unique_locations[value] = i else: - is_dupe[i] = True + is_dup[i] = True # Second time seeing a duplicate if value not in duplicates: - is_dupe[first_unique_locations[value]] = True + is_dup[first_unique_locations[value]] = True # always update last duplicates.add(value) -def handle_value_include_boundaries(i, value, is_dupe, set_obj, dict_obj): +def handle_value_include_boundaries(i, value, is_dup, set_obj, dict_obj): seen = set_obj last_duplicate_locations = dict_obj if value not in seen: seen.add(value) else: - is_dupe[i] = True + is_dup[i] = True # always update last last_duplicate_locations[value] = i @@ -101,7 +101,7 @@ def new( else: iterate_func = iterate_2d - is_dupe = np.full(size, False) + is_dup = np.full(size, False) set_obj = set() if exclude_first ^ exclude_last: @@ -116,12 +116,12 @@ def new( dict_obj = dict() process_value_func = handle_value_include_boundaries - iterate_func(array, axis, reverse, is_dupe, process_value_func, set_obj, dict_obj) + iterate_func(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj) if exclude_first and exclude_last: - is_dupe[list(dict_obj.values())] = False + is_dup[list(dict_obj.values())] = False - return is_dupe + return is_dup def test(*args, **kwargs): diff --git a/src/_arraykit.c b/src/_arraykit.c index c3b77e9f..f24be641 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -714,21 +714,47 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb return NULL; } -static void -AK_func_1(PyObject *value) -{ - AK_DEBUG_OBJ(value); -} +typedef int (*AK_handle_value_func)(int, // i + PyObject*, // value + PyArrayObject*, // is_dup + PyObject*, // set_obj + PyObject* // dict_obj +); +typedef int (*AK_iterate_np_func)(PyArrayObject*, // array + int, // axis + int, // reverse + PyArrayObject*, // is_dup + AK_handle_value_func, // handle_value_func + PyObject*, // set_obj + PyObject* // dict_obj +); -static void -AK_func_2(PyObject *value) +static int +AK_handle_value_one_boundary(int i, PyObject *value, PyArrayObject *is_dup, + PyObject *set_obj, PyObject *dict_obj) { - AK_DEBUG_OBJ(value); + PyObject *seen = set_obj; // Meaningful name alias + assert(dict_obj == NULL); + + int found = PySet_Contains(seen, value); + if (found == -1) { return -1; } + + else if (found == 0) { + int add_success = PySet_Add(seen, value); + if (add_success == -1) { return -1; } + } + else { + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + } + + return 0; } -static void -AK_iter_1d_array(PyArrayObject *array) +static int +AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, + AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { + assert(axis == 0); NpyIter *iter = NpyIter_New(array, NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_KEEPORDER, @@ -750,24 +776,42 @@ AK_iter_1d_array(PyArrayObject *array) PyObject* value = NULL; + int i = 0; + int step = 1; + + if (reverse) { + data += (stride * count); + stride = -stride; + i = count; + step = -1; + } + while (count--) { memcpy(&value, data, sizeof(value)); - AK_DEBUG_OBJ(value); + + // Process the value! + if (!value_func(i, value, is_dup, set_obj, dict_obj)) { + goto failure; + } + data += stride; + i += step; } } while (iternext(iter)); NpyIter_Deallocate(iter); - return; + return 0; failure: if (iter != NULL) { NpyIter_Deallocate(iter); } + return -1; } -static void -AK_iter_2d_array(PyArrayObject *array, int axis, void (*value_func)(PyObject*)) +static int +AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, + AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; @@ -788,6 +832,9 @@ AK_iter_2d_array(PyArrayObject *array, int axis, void (*value_func)(PyObject*)) npy_intp tuple_size = PyArray_DIM(array, !axis); npy_intp num_tuples = PyArray_DIM(array, axis); + int idx = 0; + int step = 1; + do { char *data = *dataptr; npy_intp stride = *strideptr; @@ -808,8 +855,11 @@ AK_iter_2d_array(PyArrayObject *array, int axis, void (*value_func)(PyObject*)) PyTuple_SET_ITEM(tup, j, value); data += stride; } - value_func(tup); + + int success = value_func(idx, value, is_dup, set_obj, dict_obj); Py_DECREF(tup); + if (!success) { goto failure; } + idx += step; } } else { @@ -823,19 +873,23 @@ AK_iter_2d_array(PyArrayObject *array, int axis, void (*value_func)(PyObject*)) PyTuple_SET_ITEM(tup, i, value); data += stride; } - value_func(tup); + + int success = value_func(idx, value, is_dup, set_obj, dict_obj); Py_DECREF(tup); + if (!success) { goto failure; } + idx += step; } } while (iternext(iter)); NpyIter_Deallocate(iter); - return; + return 0; failure: if (iter != NULL) { NpyIter_Deallocate(iter); } + return -1; } static PyObject * @@ -863,43 +917,81 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k return NULL; } - void (*func1_ptr)(PyObject*) = AK_func_1; - void (*func2_ptr)(PyObject*) = AK_func_2; - - // AK_iter_1d_array(array); - AK_iter_2d_array(array, axis, AK_func_1); - AK_iter_2d_array(array, axis, AK_func_2); - Py_RETURN_NONE; - - int size; int ndim = PyArray_NDIM(array); + if (axis > 1 || (ndim == 1 && axis == 1)) { + PyErr_SetString(PyExc_ValueError, "Axis must be 0 or 1 for 2d, and 0 for 1d"); + return NULL; + } - void (*iterate_array_func)(PyArrayObject*, PyArrayObject*) = NULL; + int size = PyArray_DIM(array, axis); + int reverse = !exclude_first && exclude_last; + + AK_handle_value_func handle_value_func = NULL; + AK_iterate_np_func iterate_array_func = NULL; if (ndim == 1) { iterate_array_func = AK_iter_1d_array; - size = PyArray_DIM(array, 0); } else { - if (axis > 1) { - return NULL; - } iterate_array_func = AK_iter_2d_array; - size = PyArray_DIM(array, size); } npy_intp dims = {size}; PyArrayObject *is_dup = PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); - void (*process_hashable_func)(PyArrayObject*, PyArrayObject*) = NULL; + PyObject *set_obj = PySet_New(NULL); + if (!set_obj) { + return NULL; + } + PyObject *dict_obj = NULL; + if (exclude_first ^ exclude_last) { + handle_value_func = AK_handle_value_one_boundary; + } + else { + dict_obj = PyDict_New(); + if (!dict_obj) { + goto failure; + } - if (exclude_first && !exclude_last) { - return AK_array_to_duplicated_hashable_no_constraints(array, is_dup); + if (!exclude_first && !exclude_last) { + handle_value_func = AK_handle_value_one_boundary; // AK_handle_value_exclude_boundaries + } + else { + handle_value_func = AK_handle_value_one_boundary; // AK_handle_value_include_boundaries + } } - return AK_array_to_duplicated_hashable_with_constraints(array, is_dup, exclude_first, exclude_last); + if (iterate_array_func(array, axis, reverse, is_dup, handle_value_func, set_obj, dict_obj)) { + goto failure; + } + + if (exclude_first && exclude_last) { + // is_dup[list(dict_obj.values())] = False + + PyObject *value = NULL; // Borrowed + Py_ssize_t pos = 0; + + while (PyDict_Next(dict_obj, &pos, NULL, &value)) { + long idx = PyLong_AsLong(value); + if (idx == -1) { + goto failure; // -1 always means failure since no locations are negative + } + Py_DECREF(value); + + *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_FALSE; + } + } + + Py_XDECREF(dict_obj); + Py_DECREF(set_obj); + return (PyObject *)is_dup; + +failure: + Py_XDECREF(dict_obj); + Py_DECREF(set_obj); + return NULL; } //------------------------------------------------------------------------------ From c8149293079e61494f283f660f186fa602529824 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 12:33:59 -0700 Subject: [PATCH 08/18] Gets 1d iteration fully working for all paths, including reverse! Extends debug.py --- debug.py | 69 ++++++++++++++++++ src/_arraykit.c | 185 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 237 insertions(+), 17 deletions(-) diff --git a/debug.py b/debug.py index 425a1ab4..860225cb 100644 --- a/debug.py +++ b/debug.py @@ -124,6 +124,75 @@ def new( return is_dup +def dprint(*args, debug): + '''Debug print''' + if debug: + print(*args) + + +def test(*args, debug=True): + dprint(args[1:], debug=debug) + o = new(*args); dprint('python:', o, debug=debug) + n = array_to_duplicated_hashable(*args); dprint('c :', n, debug=debug) + assert (n == o).all() + + +def test_1d(debug=True): + arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object) + + # Test with normally constructed array + test(arr, 0, True, True, debug=debug) # include_boundaries + test(arr, 0, True, False, debug=debug) # one_boundary (normal) + test(arr, 0, False, True, debug=debug) # one_boundary (reverse) + test(arr, 0, False, False, debug=debug) # exclude_boundaries + + arr2d = np.array([[2, 1, 2], + [3, 2, 3], + [3, 2, 3], + [2, 1, 2], + [4, 3, 4], + [3, 2, 3], + [6, 6, 6]], dtype=object) + + # Test with array slices + test(arr2d[:, 1], 0, True, True, debug=debug) + test(arr2d[:, 1], 0, True, False, debug=debug) + test(arr2d[:, 1], 0, False, True, debug=debug) + test(arr2d[:, 1], 0, False, False, debug=debug) + + test(arr2d.T[1], 0, True, True, debug=debug) + test(arr2d.T[1], 0, True, False, debug=debug) + test(arr2d.T[1], 0, False, True, debug=debug) + test(arr2d.T[1], 0, False, False, debug=debug) + + +def test_2d(debug=True): + arr2d = np.array([ + [1, 2, 2, 1, 3, 2, 6], + [2, 3, 3, 2, 4, 3, 6], + [2, 3, 3, 2, 4, 3, 6], + [1, 2, 2, 1, 3, 2, 6], + [3, 4, 4, 3, 5, 4, 6], + [2, 3, 3, 2, 4, 3, 6], + ], dtype=object) + + test(arr2d, 0, True, True) + test(arr2d, 0, False, True) + test(arr2d, 0, True, False) + test(arr2d, 0, False, False) + + test(arr2d, 1, True, True) + test(arr2d, 1, False, True) + test(arr2d, 1, True, False) + test(arr2d, 1, False, False) + + +test_1d(debug=False) + +exit(0) + + + def test(*args, **kwargs): assert (new(*args, **kwargs) == array_to_duplicated_hashable(*args, **kwargs)).all(), (args, kwargs) diff --git a/src/_arraykit.c b/src/_arraykit.c index f24be641..1c262f37 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -493,6 +493,8 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) //------------------------------------------------------------------------------ // duplication +// These two methods are defunct. + static PyObject * AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObject *is_dup) { @@ -714,12 +716,15 @@ AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayOb return NULL; } +// Defines how to process a hashable value. typedef int (*AK_handle_value_func)(int, // i PyObject*, // value PyArrayObject*, // is_dup PyObject*, // set_obj PyObject* // dict_obj ); + +// Defines how to iterate over an arbitrary numpy (object) array typedef int (*AK_iterate_np_func)(PyArrayObject*, // array int, // axis int, // reverse @@ -729,31 +734,173 @@ typedef int (*AK_iterate_np_func)(PyArrayObject*, // array PyObject* // dict_obj ); +// Value processing funcs + static int AK_handle_value_one_boundary(int i, PyObject *value, PyArrayObject *is_dup, PyObject *set_obj, PyObject *dict_obj) { + /* + Used when the first duplicated element is considered unique. + + If exclude_first && !exclude_last, we walk from left to right + If !exclude_first && exclude_last, we walk from right to left + + Rougly equivalent Python: + + if value not in seen: + seen.add(value) + else: + is_dup[i] = True + */ PyObject *seen = set_obj; // Meaningful name alias assert(dict_obj == NULL); int found = PySet_Contains(seen, value); - if (found == -1) { return -1; } - - else if (found == 0) { - int add_success = PySet_Add(seen, value); - if (add_success == -1) { return -1; } + if (found == -1) { + return -1; } - else { - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + + if (found == 0) { + return PySet_Add(seen, value); // -1 on failure, 0 on success } + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; return 0; } +static int +AK_handle_value_include_boundaries(int i, PyObject *value, PyArrayObject *is_dup, + PyObject *set_obj, PyObject *dict_obj) +{ + /* + Used when the first & last instances of duplicated values are considered unique + + Rougly equivalent Python: + + if value not in seen: + seen.add(value) + else: + is_dup[i] = True + + # Keep track of last observed location, so we can mark it False (i.e. unique) at the end + last_duplicate_locations[value] = i + */ + PyObject *seen = set_obj; // Meaningful name alias + PyObject *last_duplicate_locations = dict_obj; // Meaningful name alias + + int found = PySet_Contains(seen, value); + if (found == -1) { + return -1; + } + + if (found == 0) { + return PySet_Add(seen, value); // -1 on failure, 0 on success + } + + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + + PyObject *idx = PyLong_FromLong(i); + if (!idx) { return -1; } + + int set_success = PyDict_SetItem(last_duplicate_locations, value, idx); + Py_DECREF(idx); + return set_success; // -1 on failure, 0 on success +} + +static int +AK_handle_value_exclude_boundaries(int i, PyObject *value, PyArrayObject *is_dup, + PyObject *set_obj, PyObject *dict_obj) +{ + /* + Used when the first & last instances of duplicated values are considered duplicated + + Rougly equivalent Python: + + if value not in first_unique_locations: + // Keep track of the first time we see each unique value, so we can mark the first location + // of each duplicated value as duplicated + first_unique_locations[value] = i + else: + is_dup[i] = True + + # The second time we see a duplicate, we mark the first observed location as True (i.e. duplicated) + if value not in duplicates: + is_dup[first_unique_locations[value]] = True + + # This value is duplicated! + duplicates.add(value) + */ + + PyObject *duplicates = set_obj; // Meaningful name alias + PyObject *first_unique_locations = dict_obj; // Meaningful name alias + + int found = PyDict_Contains(first_unique_locations, value); + if (found == -1) { + return -1; + } + + if (found == 0) { + PyObject *idx = PyLong_FromLong(i); + if (!idx) { + return -1; + } + + int set_success = PyDict_SetItem(first_unique_locations, value, idx); + Py_DECREF(idx); + return set_success; // -1 on failure, 0 on success + } + + *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + + // Second time seeing a duplicate + found = PySet_Contains(duplicates, value); + if (found == -1) { + return -1; + } + + if (found == 0) { + PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, value); + if (!first_unique_location) { + return -1; + } + long idx = PyLong_AsLong(first_unique_location); + if (idx == -1) { + return -1; // -1 always means failure since no locations are negative + } + Py_DECREF(first_unique_location); + + *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_TRUE; + } + + return PySet_Add(duplicates, value); +} + +// Iteration funcs + static int AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { + /* + Iterates over a 1D numpy array. + + Roughly equivalent Python code: + + if reverse: + iterator = reversed(array) + else: + iterator = array + + size = len(array) + + for i, value in enumerate(iterator): + if reverse: + i = size - i - 1 + + process_value_func(i, value, is_dup, set_obj, dict_obj) + */ + assert(axis == 0); NpyIter *iter = NpyIter_New(array, NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, @@ -769,6 +916,7 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + // Do-while numpy iteration loops only happen once for 1D arrays! do { char *data = *dataptr; npy_intp stride = *strideptr; @@ -778,24 +926,27 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ int i = 0; int step = 1; + int stride_step = (int)stride; // We might walk in reverse! if (reverse) { - data += (stride * count); - stride = -stride; - i = count; + data += (stride * (count - 1)); + i = count - 1; step = -1; + stride_step = -stride_step; } while (count--) { + // Object arrays contains pointers to PyObjects, so we will only temporarily + // look at the reference here. memcpy(&value, data, sizeof(value)); // Process the value! - if (!value_func(i, value, is_dup, set_obj, dict_obj)) { + if (value_func(i, value, is_dup, set_obj, dict_obj) == -1) { goto failure; } - data += stride; i += step; + data += stride_step; } } while (iternext(iter)); @@ -858,7 +1009,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ int success = value_func(idx, value, is_dup, set_obj, dict_obj); Py_DECREF(tup); - if (!success) { goto failure; } + if (success == -1) { goto failure; } idx += step; } } @@ -876,7 +1027,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ int success = value_func(idx, value, is_dup, set_obj, dict_obj); Py_DECREF(tup); - if (!success) { goto failure; } + if (success == -1) { goto failure; } idx += step; } @@ -956,14 +1107,14 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k } if (!exclude_first && !exclude_last) { - handle_value_func = AK_handle_value_one_boundary; // AK_handle_value_exclude_boundaries + handle_value_func = AK_handle_value_exclude_boundaries; } else { - handle_value_func = AK_handle_value_one_boundary; // AK_handle_value_include_boundaries + handle_value_func = AK_handle_value_include_boundaries; } } - if (iterate_array_func(array, axis, reverse, is_dup, handle_value_func, set_obj, dict_obj)) { + if (-1 == iterate_array_func(array, axis, reverse, is_dup, handle_value_func, set_obj, dict_obj)) { goto failure; } From f016f24c4993ea90b67c8489898b1195d5a56c1b Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 12:46:51 -0700 Subject: [PATCH 09/18] Adds more comments. --- src/_arraykit.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index 1c262f37..b26ef16d 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -1046,6 +1046,34 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ static PyObject * array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { + /* + Main driver method. Determines how to iterate, and process the value of each iteration + based on the array itself and the parameters. + + Numpy 2D iteration is very different than Numpy 1D iteration, and so those two iteration + approaches are generalized. + + Depending on the parameters, there are 4 different ways we can interpret uniqueness. + + 1. exclude_first=True and exclude_last=True + - This means the first & last observations of duplicated values are considered unique. + - We consider them `included` in what is reported as unique + + 2. exclude_first=False and exclude_last=False + - This means the first & last observations of duplicated values are considered duplicated. + - We consider them `excluded` in what is reported as unique (by reporting them as duplicates) + + 3. exclude_first ^ exclude_last + - This means either the first OR the last observation will be considered unique, while the other is not + - This allows for more efficient iteration, by requiring only that we keep track of what we've seen before, + only changing the direction we iterate through the array. + + - If exclude_first is True, the we iterate left-to-right, ensuring the first observation of each unique + is reported as such, with every subsequent duplicate observation being marked as a duplicate + + - If exclude_last is True, the we iterate right-to-left, ensuring the last observation of each unique + is reported as such, with every subsequent duplicate observation being marked as a duplicate + */ PyArrayObject *array = NULL; int axis = 0; int exclude_first = 0; @@ -1080,6 +1108,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k AK_handle_value_func handle_value_func = NULL; AK_iterate_np_func iterate_array_func = NULL; + // 1. Determine how to iterate if (ndim == 1) { iterate_array_func = AK_iter_1d_array; } @@ -1097,10 +1126,13 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k PyObject *dict_obj = NULL; + // 2. Determine how to process each value if (exclude_first ^ exclude_last) { + // 2.a This approach only needs a set! handle_value_func = AK_handle_value_one_boundary; } else { + // 2.b Both of these approaches require an additional dictionary structure to keep track of some observed indices dict_obj = PyDict_New(); if (!dict_obj) { goto failure; @@ -1114,17 +1146,21 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k } } + // 3. Execute if (-1 == iterate_array_func(array, axis, reverse, is_dup, handle_value_func, set_obj, dict_obj)) { goto failure; } + // 4. Post-process if (exclude_first && exclude_last) { - // is_dup[list(dict_obj.values())] = False + // Mark the last observed location of each duplicate value as False + + PyObject *last_duplicate_locations = dict_obj; // Meaningful name alias PyObject *value = NULL; // Borrowed Py_ssize_t pos = 0; - while (PyDict_Next(dict_obj, &pos, NULL, &value)) { + while (PyDict_Next(last_duplicate_locations, &pos, NULL, &value)) { long idx = PyLong_AsLong(value); if (idx == -1) { goto failure; // -1 always means failure since no locations are negative From 3271618bcad88f368b08ebf3d4a60094ce7478d8 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 13:06:22 -0700 Subject: [PATCH 10/18] Implements reverse iteration for 2d c contiguous arrays. --- debug.py | 27 +++++++++++++++++++-------- src/_arraykit.c | 25 +++++++++++++++++++++---- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/debug.py b/debug.py index 860225cb..a18f3295 100644 --- a/debug.py +++ b/debug.py @@ -176,18 +176,29 @@ def test_2d(debug=True): [2, 3, 3, 2, 4, 3, 6], ], dtype=object) - test(arr2d, 0, True, True) - test(arr2d, 0, False, True) - test(arr2d, 0, True, False) - test(arr2d, 0, False, False) + test(arr2d, 0, True, True, debug=debug) + test(arr2d, 0, True, False, debug=debug) + test(arr2d, 0, False, True, debug=debug) + test(arr2d, 0, False, False, debug=debug) - test(arr2d, 1, True, True) - test(arr2d, 1, False, True) - test(arr2d, 1, True, False) - test(arr2d, 1, False, False) + test(arr2d, 1, True, True, debug=debug) + test(arr2d, 1, True, False, debug=debug) + #test(arr2d, 1, False, True, debug=debug) + test(arr2d, 1, False, False, debug=debug) + + test(arr2d.T, 0, True, True, debug=debug) + test(arr2d.T, 0, True, False, debug=debug) + #test(arr2d.T, 0, False, True, debug=debug) + test(arr2d.T, 0, False, False, debug=debug) + + test(arr2d.T, 1, True, True, debug=debug) + test(arr2d.T, 1, True, False, debug=debug) + test(arr2d.T, 1, False, True, debug=debug) + test(arr2d.T, 1, False, False, debug=debug) test_1d(debug=False) +test_2d(debug=False) exit(0) diff --git a/src/_arraykit.c b/src/_arraykit.c index b26ef16d..d1675803 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -996,7 +996,22 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ // Otherwise, it does a strided loop through the non-contiguous axis if (is_c_order != axis) { // Do-while is one loop through all elements. - for (int i = 0; i < num_tuples; ++i) { + + int tup_idx = 0; + int step = 1; + int tup_stride_step = 0; // For normal iterations, each time we build a tuple, we are right where we + // we need to be to start building the next tuple. For reverse, we have to + // backtrack two tuples worth of strides to get where we need to be + + if (reverse) { + data += (stride * (num_tuples - 1) * tuple_size); + tup_idx = num_tuples - 1; + step = -1; + tup_stride_step = -(tuple_size * 2) * stride; + } + + while (num_tuples--) { + PyObject *tup = PyTuple_New(tuple_size); if (!tup) { goto failure; } @@ -1007,12 +1022,14 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ data += stride; } - int success = value_func(idx, value, is_dup, set_obj, dict_obj); + int success = value_func(tup_idx, tup, is_dup, set_obj, dict_obj); Py_DECREF(tup); if (success == -1) { goto failure; } - idx += step; + tup_idx += step; + data += tup_stride_step; } } + else { PyObject *tup = PyTuple_New(tuple_size); if (!tup) { goto failure; } @@ -1025,7 +1042,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ data += stride; } - int success = value_func(idx, value, is_dup, set_obj, dict_obj); + int success = value_func(idx, tup, is_dup, set_obj, dict_obj); Py_DECREF(tup); if (success == -1) { goto failure; } idx += step; From 5bc4312f3e9f30bb75f943bd06dbcd04cf839db1 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 15:15:21 -0700 Subject: [PATCH 11/18] Moves construction of 2d array iter object to own function. 2d array now fully works for all use cases. Cleans up debug.py. --- debug.py | 112 ++++++------------ src/_arraykit.c | 299 ++++++++++++------------------------------------ 2 files changed, 107 insertions(+), 304 deletions(-) diff --git a/debug.py b/debug.py index a18f3295..fee42c4f 100644 --- a/debug.py +++ b/debug.py @@ -130,21 +130,34 @@ def dprint(*args, debug): print(*args) -def test(*args, debug=True): - dprint(args[1:], debug=debug) - o = new(*args); dprint('python:', o, debug=debug) - n = array_to_duplicated_hashable(*args); dprint('c :', n, debug=debug) - assert (n == o).all() +def run_test(array, debug=True): + def _test(*args): + dprint(args[1:], debug=debug) + python_result = new(*args) + dprint('python:', python_result, debug=debug) -def test_1d(debug=True): + c_result = array_to_duplicated_hashable(*args); + dprint('c :', c_result, debug=debug) + assert (python_result == c_result).all() + + _test(array, 0, True, False) # include_boundaries + _test(array, 0, False, False) # one_boundary (normal) + _test(array, 0, False, True) # one_boundary (reverse) + _test(array, 0, True, True) # exclude_boundaries + + if len(array.shape) == 2: + _test(array, 1, True, False) + _test(array, 1, False, False) + _test(array, 1, False, True) + _test(array, 1, True, True) + + +def test_arr1d(debug=True): arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object) # Test with normally constructed array - test(arr, 0, True, True, debug=debug) # include_boundaries - test(arr, 0, True, False, debug=debug) # one_boundary (normal) - test(arr, 0, False, True, debug=debug) # one_boundary (reverse) - test(arr, 0, False, False, debug=debug) # exclude_boundaries + run_test(arr, debug=debug) arr2d = np.array([[2, 1, 2], [3, 2, 3], @@ -155,18 +168,11 @@ def test_1d(debug=True): [6, 6, 6]], dtype=object) # Test with array slices - test(arr2d[:, 1], 0, True, True, debug=debug) - test(arr2d[:, 1], 0, True, False, debug=debug) - test(arr2d[:, 1], 0, False, True, debug=debug) - test(arr2d[:, 1], 0, False, False, debug=debug) - - test(arr2d.T[1], 0, True, True, debug=debug) - test(arr2d.T[1], 0, True, False, debug=debug) - test(arr2d.T[1], 0, False, True, debug=debug) - test(arr2d.T[1], 0, False, False, debug=debug) + run_test(arr2d[:, 1], debug=debug) + run_test(arr2d.T[1], debug=debug) -def test_2d(debug=True): +def test_arr2d(debug=True): arr2d = np.array([ [1, 2, 2, 1, 3, 2, 6], [2, 3, 3, 2, 4, 3, 6], @@ -176,64 +182,20 @@ def test_2d(debug=True): [2, 3, 3, 2, 4, 3, 6], ], dtype=object) - test(arr2d, 0, True, True, debug=debug) - test(arr2d, 0, True, False, debug=debug) - test(arr2d, 0, False, True, debug=debug) - test(arr2d, 0, False, False, debug=debug) - - test(arr2d, 1, True, True, debug=debug) - test(arr2d, 1, True, False, debug=debug) - #test(arr2d, 1, False, True, debug=debug) - test(arr2d, 1, False, False, debug=debug) - - test(arr2d.T, 0, True, True, debug=debug) - test(arr2d.T, 0, True, False, debug=debug) - #test(arr2d.T, 0, False, True, debug=debug) - test(arr2d.T, 0, False, False, debug=debug) - - test(arr2d.T, 1, True, True, debug=debug) - test(arr2d.T, 1, True, False, debug=debug) - test(arr2d.T, 1, False, True, debug=debug) - test(arr2d.T, 1, False, False, debug=debug) - - -test_1d(debug=False) -test_2d(debug=False) - -exit(0) - - - -def test(*args, **kwargs): - assert (new(*args, **kwargs) == array_to_duplicated_hashable(*args, **kwargs)).all(), (args, kwargs) - - -arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3, 2, -1, -233, 'aslkj', 'df', 'df', True, True, None, 1]) -#array_to_duplicated_hashable(np.arange(5)) -#array_to_duplicated_hashable(np.arange(5), 213) -#array_to_duplicated_hashable(np.arange(5), 1) -#array_to_duplicated_hashable(np.arange(5), 1, True) -#array_to_duplicated_hashable(np.arange(5), 1, 123) -#array_to_duplicated_hashable(np.arange(5), 1, True) - -if False: - test(arr, 0, True, False) - test(arr, 0, False, False) - test(arr, 0, False, True) - test(arr, 0, True, True) - + run_test(arr2d, debug=debug) + run_test(arr2d.T, debug=debug) -array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object), 0) -print() -array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object), 1) -print() -print() +def test_misc(debug=True): + arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3, 2, -1, -233, 'aslkj', 'df', 'df', True, True, None, 1]) + run_test(arr, debug=debug) -array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object).T, 0) -print() -array_to_duplicated_hashable(np.arange(20).reshape(4, 5).astype(object).T, 1) -print() + arr = np.arange(20).reshape(4, 5).astype(object) + run_test(arr, debug=debug) + run_test(arr.T, debug=debug) +test_arr1d(debug=False) +test_arr2d(debug=False) +test_misc(debug=False) print('Done') diff --git a/src/_arraykit.c b/src/_arraykit.c index d1675803..4b44598e 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -493,229 +493,6 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) //------------------------------------------------------------------------------ // duplication -// These two methods are defunct. - -static PyObject * -AK_array_to_duplicated_hashable_no_constraints(PyArrayObject *array, PyArrayObject *is_dup) -{ - /* - Rougly equivalent Python code - - seen = set() - - for idx, v in enumerate(array): - if v not in seen: - seen.add(v) - else: - is_dup[idx] = True - - return is_dup - */ - // This path is optimized to only construct a single set - PyObject *seen = PySet_New(NULL); - if (!seen) { - Py_DECREF(is_dup); - return NULL; - } - - NpyIter *iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - if (!iter) { goto failure; } - - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { goto failure; } - - char** dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - do { - char *data = *dataptr; - npy_intp stride = *strideptr; - npy_intp count = *sizeptr; - - PyObject* value = NULL; - - int i = 0; - while (count--) { - // Object arrays contains pointers to PyObjects, so we will only temporarily - // look at the reference here. - memcpy(&value, data, sizeof(value)); - - // 5. Assign into result whether or not the element exists in the set - int found = PySet_Contains(seen, value); - if (found == -1) { goto failure; } - - else if (found == 0) { - int add_success = PySet_Add(seen, value); - if (add_success == -1) { goto failure; } - } - else { - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; - } - - data += stride; - i += 1; - } - - } while (iternext(iter)); - - NpyIter_Deallocate(iter); - Py_DECREF(seen); - - return (PyObject*)is_dup; - -failure: - if (iter != NULL) { - NpyIter_Deallocate(iter); - } - Py_DECREF(seen); - Py_DECREF(is_dup); - return NULL; -} - -static PyObject * -AK_array_to_duplicated_hashable_with_constraints(PyArrayObject *array, PyArrayObject *is_dup, - int exclude_first, int exclude_last) -{ - /* - Rougly equivalent Python code - - first_unique_locations = {} - last_duplicate_locations = {} - - for idx, v in enumerate(array): - if v not in first_unique_locations: - first_unique_locations[v] = idx - else: - is_dupe[idx] = True - - if v not in last_duplicate_locations and not exclude_first: - is_dupe[first_unique_locations[v]] = True - - last_duplicate_locations[v] = idx - - if exclude_last: # overwrite with False - is_dupe[list(last_duplicate_locations.values())] = False - - return is_dupe - */ - // Contains first location for all unique values. len(first_unique_locations) == len(set(array)) - PyObject *first_unique_locations = PyDict_New(); - if (!first_unique_locations) { - Py_DECREF(is_dup); - return NULL; - } - - PyObject *last_duplicate_locations = PyDict_New(); - if (!last_duplicate_locations) { - Py_DECREF(first_unique_locations); - Py_DECREF(is_dup); - return NULL; - } - - NpyIter *iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - if (!iter) { goto failure; } - - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { goto failure; } - - char** dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - do { - char *data = *dataptr; - npy_intp stride = *strideptr; - npy_intp count = *sizeptr; - - PyObject* val = NULL; - - int i = 0; - while (count--) { - // Object arrays contains pointers to PyObjects, so we will only temporarily - // look at the reference here. - memcpy(&val, data, sizeof(val)); - - // 5. Assign into result whether or not the element exists in the set - int in_dict = PyDict_Contains(first_unique_locations, val); - - if (in_dict == -1) { goto failure; } - - else if (in_dict == 0) { - PyObject *idx = PyLong_FromLong(i); - if (!idx) { goto failure; } - - int set_success = PyDict_SetItem(first_unique_locations, val, idx); - Py_DECREF(idx); - if (set_success == -1) { goto failure; } - - } - else { - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; - - in_dict = PyDict_Contains(last_duplicate_locations, val); - if (in_dict == -1) { goto failure; } - - else if (in_dict == 0 && !exclude_first) { - PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, val); - if (!first_unique_location) { goto failure; } - - int idx = PyLong_AsLong(first_unique_location); - if (idx == -1) { goto failure; } - - *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_TRUE; - } - - PyObject *idx = PyLong_FromLong(i); - if (!idx) { goto failure; } - - int set_success = PyDict_SetItem(last_duplicate_locations, val, idx); - Py_DECREF(idx); - if (set_success == -1) { goto failure; } - } - - data += stride; - i += 1; - } - - } while (iternext(iter)); - - if (exclude_last) { - PyObject *value = NULL; // Borrowed - Py_ssize_t pos = 0; - - while (PyDict_Next(last_duplicate_locations, &pos, NULL, &value)) { - long idx = PyLong_AsLong(value); - if (idx == -1) { goto failure; } // -1 always means failure since no locations are negative - - *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_FALSE; - } - } - - NpyIter_Deallocate(iter); - Py_DECREF(last_duplicate_locations); - Py_DECREF(first_unique_locations); - - return (PyObject*)is_dup; - -failure: - if (iter != NULL) { - NpyIter_Deallocate(iter); - } - Py_DECREF(last_duplicate_locations); - Py_DECREF(first_unique_locations); - Py_DECREF(is_dup); - return NULL; -} - // Defines how to process a hashable value. typedef int (*AK_handle_value_func)(int, // i PyObject*, // value @@ -960,18 +737,77 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ return -1; } +static NpyIter* +AK_build_2d_array_iter(PyArrayObject *array, int axis, int reverse) +{ + int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; + int iter_flags = NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK; + int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; + + if (!((is_c_order == axis) && reverse)) { + return NpyIter_New(array, iter_flags, order_flags, NPY_NO_CASTING, NULL); + } + + // The dreaded case of reverse iterating through non-continguous sequences of memory (i.e. columns on arr or rows on arr.T) + + PyObject *negative_one = PyLong_FromLong(-1); + if (!negative_one) { + return NULL; + } + + PyObject *reverse_slice = PySlice_New(NULL, NULL, negative_one); + Py_DECREF(negative_one); + if (!reverse_slice) { + return NULL; + } + + PyObject *reversed_array; + + if (axis == 0) { + reversed_array = PyObject_GetItem((PyObject*)array, reverse_slice); // array[::-1] + Py_DECREF(reverse_slice); + if (!reversed_array) { + return NULL; + } + } + else { + PyObject *empty_row_slice = PySlice_New(NULL, NULL, NULL); + if (!empty_row_slice) { + Py_DECREF(reverse_slice); + return NULL; + } + + PyObject *slice_tuple = PyTuple_Pack(2, empty_row_slice, reverse_slice); + Py_DECREF(empty_row_slice); + Py_DECREF(reverse_slice); + if (!slice_tuple) { + return NULL; + } + + reversed_array = PyObject_GetItem((PyObject*)array, slice_tuple); // array[:,::-1] + Py_DECREF(slice_tuple); + if (!reversed_array) { + return NULL; + } + } + + NpyIter *iter = NpyIter_New((PyArrayObject*)reversed_array, + NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, + order_flags, + NPY_NO_CASTING, + NULL); + + Py_DECREF(reversed_array); + return iter; // Can be NULL +} + static int AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; - int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; - NpyIter *iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, - order_flags, - NPY_NO_CASTING, - NULL); + NpyIter *iter = AK_build_2d_array_iter(array, axis, reverse); if (!iter) { goto failure; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); @@ -986,6 +822,11 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ int idx = 0; int step = 1; + if (reverse) { + idx = num_tuples - 1; + step = -1; + } + do { char *data = *dataptr; npy_intp stride = *strideptr; From 58fdc887c1e723d8d067a9c53282f99dd90a6877 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 16:33:51 -0700 Subject: [PATCH 12/18] Updates performance test and other misc changes. --- debug.py | 69 +++++++++++++++++++++-------------------- performance/__main__.py | 36 ++++++++++++--------- src/_arraykit.c | 2 +- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/debug.py b/debug.py index fee42c4f..9387e2a1 100644 --- a/debug.py +++ b/debug.py @@ -7,38 +7,6 @@ def __init__(self, v) -> None: def __repr__(self) -> str: return f'PO<{self.v}>' -def iterate_1d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): - if reverse: - iterator = reversed(array) - else: - iterator = array - - size = len(array) - - for i, value in enumerate(iterator): - if reverse: - i = size - i - 1 - - process_value_func(i, value, is_dup, set_obj, dict_obj) - - -def iterate_2d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): - size = array.shape[axis] - - if axis == 0: - iterator = array - else: - iterator = array.T - - if reverse: - iterator = reversed(iterator) - - for i, value in enumerate(map(tuple, iterator)): - if reverse: - i = size - i - 1 - - process_value_func(i, value, is_dup, set_obj, dict_obj) - def handle_value_one_boundary(i, value, is_dup, set_obj, dict_obj): seen = set_obj @@ -80,7 +48,40 @@ def handle_value_include_boundaries(i, value, is_dup, set_obj, dict_obj): last_duplicate_locations[value] = i -def new( +def iterate_1d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): + if reverse: + iterator = reversed(array) + else: + iterator = array + + size = len(array) + + for i, value in enumerate(iterator): + if reverse: + i = size - i - 1 + + process_value_func(i, value, is_dup, set_obj, dict_obj) + + +def iterate_2d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj): + size = array.shape[axis] + + if axis == 0: + iterator = array + else: + iterator = array.T + + if reverse: + iterator = reversed(iterator) + + for i, value in enumerate(map(tuple, iterator)): + if reverse: + i = size - i - 1 + + process_value_func(i, value, is_dup, set_obj, dict_obj) + + +def python_impl( array: np.ndarray, axis: int = 0, exclude_first: bool = False, @@ -134,7 +135,7 @@ def run_test(array, debug=True): def _test(*args): dprint(args[1:], debug=debug) - python_result = new(*args) + python_result = python_impl(*args) dprint('python:', python_result, debug=debug) c_result = array_to_duplicated_hashable(*args); diff --git a/performance/__main__.py b/performance/__main__.py index 03efa0eb..19e49b39 100644 --- a/performance/__main__.py +++ b/performance/__main__.py @@ -365,8 +365,13 @@ class IsNaElementPerfREF(IsNaElementPerf): #------------------------------------------------------------------------------- class ArrayToDuplicatedHashablePerf(Perf): - NUMBER = 1 - FUNCTIONS = ('array_1d', 'array_2d') + NUMBER = 3 + FUNCTIONS = ( + 'array_1d_small', + 'array_1d_large', + 'array_2d_small', + 'array_2d_large', + ) def __init__(self): self.arrays_1d_small = [ @@ -390,26 +395,27 @@ def __init__(self): np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object), ] - def array_1d(self): - prd = functools.partial(itertools.product, (True, False), (True, False)) + self.prd_1d = functools.partial(itertools.product, (True, False), (True, False)) + self.prd_2d = functools.partial(itertools.product, (0, 1), (True, False), (True, False)) - for _ in range(1000): - for exclude_first, exclude_last, arr in prd(self.arrays_1d_small): + def array_1d_small(self): + for _ in range(10000): + for exclude_first, exclude_last, arr in self.prd_1d(self.arrays_1d_small): self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) - for _ in range(5): - for exclude_first, exclude_last, arr in prd(self.arrays_1d_large): + def array_1d_large(self): + for _ in range(12): + for exclude_first, exclude_last, arr in self.prd_1d(self.arrays_1d_large): self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) - def array_2d(self): - prd = functools.partial(itertools.product, (0, 1), (True, False), (True, False)) - - for _ in range(1000): - for axis, exclude_first, exclude_last, arr in prd(self.arrays_2d_small): + def array_2d_small(self): + for _ in range(5000): + for axis, exclude_first, exclude_last, arr in self.prd_2d(self.arrays_2d_small): self.entry(arr, axis, exclude_first, exclude_last) - for _ in range(5): - for axis, exclude_first, exclude_last, arr in prd(self.arrays_2d_large): + def array_2d_large(self): + for _ in range(12): + for axis, exclude_first, exclude_last, arr in self.prd_2d(self.arrays_2d_large): self.entry(arr, axis, exclude_first, exclude_last) diff --git a/src/_arraykit.c b/src/_arraykit.c index 4b44598e..7fd3b501 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -701,7 +701,7 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ PyObject* value = NULL; - int i = 0; + Py_ssize_t i = 0; int step = 1; int stride_step = (int)stride; // We might walk in reverse! From 43069c67792367fec7beaf60b0db46d5d0f2f672 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 16:52:19 -0700 Subject: [PATCH 13/18] Assigns into is_dup directly instead of using GETPTR1 --- debug.py | 11 +++++++++++ src/_arraykit.c | 37 ++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/debug.py b/debug.py index 9387e2a1..0e4b5783 100644 --- a/debug.py +++ b/debug.py @@ -196,6 +196,17 @@ def test_misc(debug=True): run_test(arr.T, debug=debug) +# arr = np.array([ +# [1, 2, 2, 1, 3, 2, 6], +# [2, 3, 3, 2, 4, 3, 6], +# [2, 3, 3, 2, 4, 3, 6], +# [1, 2, 2, 1, 3, 2, 6], +# [3, 4, 4, 3, 5, 4, 6], +# [2, 3, 3, 2, 4, 3, 6], +# ], dtype=object) +# array_to_duplicated_hashable(arr, 1) + + test_arr1d(debug=False) test_arr2d(debug=False) test_misc(debug=False) diff --git a/src/_arraykit.c b/src/_arraykit.c index 7fd3b501..454accaf 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -494,18 +494,18 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) // duplication // Defines how to process a hashable value. -typedef int (*AK_handle_value_func)(int, // i - PyObject*, // value - PyArrayObject*, // is_dup - PyObject*, // set_obj - PyObject* // dict_obj +typedef int (*AK_handle_value_func)(int, // i + PyObject*, // value + npy_bool*, // is_dup + PyObject*, // set_obj + PyObject* // dict_obj ); // Defines how to iterate over an arbitrary numpy (object) array typedef int (*AK_iterate_np_func)(PyArrayObject*, // array int, // axis int, // reverse - PyArrayObject*, // is_dup + npy_bool*, // is_dup AK_handle_value_func, // handle_value_func PyObject*, // set_obj PyObject* // dict_obj @@ -514,7 +514,7 @@ typedef int (*AK_iterate_np_func)(PyArrayObject*, // array // Value processing funcs static int -AK_handle_value_one_boundary(int i, PyObject *value, PyArrayObject *is_dup, +AK_handle_value_one_boundary(int i, PyObject *value, npy_bool *is_dup, PyObject *set_obj, PyObject *dict_obj) { /* @@ -542,12 +542,12 @@ AK_handle_value_one_boundary(int i, PyObject *value, PyArrayObject *is_dup, return PySet_Add(seen, value); // -1 on failure, 0 on success } - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + is_dup[i] = NPY_TRUE; return 0; } static int -AK_handle_value_include_boundaries(int i, PyObject *value, PyArrayObject *is_dup, +AK_handle_value_include_boundaries(int i, PyObject *value, npy_bool *is_dup, PyObject *set_obj, PyObject *dict_obj) { /* @@ -575,7 +575,7 @@ AK_handle_value_include_boundaries(int i, PyObject *value, PyArrayObject *is_dup return PySet_Add(seen, value); // -1 on failure, 0 on success } - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + is_dup[i] = NPY_TRUE; PyObject *idx = PyLong_FromLong(i); if (!idx) { return -1; } @@ -586,7 +586,7 @@ AK_handle_value_include_boundaries(int i, PyObject *value, PyArrayObject *is_dup } static int -AK_handle_value_exclude_boundaries(int i, PyObject *value, PyArrayObject *is_dup, +AK_handle_value_exclude_boundaries(int i, PyObject *value, npy_bool *is_dup, PyObject *set_obj, PyObject *dict_obj) { /* @@ -628,7 +628,7 @@ AK_handle_value_exclude_boundaries(int i, PyObject *value, PyArrayObject *is_dup return set_success; // -1 on failure, 0 on success } - *(npy_bool *) PyArray_GETPTR1(is_dup, i) = NPY_TRUE; + is_dup[i] = NPY_TRUE; // Second time seeing a duplicate found = PySet_Contains(duplicates, value); @@ -647,7 +647,7 @@ AK_handle_value_exclude_boundaries(int i, PyObject *value, PyArrayObject *is_dup } Py_DECREF(first_unique_location); - *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_TRUE; + is_dup[idx] = NPY_TRUE; } return PySet_Add(duplicates, value); @@ -656,7 +656,7 @@ AK_handle_value_exclude_boundaries(int i, PyObject *value, PyArrayObject *is_dup // Iteration funcs static int -AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, +AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { /* @@ -802,7 +802,7 @@ AK_build_2d_array_iter(PyArrayObject *array, int axis, int reverse) } static int -AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_dup, +AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; @@ -836,6 +836,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ // When the axis doesn't align with the ordering, it means the npy iterator goes one-element at a time. // Otherwise, it does a strided loop through the non-contiguous axis if (is_c_order != axis) { + // AK_DEBUG("ONE PASS"); // Do-while is one loop through all elements. int tup_idx = 0; @@ -872,6 +873,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, PyArrayObject *is_ } else { + //AK_DEBUG("MULTI PASS"); PyObject *tup = PyTuple_New(tuple_size); if (!tup) { goto failure; } @@ -976,6 +978,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k npy_intp dims = {size}; PyArrayObject *is_dup = PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); + npy_bool *is_dup_array = (npy_bool*)PyArray_DATA(is_dup); PyObject *set_obj = PySet_New(NULL); if (!set_obj) { @@ -1005,7 +1008,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k } // 3. Execute - if (-1 == iterate_array_func(array, axis, reverse, is_dup, handle_value_func, set_obj, dict_obj)) { + if (-1 == iterate_array_func(array, axis, reverse, is_dup_array, handle_value_func, set_obj, dict_obj)) { goto failure; } @@ -1025,7 +1028,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k } Py_DECREF(value); - *(npy_bool *) PyArray_GETPTR1(is_dup, idx) = NPY_FALSE; + is_dup_array[idx] = NPY_FALSE; } } From 29166fdb5f15091dfe55babaf51860515d9eca8e Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 4 Jun 2021 20:38:13 -0700 Subject: [PATCH 14/18] Greatly simplifies 2d iteration logic by ensuring array memory layout is contiguous. --- debug.py | 11 ---- src/_arraykit.c | 153 +++++++++++++----------------------------------- 2 files changed, 40 insertions(+), 124 deletions(-) diff --git a/debug.py b/debug.py index 0e4b5783..9387e2a1 100644 --- a/debug.py +++ b/debug.py @@ -196,17 +196,6 @@ def test_misc(debug=True): run_test(arr.T, debug=debug) -# arr = np.array([ -# [1, 2, 2, 1, 3, 2, 6], -# [2, 3, 3, 2, 4, 3, 6], -# [2, 3, 3, 2, 4, 3, 6], -# [1, 2, 2, 1, 3, 2, 6], -# [3, 4, 4, 3, 5, 4, 6], -# [2, 3, 3, 2, 4, 3, 6], -# ], dtype=object) -# array_to_duplicated_hashable(arr, 1) - - test_arr1d(debug=False) test_arr2d(debug=False) test_misc(debug=False) diff --git a/src/_arraykit.c b/src/_arraykit.c index 454accaf..a66d9ee3 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -63,6 +63,11 @@ fprintf(stderr, #msg); \ _AK_DEBUG_END() +# define AK_DEBUG_INT(msg) \ + _AK_DEBUG_BEGIN(); \ + fprintf(stderr, #msg"=%x", (int)(msg)); \ + _AK_DEBUG_END() + # if defined __GNUC__ || defined __clang__ # define AK_LIKELY(X) __builtin_expect(!!(X), 1) # define AK_UNLIKELY(X) __builtin_expect(!!(X), 0) @@ -737,77 +742,33 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, return -1; } -static NpyIter* -AK_build_2d_array_iter(PyArrayObject *array, int axis, int reverse) +static int +AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, + AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) { int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; - int iter_flags = NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK; - int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; - - if (!((is_c_order == axis) && reverse)) { - return NpyIter_New(array, iter_flags, order_flags, NPY_NO_CASTING, NULL); - } - - // The dreaded case of reverse iterating through non-continguous sequences of memory (i.e. columns on arr or rows on arr.T) - - PyObject *negative_one = PyLong_FromLong(-1); - if (!negative_one) { - return NULL; - } - PyObject *reverse_slice = PySlice_New(NULL, NULL, negative_one); - Py_DECREF(negative_one); - if (!reverse_slice) { - return NULL; - } - - PyObject *reversed_array; - - if (axis == 0) { - reversed_array = PyObject_GetItem((PyObject*)array, reverse_slice); // array[::-1] - Py_DECREF(reverse_slice); - if (!reversed_array) { - return NULL; + // When the axis aligns with the ordering (i.e. row-wise for C, col-wise for Fortran), it means the npy iterator goes one-element at a time. + // Otherwise, it does a strided loop through the non-contiguous axis (which adds a lot of complexity). + // To prevent this, we will make a copy of the array with the data laid out in the way we want + if (is_c_order == axis) { + int new_flags = NPY_ARRAY_ALIGNED; + if (is_c_order) { + new_flags |= NPY_ARRAY_F_CONTIGUOUS; } - } - else { - PyObject *empty_row_slice = PySlice_New(NULL, NULL, NULL); - if (!empty_row_slice) { - Py_DECREF(reverse_slice); - return NULL; - } - - PyObject *slice_tuple = PyTuple_Pack(2, empty_row_slice, reverse_slice); - Py_DECREF(empty_row_slice); - Py_DECREF(reverse_slice); - if (!slice_tuple) { - return NULL; + else { + new_flags |= NPY_ARRAY_C_CONTIGUOUS; } - reversed_array = PyObject_GetItem((PyObject*)array, slice_tuple); // array[:,::-1] - Py_DECREF(slice_tuple); - if (!reversed_array) { - return NULL; + array = PyArray_FromArray(array, PyArray_DescrFromType(NPY_OBJECT), new_flags); + if (!array) { + return -1; } } - NpyIter *iter = NpyIter_New((PyArrayObject*)reversed_array, - NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, - order_flags, - NPY_NO_CASTING, - NULL); - - Py_DECREF(reversed_array); - return iter; // Can be NULL -} - -static int -AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, - AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj) -{ - int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS; - - NpyIter *iter = AK_build_2d_array_iter(array, axis, reverse); + int iter_flags = NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK; + int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; + NpyIter *iter = NpyIter_New(array, iter_flags, order_flags, NPY_NO_CASTING, NULL); if (!iter) { goto failure; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); @@ -819,76 +780,42 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, npy_intp tuple_size = PyArray_DIM(array, !axis); npy_intp num_tuples = PyArray_DIM(array, axis); - int idx = 0; - int step = 1; - - if (reverse) { - idx = num_tuples - 1; - step = -1; - } - do { char *data = *dataptr; npy_intp stride = *strideptr; PyObject* value = NULL; - // When the axis doesn't align with the ordering, it means the npy iterator goes one-element at a time. - // Otherwise, it does a strided loop through the non-contiguous axis - if (is_c_order != axis) { - // AK_DEBUG("ONE PASS"); - // Do-while is one loop through all elements. - - int tup_idx = 0; - int step = 1; - int tup_stride_step = 0; // For normal iterations, each time we build a tuple, we are right where we - // we need to be to start building the next tuple. For reverse, we have to - // backtrack two tuples worth of strides to get where we need to be - - if (reverse) { - data += (stride * (num_tuples - 1) * tuple_size); - tup_idx = num_tuples - 1; - step = -1; - tup_stride_step = -(tuple_size * 2) * stride; - } - - while (num_tuples--) { - - PyObject *tup = PyTuple_New(tuple_size); - if (!tup) { goto failure; } - - for (int j = 0; j < tuple_size; ++j) { - memcpy(&value, data, sizeof(value)); - Py_INCREF(value); - PyTuple_SET_ITEM(tup, j, value); - data += stride; - } + int tup_idx = 0; + int step = 1; + int tup_stride_step = 0; // For normal iterations, each time we build a tuple, we are right where we + // we need to be to start building the next tuple. For reverse, we have to + // backtrack two tuples worth of strides to get where we need to be - int success = value_func(tup_idx, tup, is_dup, set_obj, dict_obj); - Py_DECREF(tup); - if (success == -1) { goto failure; } - tup_idx += step; - data += tup_stride_step; - } + if (reverse) { + data += (stride * (num_tuples - 1) * tuple_size); + tup_idx = num_tuples - 1; + step = -1; + tup_stride_step = -(tuple_size * 2) * stride; } - else { - //AK_DEBUG("MULTI PASS"); + while (num_tuples--) { + PyObject *tup = PyTuple_New(tuple_size); if (!tup) { goto failure; } - // Each do-while loop strides over another column - for (int i = 0; i < tuple_size; ++i) { + for (int j = 0; j < tuple_size; ++j) { memcpy(&value, data, sizeof(value)); Py_INCREF(value); - PyTuple_SET_ITEM(tup, i, value); + PyTuple_SET_ITEM(tup, j, value); data += stride; } - int success = value_func(idx, tup, is_dup, set_obj, dict_obj); + int success = value_func(tup_idx, tup, is_dup, set_obj, dict_obj); Py_DECREF(tup); if (success == -1) { goto failure; } - idx += step; + tup_idx += step; + data += tup_stride_step; } } while (iternext(iter)); From 6986fd39d6fefab162b16340309dd629afc88824 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 17 Jun 2021 11:19:29 -0700 Subject: [PATCH 15/18] Misc cleanup to source. --- setup.py | 3 +++ src/_arraykit.c | 66 +++++++++++++++++++++++-------------------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/setup.py b/setup.py index d3062396..3ebb573e 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,9 @@ def get_long_description() -> str: ak_extension = Extension( name='arraykit._arraykit', # build into module sources=['src/_arraykit.c'], + #extra_compile_args=['-pg'], + #extra_link_args=['-pg'], + extra_link_args=['-Wl,--no-as-needed,-lprofiler,--as-needed'], **additional_info, ) diff --git a/src/_arraykit.c b/src/_arraykit.c index a66d9ee3..5e31f474 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -7,6 +7,7 @@ # include "numpy/arrayobject.h" # include "numpy/arrayscalars.h" # include "numpy/halffloat.h" +// # include "gperftools/profiler.h" //------------------------------------------------------------------------------ // Macros @@ -499,28 +500,28 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg) // duplication // Defines how to process a hashable value. -typedef int (*AK_handle_value_func)(int, // i - PyObject*, // value - npy_bool*, // is_dup - PyObject*, // set_obj - PyObject* // dict_obj +typedef int (*AK_handle_value_func)(Py_ssize_t i, + PyObject* value, + npy_bool* is_dup, + PyObject* set_obj, + PyObject* dict_obj ); // Defines how to iterate over an arbitrary numpy (object) array -typedef int (*AK_iterate_np_func)(PyArrayObject*, // array - int, // axis - int, // reverse - npy_bool*, // is_dup - AK_handle_value_func, // handle_value_func - PyObject*, // set_obj - PyObject* // dict_obj +typedef int (*AK_iterate_np_func)(PyArrayObject* array, + int axis, + int reverse, + npy_bool* is_dup, + AK_handle_value_func handle_value_func, + PyObject* set_obj, + PyObject* dict_obj ); // Value processing funcs static int -AK_handle_value_one_boundary(int i, PyObject *value, npy_bool *is_dup, - PyObject *set_obj, PyObject *dict_obj) +AK_handle_value_one_boundary(Py_ssize_t i, PyObject *value, npy_bool *is_dup, + PyObject *seen, PyObject * Py_UNUSED(dict_obj)) { /* Used when the first duplicated element is considered unique. @@ -535,9 +536,6 @@ AK_handle_value_one_boundary(int i, PyObject *value, npy_bool *is_dup, else: is_dup[i] = True */ - PyObject *seen = set_obj; // Meaningful name alias - assert(dict_obj == NULL); - int found = PySet_Contains(seen, value); if (found == -1) { return -1; @@ -552,8 +550,9 @@ AK_handle_value_one_boundary(int i, PyObject *value, npy_bool *is_dup, } static int -AK_handle_value_include_boundaries(int i, PyObject *value, npy_bool *is_dup, - PyObject *set_obj, PyObject *dict_obj) +AK_handle_value_include_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_dup, + PyObject *seen, + PyObject *last_duplicate_locations) { /* Used when the first & last instances of duplicated values are considered unique @@ -568,9 +567,6 @@ AK_handle_value_include_boundaries(int i, PyObject *value, npy_bool *is_dup, # Keep track of last observed location, so we can mark it False (i.e. unique) at the end last_duplicate_locations[value] = i */ - PyObject *seen = set_obj; // Meaningful name alias - PyObject *last_duplicate_locations = dict_obj; // Meaningful name alias - int found = PySet_Contains(seen, value); if (found == -1) { return -1; @@ -591,8 +587,9 @@ AK_handle_value_include_boundaries(int i, PyObject *value, npy_bool *is_dup, } static int -AK_handle_value_exclude_boundaries(int i, PyObject *value, npy_bool *is_dup, - PyObject *set_obj, PyObject *dict_obj) +AK_handle_value_exclude_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_dup, + PyObject *duplicates, + PyObject *first_unique_locations) { /* Used when the first & last instances of duplicated values are considered duplicated @@ -600,8 +597,8 @@ AK_handle_value_exclude_boundaries(int i, PyObject *value, npy_bool *is_dup, Rougly equivalent Python: if value not in first_unique_locations: - // Keep track of the first time we see each unique value, so we can mark the first location - // of each duplicated value as duplicated + # Keep track of the first time we see each unique value, so we can mark the first location + # of each duplicated value as duplicated first_unique_locations[value] = i else: is_dup[i] = True @@ -613,10 +610,6 @@ AK_handle_value_exclude_boundaries(int i, PyObject *value, npy_bool *is_dup, # This value is duplicated! duplicates.add(value) */ - - PyObject *duplicates = set_obj; // Meaningful name alias - PyObject *first_unique_locations = dict_obj; // Meaningful name alias - int found = PyDict_Contains(first_unique_locations, value); if (found == -1) { return -1; @@ -682,7 +675,6 @@ AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, process_value_func(i, value, is_dup, set_obj, dict_obj) */ - assert(axis == 0); NpyIter *iter = NpyIter_New(array, NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, @@ -760,7 +752,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, new_flags |= NPY_ARRAY_C_CONTIGUOUS; } - array = PyArray_FromArray(array, PyArray_DescrFromType(NPY_OBJECT), new_flags); + array = (PyArrayObject*)PyArray_FromArray(array, PyArray_DescrFromType(NPY_OBJECT), new_flags); if (!array) { return -1; } @@ -768,6 +760,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, int iter_flags = NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK; int order_flags = NPY_FORTRANORDER ? axis : NPY_CORDER; + NpyIter *iter = NpyIter_New(array, iter_flags, order_flags, NPY_NO_CASTING, NULL); if (!iter) { goto failure; } @@ -786,7 +779,7 @@ AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup, PyObject* value = NULL; - int tup_idx = 0; + Py_ssize_t tup_idx = 0; int step = 1; int tup_stride_step = 0; // For normal iterations, each time we build a tuple, we are right where we // we need to be to start building the next tuple. For reverse, we have to @@ -861,6 +854,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k - If exclude_last is True, the we iterate right-to-left, ensuring the last observation of each unique is reported as such, with every subsequent duplicate observation being marked as a duplicate */ + // ProfilerStart("/home/burkland/github/arraykit/arraykit.prof"); PyArrayObject *array = NULL; int axis = 0; int exclude_first = 0; @@ -904,7 +898,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k } npy_intp dims = {size}; - PyArrayObject *is_dup = PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); + PyArrayObject *is_dup = (PyArrayObject*)PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0); npy_bool *is_dup_array = (npy_bool*)PyArray_DATA(is_dup); PyObject *set_obj = PySet_New(NULL); @@ -942,7 +936,7 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k // 4. Post-process if (exclude_first && exclude_last) { // Mark the last observed location of each duplicate value as False - + assert(dict_obj != NULL); PyObject *last_duplicate_locations = dict_obj; // Meaningful name alias PyObject *value = NULL; // Borrowed @@ -961,11 +955,13 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k Py_XDECREF(dict_obj); Py_DECREF(set_obj); + // ProfilerStop(); return (PyObject *)is_dup; failure: Py_XDECREF(dict_obj); Py_DECREF(set_obj); + // ProfilerStop(); return NULL; } From b36009bff3dc479ecb008914cb6950dd905daca3 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 17 Jun 2021 12:01:45 -0700 Subject: [PATCH 16/18] Removes unnecessary compile/link args. --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index 3ebb573e..d3062396 100644 --- a/setup.py +++ b/setup.py @@ -24,9 +24,6 @@ def get_long_description() -> str: ak_extension = Extension( name='arraykit._arraykit', # build into module sources=['src/_arraykit.c'], - #extra_compile_args=['-pg'], - #extra_link_args=['-pg'], - extra_link_args=['-Wl,--no-as-needed,-lprofiler,--as-needed'], **additional_info, ) From 1e63bd1c0790952db8510a3bacc427421b9b53e9 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 17 Jun 2021 17:17:12 -0700 Subject: [PATCH 17/18] Fixes some memory bugs. Updates performance benchmarks. Adds profiling script. --- .gitignore | 1 + performance/__main__.py | 43 +++++++++++++++---------- profile.py | 70 +++++++++++++++++++++++++++++++++++++++++ src/_arraykit.c | 22 ++++++------- 4 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 profile.py diff --git a/.gitignore b/.gitignore index e6a47305..c7250842 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ __pycache__ build *.diff *.orig +callgrind* diff --git a/performance/__main__.py b/performance/__main__.py index 19e49b39..aeb08935 100644 --- a/performance/__main__.py +++ b/performance/__main__.py @@ -1,7 +1,6 @@ import argparse import collections import datetime -import functools import itertools import timeit @@ -380,9 +379,16 @@ def __init__(self): np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), ] + + # 0.99920089 0.94194469 + rs = np.random.RandomState(0) self.arrays_1d_large = [ - np.arange(100_000).astype(object), - np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), + np.arange(100_000).astype(object), # All unique 0.73636183 0.73142613 + np.full(100_000, fill_value='abc').astype(object), # All duplicated 0.99341718 1.07130567 + rs.randint(0, 100, 100_000).astype(object), # Many repeated elements from small subset 0.96812477 0.97921523 + rs.randint(0, 10_000, 100_000).astype(object), # Many repeated elements from medium subset 1.05508269 0.9765244 + rs.randint(0, 75_000, 100_000).astype(object), # Some repeated elements from a large subset 0.81474696 0.89746359 + np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), # Custom 0.84165586 0.86117453 ] self.arrays_2d_small = [ @@ -395,28 +401,33 @@ def __init__(self): np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object), ] - self.prd_1d = functools.partial(itertools.product, (True, False), (True, False)) - self.prd_2d = functools.partial(itertools.product, (0, 1), (True, False), (True, False)) - def array_1d_small(self): for _ in range(10000): - for exclude_first, exclude_last, arr in self.prd_1d(self.arrays_1d_small): - self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) + for arr in self.arrays_1d_small: + self.entry(arr, 0, False, False) + self.entry(arr, 0, True, False) + self.entry(arr, 0, False, True) def array_1d_large(self): - for _ in range(12): - for exclude_first, exclude_last, arr in self.prd_1d(self.arrays_1d_large): - self.entry(arr, exclude_first=exclude_first, exclude_last=exclude_last) + for _ in range(5): + for arr in self.arrays_1d_large: + self.entry(arr, 0, False, False) + self.entry(arr, 0, True, False) + self.entry(arr, 0, False, True) def array_2d_small(self): for _ in range(5000): - for axis, exclude_first, exclude_last, arr in self.prd_2d(self.arrays_2d_small): - self.entry(arr, axis, exclude_first, exclude_last) + for axis, arr in itertools.product((0, 1), self.arrays_2d_small): + self.entry(arr, axis, False, False) + self.entry(arr, axis, True, False) + self.entry(arr, axis, False, True) def array_2d_large(self): for _ in range(12): - for axis, exclude_first, exclude_last, arr in self.prd_2d(self.arrays_2d_large): - self.entry(arr, axis, exclude_first, exclude_last) + for axis, arr in itertools.product((0, 1), self.arrays_2d_large): + self.entry(arr, axis, False, False) + self.entry(arr, axis, True, False) + self.entry(arr, axis, False, True) class ArrayToDuplicatedHashablePerfAK(ArrayToDuplicatedHashablePerf): @@ -466,7 +477,7 @@ def main(): number=cls_runner.NUMBER) records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak'])) - width = 24 + width = 36 for record in records: print(''.join( (r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record diff --git a/profile.py b/profile.py new file mode 100644 index 00000000..0c849fa9 --- /dev/null +++ b/profile.py @@ -0,0 +1,70 @@ +import numpy as np +import sys +from arraykit import array_to_duplicated_hashable + + +def main(setup): + if setup == 'small_1d': + ITERATIONS = 5_000 + axes = (0,) + arrays = [ + np.array([0,0,1,0,None,None,0,1,None], dtype=object), + np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object), + np.array(['q','q','q', 'a', 'w', 'w'], dtype=object), + np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object), + ] + + elif setup == 'large_1d': + ITERATIONS = 10 + axes = (0,) + + rs = np.random.RandomState(0) + arrays = [ + np.arange(100_000).astype(object), # All unique + np.full(100_000, fill_value='abc').astype(object), # All duplicated + rs.randint(0, 100, 100_000).astype(object), # Many repeated elements from small subset + rs.randint(0, 10_000, 100_000).astype(object), # Many repeated elements from medium subset + rs.randint(0, 75_000, 100_000).astype(object), # Some repeated elements from a large subset + np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), # Custom + ] + + elif setup == 'small_2d': + ITERATIONS = 5_000 + axes = (0, 1) + arrays = [ + np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object), + np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object), + np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object), + ] + + elif setup == 'large_2d': + ITERATIONS = 10 + axes = (0, 1) + arrays = [ + np.arange(100_000).reshape(10_000, 10).astype(object), + np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object), + ] + + else: + assert False, "Impossible state!" + + for _ in range(ITERATIONS): + for arr in arrays: + for axis in axes: + array_to_duplicated_hashable(arr, axis, True, False) + array_to_duplicated_hashable(arr, axis, False, True) + array_to_duplicated_hashable(arr, axis, False, False) + + +if __name__ == '__main__': + try: + setup = sys.argv[1] + assert setup in ('small_1d', 'large_1d', 'small_2d', 'large_2d') + except IndexError: + print('Expected a setup arg!') + sys.exit(1) + except AssertionError: + print(f'Invalid setup arg: {setup}') + sys.exit(1) + + main(setup) diff --git a/src/_arraykit.c b/src/_arraykit.c index 5e31f474..2c44caee 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -581,9 +581,11 @@ AK_handle_value_include_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_d PyObject *idx = PyLong_FromLong(i); if (!idx) { return -1; } - int set_success = PyDict_SetItem(last_duplicate_locations, value, idx); - Py_DECREF(idx); - return set_success; // -1 on failure, 0 on success + int success = PyDict_SetItem(last_duplicate_locations, value, idx); + if (success == -1) { + Py_DECREF(idx); + } + return success; // -1 on failure, 0 on success } static int @@ -621,9 +623,11 @@ AK_handle_value_exclude_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_d return -1; } - int set_success = PyDict_SetItem(first_unique_locations, value, idx); - Py_DECREF(idx); - return set_success; // -1 on failure, 0 on success + int success = PyDict_SetItem(first_unique_locations, value, idx); + if (success == -1) { + Py_DECREF(idx); + } + return success; // -1 on failure, 0 on success } is_dup[i] = NPY_TRUE; @@ -635,7 +639,7 @@ AK_handle_value_exclude_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_d } if (found == 0) { - PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, value); + PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, value); // Borrowed! if (!first_unique_location) { return -1; } @@ -643,8 +647,6 @@ AK_handle_value_exclude_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_d if (idx == -1) { return -1; // -1 always means failure since no locations are negative } - Py_DECREF(first_unique_location); - is_dup[idx] = NPY_TRUE; } @@ -947,8 +949,6 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k if (idx == -1) { goto failure; // -1 always means failure since no locations are negative } - Py_DECREF(value); - is_dup_array[idx] = NPY_FALSE; } } From 9fe0d58878f92a7a76f3e23f347abfec9dbd8da1 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Mon, 21 Jun 2021 14:55:31 -0700 Subject: [PATCH 18/18] Some misc updates for profiling. --- .gitignore | 3 +++ setup.py | 2 ++ src/_arraykit.c | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c7250842..384df630 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ build *.diff *.orig callgrind* +*.dot +*.out +*.prof diff --git a/setup.py b/setup.py index d3062396..7a293c33 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,8 @@ def get_long_description() -> str: ak_extension = Extension( name='arraykit._arraykit', # build into module sources=['src/_arraykit.c'], + #extra_link_args=['-Wl,--no-as-needed,-lprofiler,--as-needed'], # Uncomment this to use gperftools + #extra_compile_args = ["-O0"], # Uncomment this to provide more debug symbols. **additional_info, ) diff --git a/src/_arraykit.c b/src/_arraykit.c index 2c44caee..5ee9069a 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -7,7 +7,6 @@ # include "numpy/arrayobject.h" # include "numpy/arrayscalars.h" # include "numpy/halffloat.h" -// # include "gperftools/profiler.h" //------------------------------------------------------------------------------ // Macros @@ -856,7 +855,6 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k - If exclude_last is True, the we iterate right-to-left, ensuring the last observation of each unique is reported as such, with every subsequent duplicate observation being marked as a duplicate */ - // ProfilerStart("/home/burkland/github/arraykit/arraykit.prof"); PyArrayObject *array = NULL; int axis = 0; int exclude_first = 0; @@ -955,13 +953,11 @@ array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *k Py_XDECREF(dict_obj); Py_DECREF(set_obj); - // ProfilerStop(); return (PyObject *)is_dup; failure: Py_XDECREF(dict_obj); Py_DECREF(set_obj); - // ProfilerStop(); return NULL; }