17/array to duplicated #54

Draft: wants to merge 18 commits into base: master

Changes from all commits (18 commits)
06d9edf
Adds reference code, tests, and performance python code for array_to_d…
chaburkland Jun 3, 2021
0aac1db
Removes leading underscore from name. Adds initial C scaffolding.
chaburkland Jun 3, 2021
20e0577
Adds working initial approach for 1d iteration with no constraints.
chaburkland Jun 3, 2021
024a7b9
Another iteration to support 1d arrays with constraints.
chaburkland Jun 3, 2021
27413a1
Cleans up error handling and optimizes debug python reference code.
chaburkland Jun 3, 2021
6a285c9
Prepares code for function delegation to abstract iteration from proc…
chaburkland Jun 4, 2021
4fd0b43
Updates C to abstract iteration from process. (Untested).
chaburkland Jun 4, 2021
c814929
Gets 1d iteration fully working for all paths, including reverse! Ext…
chaburkland Jun 4, 2021
f016f24
Adds more comments.
chaburkland Jun 4, 2021
3271618
Implements reverse iteration for 2d c contiguous arrays.
chaburkland Jun 4, 2021
5bc4312
Moves construction of 2d array iter object to own function. 2d array …
chaburkland Jun 4, 2021
58fdc88
Updates performance test and other misc changes.
chaburkland Jun 4, 2021
43069c6
Assigns into is_dup directly instead of using GETPTR1
chaburkland Jun 4, 2021
29166fd
Greatly simplifies 2d iteration logic by ensuring array memory layout…
chaburkland Jun 5, 2021
6986fd3
Misc cleanup to source.
chaburkland Jun 17, 2021
b36009b
Removes unnecessary compile/link args.
chaburkland Jun 17, 2021
1e63bd1
Fixes some memory bugs. Updates performance benchmarks. Adds profilin…
chaburkland Jun 18, 2021
9fe0d58
Some misc updates for profiling.
chaburkland Jun 21, 2021
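
For context, a minimal usage sketch of the function this PR adds. The import path follows debug.py below; the expected outputs are hand-derived from the reference implementation in performance/reference/util.py, not from running the draft C extension.

import numpy as np
from arraykit import array_to_duplicated_hashable

arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object)

# default: mark every occurrence of any value that appears more than once
array_to_duplicated_hashable(arr, 0, False, False)
# -> [ True  True  True  True False  True False]

# exclude_first=True: the first occurrence of each duplicated value stays False
array_to_duplicated_hashable(arr, 0, True, False)
# -> [False False  True  True False  True False]

# exclude_last=True: the last occurrence of each duplicated value stays False
array_to_duplicated_hashable(arr, 0, False, True)
# -> [ True  True  True False False False False]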
4 changes: 4 additions & 0 deletions .gitignore
@@ -10,3 +10,7 @@ __pycache__
build
*.diff
*.orig
callgrind*
*.dot
*.out
*.prof
202 changes: 202 additions & 0 deletions debug.py
@@ -0,0 +1,202 @@
from arraykit import array_to_duplicated_hashable
import numpy as np

class PO:
def __init__(self, v) -> None:
self.v = v
def __repr__(self) -> str:
return f'PO<{self.v}>'


def handle_value_one_boundary(i, value, is_dup, set_obj, dict_obj):
seen = set_obj
    assert dict_obj is None

if value not in seen:
seen.add(value)
else:
is_dup[i] = True


def handle_value_exclude_boundaries(i, value, is_dup, set_obj, dict_obj):
duplicates = set_obj
first_unique_locations = dict_obj

if value not in first_unique_locations:
first_unique_locations[value] = i
else:
is_dup[i] = True

# Second time seeing a duplicate
if value not in duplicates:
is_dup[first_unique_locations[value]] = True

        # always record this value as a known duplicate
duplicates.add(value)


def handle_value_include_boundaries(i, value, is_dup, set_obj, dict_obj):
seen = set_obj
last_duplicate_locations = dict_obj

if value not in seen:
seen.add(value)
else:
is_dup[i] = True

# always update last
last_duplicate_locations[value] = i


def iterate_1d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj):
if reverse:
iterator = reversed(array)
else:
iterator = array

size = len(array)

for i, value in enumerate(iterator):
if reverse:
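            # map the reversed enumeration index back to the original position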
i = size - i - 1

process_value_func(i, value, is_dup, set_obj, dict_obj)


def iterate_2d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj):
size = array.shape[axis]

if axis == 0:
iterator = array
else:
iterator = array.T

if reverse:
iterator = reversed(iterator)

for i, value in enumerate(map(tuple, iterator)):
if reverse:
i = size - i - 1

process_value_func(i, value, is_dup, set_obj, dict_obj)


def python_impl(
array: np.ndarray,
axis: int = 0,
exclude_first: bool = False,
exclude_last: bool = False,
) -> np.ndarray:
    '''
    Algorithm for finding duplicates in unsortable arrays of hashables. Such an array will always be an object array.

    Note:
        np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to a set directly.
    '''
size = array.shape[axis]

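    # excluding only the last occurrence mirrors excluding only the first:
    # iterate in reverse and reuse the single-boundary handler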
reverse = not exclude_first and exclude_last

if array.ndim == 1:
iterate_func = iterate_1d
else:
iterate_func = iterate_2d

is_dup = np.full(size, False)

set_obj = set()
if exclude_first ^ exclude_last:
dict_obj = None
process_value_func = handle_value_one_boundary

elif not exclude_first and not exclude_last:
dict_obj = dict()
process_value_func = handle_value_exclude_boundaries

else:
dict_obj = dict()
process_value_func = handle_value_include_boundaries

iterate_func(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj)

if exclude_first and exclude_last:
is_dup[list(dict_obj.values())] = False

return is_dup


def dprint(*args, debug):
'''Debug print'''
if debug:
print(*args)


def run_test(array, debug=True):
def _test(*args):
dprint(args[1:], debug=debug)

python_result = python_impl(*args)
dprint('python:', python_result, debug=debug)

        c_result = array_to_duplicated_hashable(*args)
dprint('c :', c_result, debug=debug)
assert (python_result == c_result).all()

    _test(array, 0, True, False)   # one boundary (forward: exclude_first)
    _test(array, 0, False, False)  # no exclusions: mark all occurrences
    _test(array, 0, False, True)   # one boundary (reverse: exclude_last)
    _test(array, 0, True, True)    # exclude both boundaries

if len(array.shape) == 2:
_test(array, 1, True, False)
_test(array, 1, False, False)
_test(array, 1, False, True)
_test(array, 1, True, True)


def test_arr1d(debug=True):
arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object)

# Test with normally constructed array
run_test(arr, debug=debug)

arr2d = np.array([[2, 1, 2],
[3, 2, 3],
[3, 2, 3],
[2, 1, 2],
[4, 3, 4],
[3, 2, 3],
[6, 6, 6]], dtype=object)

# Test with array slices
run_test(arr2d[:, 1], debug=debug)
run_test(arr2d.T[1], debug=debug)


def test_arr2d(debug=True):
arr2d = np.array([
[1, 2, 2, 1, 3, 2, 6],
[2, 3, 3, 2, 4, 3, 6],
[2, 3, 3, 2, 4, 3, 6],
[1, 2, 2, 1, 3, 2, 6],
[3, 4, 4, 3, 5, 4, 6],
[2, 3, 3, 2, 4, 3, 6],
], dtype=object)

run_test(arr2d, debug=debug)
run_test(arr2d.T, debug=debug)


def test_misc(debug=True):
arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3, 2, -1, -233, 'aslkj', 'df', 'df', True, True, None, 1])
run_test(arr, debug=debug)

arr = np.arange(20).reshape(4, 5).astype(object)
run_test(arr, debug=debug)
run_test(arr.T, debug=debug)


test_arr1d(debug=False)
test_arr2d(debug=False)
test_misc(debug=False)
print('Done')
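
A quick sketch of the 2d semantics exercised above, evaluated by hand against python_impl: with axis=0 each row is hashed as a tuple; with axis=1 each column.

arr2d = np.array([[1, 2],
                  [3, 4],
                  [1, 2]], dtype=object)

# axis=0: rows (1, 2), (3, 4), (1, 2) -> the first and last rows duplicate each other
python_impl(arr2d, axis=0)
# -> [ True False  True]

# axis=1: columns (1, 3, 1) and (2, 4, 2) are distinct -> no duplicates
python_impl(arr2d, axis=1)
# -> [False False]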
82 changes: 80 additions & 2 deletions performance/__main__.py
@@ -1,7 +1,8 @@
import argparse
import collections
import datetime
import itertools
import timeit

import numpy as np

@@ -17,6 +18,7 @@
from performance.reference.util import dtype_from_element as dtype_from_element_ref
from performance.reference.util import array_deepcopy as array_deepcopy_ref
from performance.reference.util import isna_element as isna_element_ref
from performance.reference.util import array_to_duplicated_hashable as array_to_duplicated_hashable_ref

from performance.reference.array_go import ArrayGO as ArrayGOREF

@@ -32,6 +34,7 @@
from arraykit import dtype_from_element as dtype_from_element_ak
from arraykit import array_deepcopy as array_deepcopy_ak
from arraykit import isna_element as isna_element_ak
from arraykit import array_to_duplicated_hashable as array_to_duplicated_hashable_ak

from arraykit import ArrayGO as ArrayGOAK

@@ -359,6 +362,81 @@ class IsNaElementPerfREF(IsNaElementPerf):
entry = staticmethod(isna_element_ref)


#-------------------------------------------------------------------------------
class ArrayToDuplicatedHashablePerf(Perf):
NUMBER = 3
FUNCTIONS = (
'array_1d_small',
'array_1d_large',
'array_2d_small',
'array_2d_large',
)

def __init__(self):
self.arrays_1d_small = [
np.array([0,0,1,0,None,None,0,1,None], dtype=object),
np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object),
np.array(['q','q','q', 'a', 'w', 'w'], dtype=object),
np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object),
]

# 0.99920089 0.94194469
rs = np.random.RandomState(0)
self.arrays_1d_large = [
np.arange(100_000).astype(object), # All unique 0.73636183 0.73142613
np.full(100_000, fill_value='abc').astype(object), # All duplicated 0.99341718 1.07130567
rs.randint(0, 100, 100_000).astype(object), # Many repeated elements from small subset 0.96812477 0.97921523
rs.randint(0, 10_000, 100_000).astype(object), # Many repeated elements from medium subset 1.05508269 0.9765244
rs.randint(0, 75_000, 100_000).astype(object), # Some repeated elements from a large subset 0.81474696 0.89746359
np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), # Custom 0.84165586 0.86117453
]

self.arrays_2d_small = [
np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object),
]
self.arrays_2d_large = [
np.arange(100_000).reshape(10_000, 10).astype(object),
np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object),
]

def array_1d_small(self):
for _ in range(10000):
for arr in self.arrays_1d_small:
self.entry(arr, 0, False, False)
self.entry(arr, 0, True, False)
self.entry(arr, 0, False, True)

def array_1d_large(self):
for _ in range(5):
for arr in self.arrays_1d_large:
self.entry(arr, 0, False, False)
self.entry(arr, 0, True, False)
self.entry(arr, 0, False, True)

def array_2d_small(self):
for _ in range(5000):
for axis, arr in itertools.product((0, 1), self.arrays_2d_small):
self.entry(arr, axis, False, False)
self.entry(arr, axis, True, False)
self.entry(arr, axis, False, True)

def array_2d_large(self):
for _ in range(12):
for axis, arr in itertools.product((0, 1), self.arrays_2d_large):
self.entry(arr, axis, False, False)
self.entry(arr, axis, True, False)
self.entry(arr, axis, False, True)


class ArrayToDuplicatedHashablePerfAK(ArrayToDuplicatedHashablePerf):
entry = staticmethod(array_to_duplicated_hashable_ak)

class ArrayToDuplicatedHashablePerfREF(ArrayToDuplicatedHashablePerf):
entry = staticmethod(array_to_duplicated_hashable_ref)


#-------------------------------------------------------------------------------

def get_arg_parser():
@@ -399,7 +477,7 @@ def main():
number=cls_runner.NUMBER)
records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak']))

    width = 36
for record in records:
print(''.join(
(r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record
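
For orientation, the harness times each name in FUNCTIONS on both subclasses and reports the ref/ak ratio, where a value above 1.0 means the C implementation is faster. A minimal sketch of that comparison, assuming the classes above are instantiated directly:

import timeit

ak = ArrayToDuplicatedHashablePerfAK()
ref = ArrayToDuplicatedHashablePerfREF()

for func_name in ArrayToDuplicatedHashablePerf.FUNCTIONS:
    t_ak = timeit.timeit(getattr(ak, func_name), number=ArrayToDuplicatedHashablePerf.NUMBER)
    t_ref = timeit.timeit(getattr(ref, func_name), number=ArrayToDuplicatedHashablePerf.NUMBER)
    print(func_name.ljust(36), round(t_ref / t_ak, 8))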
60 changes: 60 additions & 0 deletions performance/reference/util.py
@@ -216,3 +216,63 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype:
# NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls
return np.array(value).dtype


#-------------------------------------------------------------------------------
# tools for handling duplicates

def array_to_duplicated_hashable(
array: np.ndarray,
axis: int = 0,
exclude_first: bool = False,
exclude_last: bool = False,
) -> np.ndarray:
    '''
    Algorithm for finding duplicates in unsortable arrays of hashables. Such an array will always be an object array.
    '''
    # np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to a set directly.
len_axis = array.shape[axis]

if array.ndim == 1:
value_source = array
to_hashable = None
else:
if axis == 0:
value_source = array # will iterate rows
else:
value_source = (array[:, i] for i in range(len_axis))
# values will be arrays; must convert to tuples to make hashable
to_hashable = tuple

is_dupe = np.full(len_axis, False)

    # a set alone could exit early, but the whole array would have to be hashed twice to populate both a set and a dictionary
    # creating a list for each entry and tracking indices would be very expensive

    unique_to_first: tp.Dict[tp.Hashable, int] = {} # value to index of first occurrence
dupe_to_first: tp.Dict[tp.Hashable, int] = {}
dupe_to_last: tp.Dict[tp.Hashable, int] = {}

for idx, v in enumerate(value_source):

if to_hashable:
v = to_hashable(v)

if v not in unique_to_first:
unique_to_first[v] = idx
else:
            # v has been seen before; update the Boolean array
            is_dupe[idx] = True

            # if there is no entry in dupe_to_first, add one from unique_to_first: the index at which this value was first seen
if v not in dupe_to_first:
dupe_to_first[v] = unique_to_first[v]
# always update last
dupe_to_last[v] = idx

if exclude_last: # overwrite with False
is_dupe[list(dupe_to_last.values())] = False

if not exclude_first: # add in first values
is_dupe[list(dupe_to_first.values())] = True

return is_dupe
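
A short worked trace of the bookkeeping above, hand-checked against the loop rather than taken from the PR:

# arr = [1, 2, 2, 1, 2] (object dtype)
#   idx 0, v=1: unseen -> unique_to_first = {1: 0}
#   idx 1, v=2: unseen -> unique_to_first = {1: 0, 2: 1}
#   idx 2, v=2: seen -> is_dupe[2] = True; dupe_to_first = {2: 1}; dupe_to_last = {2: 2}
#   idx 3, v=1: seen -> is_dupe[3] = True; dupe_to_first = {2: 1, 1: 0}; dupe_to_last = {2: 2, 1: 3}
#   idx 4, v=2: seen -> is_dupe[4] = True; dupe_to_last = {2: 4, 1: 3}
arr = np.array([1, 2, 2, 1, 2], dtype=object)
array_to_duplicated_hashable(arr)                     # -> [ True  True  True  True  True]
array_to_duplicated_hashable(arr, exclude_last=True)  # -> [ True  True  True False False]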