Fixes np.unique on SparseArray #19651

Closed
wants to merge 11 commits
45 changes: 32 additions & 13 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -251,25 +251,39 @@ cdef class HashTable:
{{py:

# name, dtype, null_condition, float_group
Reviewer comment (Contributor): update the comment

dtypes = [('Float64', 'float64', 'val != val', True),
('UInt64', 'uint64', 'False', False),
('Int64', 'int64', 'val == iNaT', False)]
dtypes = [('Float64', 'float64', 'val != val', True, 'NAN'),
('UInt64', 'uint64', 'False', False, 'NAN'),
('Int64', 'int64', 'val == iNaT', False, 'iNaT')]

def get_dispatch(dtypes):
    for (name, dtype, null_condition, float_group) in dtypes:
    for (name, dtype, null_condition, float_group, na_value) in dtypes:
        unique_template = """\
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {dtype}_t val
            {dtype}_t val, fill_value_val, ngaps_val
            khiter_t k
            bint seen_na = 0
            {name}Vector uniques = {name}Vector()
            {name}VectorData *ud

        ud = uniques.data

        fill_value_val = fill_value
        ngaps_val = ngaps

        with nogil:
            # If this is a sparse structure, we also need to append
            # the fill value, assuming ngaps is greater than 0

            if ngaps_val > 0:
                k = kh_get_{dtype}(self.table, fill_value_val)
Inline comment (PR author): This is duplicated 3 times in this class. Would be nice to de-dupe this somehow without making it super complicated with string interpolation...

(One possible de-duplication approach is sketched after this hunk.)

                if k == self.table.n_buckets:
                    kh_put_{dtype}(self.table, fill_value_val, &ret)
                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{dtype}(ud, fill_value_val)

            for i in range(n):
                val = values[i]
                IF {float_group}:
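On the author's de-duplication question above: one hedged option, sketched here as an editor's illustration rather than anything in the PR, is to render the fill-value block once per dtype in a helper and splice the result into each method template through a hypothetical {fill_value_block} placeholder.

# Editor's sketch: factor the duplicated fill-value handling into one
# helper. The helper and the {fill_value_block} placeholder are
# hypothetical; the PR itself repeats this block in each template.
def make_fill_value_block(dtype):
    template = """\
            if ngaps_val > 0:
                k = kh_get_{dtype}(self.table, fill_value_val)
                if k == self.table.n_buckets:
                    kh_put_{dtype}(self.table, fill_value_val, &ret)
                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{dtype}(ud, fill_value_val)
"""
    return template.format(dtype=dtype)

# Each method template would then contain {fill_value_block} where the
# duplicated code currently lives, e.g.:
# unique_template = unique_template.format(
#     fill_value_block=make_fill_value_block(dtype), name=name, ...)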
@@ -300,11 +314,11 @@ def get_dispatch(dtypes):

        unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)

        yield (name, dtype, null_condition, float_group, unique_template)
        yield (name, dtype, null_condition, float_group, unique_template, na_value)
}}


{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
{{for name, dtype, null_condition, float_group, unique_template, na_value in get_dispatch(dtypes)}}

cdef class {{name}}HashTable(HashTable):

@@ -405,22 +419,27 @@ cdef class {{name}}HashTable(HashTable):
        labels = self.get_labels(values, uniques, 0, 0)
        return uniques.to_array(), labels

    # This seems like duplicate code from def unique to me...
    # Why does this exist?
    @cython.boundscheck(False)
    def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
                   bint check_null=True):
                   bint check_null=True, fill_value={{na_value}}, ngaps=0):
        cdef:
            Py_ssize_t i, n = len(values)
            int64_t[:] labels
            Py_ssize_t idx, count = count_prior
            int ret = 0
            {{dtype}}_t val
            {{dtype}}_t val, fill_value_val, ngaps_val
            khiter_t k
            {{name}}VectorData *ud

        labels = np.empty(n, dtype=np.int64)
        ud = uniques.data

        if ngaps > 0:
            # debugging placeholder; the fill value is not yet handled
            # here the way it is in unique()
            print("Hello world")

        with nogil:
            for i in range(n):
                val = values[i]
@@ -496,10 +515,10 @@ cdef class {{name}}HashTable(HashTable):
        return np.asarray(labels), arr_uniques

    @cython.boundscheck(False)
    def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
    def unique(self, ndarray[{{dtype}}_t, ndim=1] values, fill_value={{na_value}}, ngaps=0):
        if values.flags.writeable:
            # If the values are writeable (mutable), use the memoryview version
            return self.unique_memview(values)
            return self.unique_memview(values, fill_value=fill_value, ngaps=ngaps)

        # We cannot use the memoryview version on readonly-buffers due to
        # a limitation of Cython's typed memoryviews. Instead we can use
@@ -508,7 +527,7 @@ cdef class {{name}}HashTable(HashTable):
{{unique_template}}

    @cython.boundscheck(False)
    def unique_memview(self, {{dtype}}_t[:] values):
    def unique_memview(self, {{dtype}}_t[:] values, fill_value={{na_value}}, ngaps=0):
{{unique_template}}

{{endfor}}
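In plain Python, the uniques logic these templates generate behaves roughly like the following model (an editor's sketch; the hashtable details are elided):

def unique_with_fill(sp_values, fill_value, ngaps):
    # Simplified model of the generated Cython: when the sparse array
    # has gaps, seed the uniques with the fill value, then append each
    # stored value the first time it is seen.
    seen = set()
    uniques = []
    if ngaps > 0:
        seen.add(fill_value)
        uniques.append(fill_value)
    for val in sp_values:
        if val not in seen:
            seen.add(val)
            uniques.append(val)
    return uniques

# e.g. a sparse [0, 0, 1, 2] with fill_value=0 stores sp_values=[1, 2]
# and ngaps=2, so unique_with_fill([1, 2], 0, 2) == [0, 1, 2]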
18 changes: 15 additions & 3 deletions pandas/core/algorithms.py
@@ -10,7 +10,8 @@
    maybe_promote, construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
    ABCSeries, ABCIndex,
    ABCIndexClass, ABCCategorical)
    ABCIndexClass, ABCCategorical,
    ABCSparseArray)
from pandas.core.dtypes.common import (
    is_unsigned_integer_dtype, is_signed_integer_dtype,
    is_integer_dtype, is_complex_dtype,
@@ -362,7 +363,12 @@ def unique(values):
    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

    table = htable(len(values))
    uniques = table.unique(values)

    if isinstance(values, ABCSparseArray):
Reviewer comment (Contributor): Not sure you saw my comment. This should not be handling unique like this. SparseArray should have a method .unique() which can directly handle the unique (and then call algorithms.unique on the sparse values).

Reviewer comment (Contributor): IOW you might be able to get away with doing a

if hasattr(values, 'unique'):
    return values.unique()
...

need to avoid recursion, but here values must be an ndarray or an EA, including Categorical (and NOT a Series).

cc @TomAugspurger

Reply (PR author): Hrm, I'll have to think about this... I'm hesitant to do this mainly because this code seems to rely on the fact that it always outputs an ndarray, which is why groupby doesn't work with sparse data.

The problem really is with classes that implement their own unique(). It usually outputs something that isn't an ndarray.

Also I don't want to call unique on the class and cast it back to ndarray because that feels sloppy. I'll think of something :) thanks!

Reviewer comment (Contributor): @hexgnu have you been following the ExtensionArray stuff? Eventually SparseArray should be an ExtensionArray like Categorical, and things like groupby, factorize, etc. will all work correctly on EAs.

> Also I don't want to call unique on the class and cast it back to ndarray cause that feels sloppy.

SparseArray.unique() should return a SparseArray, just like Categorical.unique returns a Categorical.
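A hedged sketch of the suggested approach (an editor's illustration, not code from this PR): SparseArray.unique would call algorithms.unique on the dense sp_values, which is a plain ndarray, so there is no recursion, then account for the fill value itself and return the same type.

import numpy as np

def sparse_array_unique(arr):
    # Sketch only: `arr` is assumed to be a pandas SparseArray with the
    # sp_values / sp_index / fill_value attributes used elsewhere in
    # this PR.
    from pandas.core import algorithms

    # sp_values is a plain ndarray, so this cannot recurse back here.
    uniques = algorithms.unique(arr.sp_values)

    # If the array has gaps, the implicit fill value is a value too.
    # (A real implementation would need NaN-aware membership testing,
    # since `nan in uniques` is always False.)
    if arr.sp_index.ngaps > 0 and arr.fill_value not in uniques:
        uniques = np.append(uniques, arr.fill_value)

    # Per the review, return the same type, like Categorical.unique.
    return type(arr)(uniques)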

        uniques = table.unique(values, fill_value=values.fill_value,
                               ngaps=values.sp_index.ngaps)
    else:
        uniques = table.unique(values)
    uniques = _reconstruct_data(uniques, dtype, original)

    if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -469,7 +475,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    table = hash_klass(size_hint or len(values))
    uniques = vec_klass()
    check_nulls = not is_integer_dtype(original)
    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)

    if isinstance(values, ABCSparseArray):
        labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls,
                                  fill_value=values.fill_value,
                                  ngaps=values.sp_index.ngaps)
    else:
        labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)

    labels = _ensure_platform_int(labels)
    uniques = uniques.to_array()
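Taken together, the two call sites above give sparse inputs a fill-value-aware path. A hedged usage sketch of the intended behavior, assuming a build with this patch applied:

import numpy as np
import pandas as pd
from pandas.core import algorithms

# Two of the four positions are gaps stored implicitly as fill_value=0,
# so sp_values is [1, 2] and sp_index.ngaps is 2.
arr = pd.SparseArray([0, 0, 1, 2], fill_value=0)

# With the patch, the hashtable is seeded with the fill value whenever
# ngaps > 0, so the gaps' value is no longer dropped from the uniques.
print(algorithms.unique(arr))  # expected to contain 0, 1 and 2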
1 change: 1 addition & 0 deletions pandas/core/groupby.py
@@ -3024,6 +3024,7 @@ def is_in_obj(gpr):

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr

        ping = Grouping(group_axis,
                        gpr,
                        obj=obj,