ray-project · richardliaw · Nov 15, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 15, 2024
@@ -32,7 +32,7 @@
 )
 from ray.data._internal.row import TableRow
 from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder
-from ray.data._internal.util import find_partitions
+from ray.data._internal.util import find_partitions, NULL_SENTINEL
 from ray.data.block import (
     Block,
     BlockAccessor,
@@ -500,7 +500,6 @@ def sort_and_partition(
         table = sort(self._table, sort_key)
         if len(boundaries) == 0:
             return [table]
-
         return find_partitions(table, boundaries, sort_key)
 
     def combine(self, sort_key: "SortKey", aggs: Tuple["AggregateFn"]) -> Block:
@@ -634,6 +633,10 @@ def key_fn(r):
             else:
                 return (0,)
 
+        def key_fn_with_null_sentinel(r):
+            # None cannot be ordered against real values; use the sentinel.
+            return [v if v is not None else NULL_SENTINEL for v in key_fn(r)]
+
         # Handle blocks of different types.
         blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow")
 
@@ -642,7 +645,7 @@ def key_fn(r):
                 ArrowBlockAccessor(block).iter_rows(public_row_format=False)
                 for block in blocks
             ],
-            key=key_fn,
+            key=key_fn_with_null_sentinel,
         )
         next_row = None
         builder = ArrowBlockBuilder()

@@ -23,7 +23,7 @@ def __init__(
         self,
         key: Optional[Union[str, List[str]]] = None,
         descending: Union[bool, List[bool]] = False,
-        boundaries: Optional[list] = None,
+        boundaries: Optional[List[T]] = None,
     ):
         if key is None:
             key = []
@@ -195,8 +195,8 @@ def sample_boundaries(
         samples_table = builder.build()
         samples_dict = BlockAccessor.for_block(samples_table).to_numpy(columns=columns)
         # This zip does the transposition from list of column values to list of tuples.
-        samples_list = sorted(zip(*samples_dict.values()))
-
+        # Use np.sort to sort None/NaNs effectively
+        samples_list = np.sort(list(zip(*samples_dict.values())), axis=0)
         # Each boundary corresponds to a quantile of the data.
         quantile_indices = [
             int(q * (len(samples_list) - 1))

@@ -55,6 +55,47 @@
 _pyarrow_dataset: LazyModule = None
 
 
+class _NullSentinel:
+    """Stand-in for null sort keys that orders after every other value.
+
+    Every instance compares equal to every other instance and hashes the
+    same, so a copy of the sentinel behaves exactly like ``NULL_SENTINEL``.
+    Comparisons written with the sentinel on the right (``value < sentinel``)
+    reach these methods through Python's reflected-operand fallback when
+    ``value``'s type returns ``NotImplemented`` for the foreign operand.
+    """
+
+    def __eq__(self, other):
+        return isinstance(other, _NullSentinel)
+
+    def __hash__(self):
+        # Defining __eq__ alone sets __hash__ to None; restore hashability
+        # with a value shared by all (mutually equal) instances.
+        return hash(_NullSentinel)
+
+    def __lt__(self, other):
+        # Nothing sorts after the sentinel, so it is never "less than".
+        return False
+
+    def __le__(self, other):
+        return isinstance(other, _NullSentinel)
+
+    def __gt__(self, other):
+        # Strictly greater than everything except another sentinel, which
+        # keeps ``>`` consistent with ``==`` and ``<=``.
+        return not isinstance(other, _NullSentinel)
+
+    def __ge__(self, other):
+        return True
+
+    def __repr__(self):
+        return "NULL_SENTINEL"
+
+
+# Shared instance substituted for null sort keys before comparing them.
+NULL_SENTINEL = _NullSentinel()
+
+
 def _lazy_import_pyarrow_dataset() -> LazyModule:
     global _pyarrow_dataset
     if _pyarrow_dataset is None:
@@ -723,6 +764,16 @@ def find_partition_index(
         col_vals = table[col_name].to_numpy()[left:right]
         desired_val = desired[i]
 
+        # Handle null values - replace them with sentinel values
+        if desired_val is None:
+            desired_val = NULL_SENTINEL
+
+        # Replace None/NaN values in col_vals with sentinel
+        null_mask = col_vals == None  # Handles both None and np.nan
+        if null_mask.any():
+            col_vals = col_vals.copy()  # Make a copy to avoid modifying original
+            col_vals[null_mask] = NULL_SENTINEL
+
         prevleft = left
         if descending is True:
             left = prevleft + (

@@ -123,6 +123,30 @@ def test_unique(ray_start_regular_shared):
         assert mock_validate.call_args_list[0].args[0].names == ["b"]
 
 
+@pytest.mark.parametrize("batch_format", ["pandas", "pyarrow"])
+def test_unique_with_nulls(ray_start_regular_shared, batch_format):
+    """Dataset.unique() returns None as one of the distinct values."""
+    scalar_ds = ray.data.from_items([3, 2, 3, 1, 2, 3, None])
+    assert set(scalar_ds.unique("item")) == {1, 2, 3, None}
+
+    rows = [
+        {"a": 1, "b": 1},
+        {"a": 1, "b": 2},
+        {"a": 1, "b": None},
+        {"a": None, "b": 3},
+        {"a": None, "b": 4},
+    ]
+    record_ds = ray.data.from_items(rows)
+    assert set(record_ds.unique("a")) == {1, None}
+    assert set(record_ds.unique("b")) == {1, 2, 3, 4, None}
+
+    # pandas' own unique() copes with a nullable Int64 column; Ray should too.
+    frame = pd.DataFrame({"col": [1, 2, 2, 3, None, 3, 2]}, dtype="Int64")
+    nullable_ds = ray.data.from_pandas(frame)
+    nullable_ds = nullable_ds.map_batches(lambda x: x, batch_format=batch_format)
+    assert set(nullable_ds.unique("col")) == {1, 2, 3, None}
+
+
 def test_grouped_dataset_repr(ray_start_regular_shared):
     ds = ray.data.from_items([{"key": "spam"}, {"key": "ham"}, {"key": "spam"}])
     assert repr(ds.groupby("key")) == f"GroupedData(dataset={ds!r}, key='key')"