Skip to content

Commit

Permalink
[FEAT] Add hashing and groupby on structs (#2657)
Browse files Browse the repository at this point in the history
Struct hashing is only meaningful within a single schema: structs with
different field names are not guaranteed to hash differently. The hash may
also differ depending on the order in which the fields are declared.
  • Loading branch information
Vince7778 authored Aug 15, 2024
1 parent d5aec24 commit bdf8aca
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 3 deletions.
8 changes: 7 additions & 1 deletion src/daft-core/src/array/ops/groups.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use arrow2::array::Array;
use fnv::FnvHashMap;

use crate::{
array::{DataArray, FixedSizeListArray, ListArray},
array::{DataArray, FixedSizeListArray, ListArray, StructArray},
datatypes::{
BinaryArray, BooleanArray, DaftIntegerType, DaftNumericType, FixedSizeBinaryArray,
Float32Array, Float64Array, NullArray, Utf8Array,
Expand Down Expand Up @@ -182,3 +182,9 @@ impl IntoGroups for FixedSizeListArray {
self.hash(None)?.make_groups()
}
}

impl IntoGroups for StructArray {
    /// Builds groups for a struct column from its row hashes.
    ///
    /// The struct is hashed without a seed and the resulting hash array's
    /// own `make_groups` produces the group indices.
    ///
    /// # Errors
    /// Propagates any error returned while hashing the struct or while
    /// grouping the hash array.
    fn make_groups(&self) -> DaftResult<super::GroupIndicesPair> {
        let row_hashes = self.hash(None)?;
        row_hashes.make_groups()
    }
}
22 changes: 20 additions & 2 deletions src/daft-core/src/array/ops/hash.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{
array::{DataArray, FixedSizeListArray, ListArray},
array::{DataArray, FixedSizeListArray, ListArray, StructArray},
datatypes::{
logical::{DateArray, Decimal128Array, TimeArray, TimestampArray},
BinaryArray, BooleanArray, DaftNumericType, FixedSizeBinaryArray, Int16Array, Int32Array,
Expand All @@ -12,7 +12,7 @@ use crate::{
};

use arrow2::types::Index;
use common_error::DaftResult;
use common_error::{DaftError, DaftResult};
use xxhash_rust::xxh3::{xxh3_64, xxh3_64_with_seed};

use super::as_arrow::AsArrow;
Expand Down Expand Up @@ -186,6 +186,24 @@ impl FixedSizeListArray {
}
}

impl StructArray {
    /// Hashes every row of the struct by chaining the hashes of its children.
    ///
    /// The first child is hashed with the caller-supplied `seed`; each later
    /// child is hashed using the previous child's hash array as its seed.
    /// The struct's own validity is applied to the final hash array.
    ///
    /// # Errors
    /// Returns `DaftError::ValueError` when the struct has no children, and
    /// propagates any error raised while hashing a child or applying validity.
    pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
        let mut fields = self.children.iter();

        // A struct without children has nothing to hash; report it before
        // doing any work.
        let first = fields.next().ok_or_else(|| {
            DaftError::ValueError("Cannot hash struct with no children".into())
        })?;

        // Thread the running hash through the remaining children as the seed.
        let chained =
            fields.try_fold(first.hash(seed)?, |prev, field| field.hash(Some(&prev)))?;

        chained.with_validity(self.validity().cloned())
    }
}

macro_rules! impl_int_murmur3_32 {
($ArrayT:ty) => {
impl $ArrayT {
Expand Down
1 change: 1 addition & 0 deletions src/daft-core/src/datatypes/matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ macro_rules! with_match_hashable_daft_types {(
FixedSizeBinary(_) => __with_ty__! { FixedSizeBinaryType },
List(_) => __with_ty__! { ListType },
FixedSizeList(_, _) => __with_ty__! { FixedSizeListType },
Struct(_) => __with_ty__! { StructType },
_ => panic!("{:?} not implemented", $key_type)
}
})}
Expand Down
113 changes: 113 additions & 0 deletions tests/series/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,119 @@ def test_hash_fixed_size_list_array_consistency(dtype):
assert hashed1 == hashed2


@pytest.mark.parametrize("dtype", daft_numeric_types)
@pytest.mark.parametrize("seed", [None, 123])
def test_hash_struct(dtype, seed):
    """Check struct hashing for equal rows, null rows and distinct rows.

    Rows that differ only in the key order of the input dict must hash
    equally, null structs must hash to None, and rows with different
    values must hash differently.
    """
    rows = [
        {"a": 1, "b": 2},
        {"a": 3, "b": 4},
        {"a": 1, "b": 4},
        {"a": 2, "b": 1},
        None,
        {"b": 2, "a": 1},
        {"b": 1, "a": 2},
        None,
    ]
    series = Series.from_pylist(rows)
    series = series.cast(DataType.struct({"a": dtype, "b": dtype}))

    # One seed per row when a seed is requested, otherwise no seed series.
    if seed is None:
        seed_series = None
    else:
        seed_series = Series.from_pylist([seed] * len(rows)).cast(DataType.uint64())

    hashes = series.hash(seed_series).to_pylist()

    # Rows 5 and 6 repeat rows 0 and 3 with the dict keys in the other order.
    assert hashes[0] == hashes[5]
    assert hashes[3] == hashes[6]
    # Null structs produce null hashes.
    assert hashes[4] is None and hashes[-1] is None

    # Every pair among these rows holds different values.
    distinct_rows = [0, 1, 2, 3]
    for pos, later in enumerate(distinct_rows):
        for earlier in distinct_rows[:pos]:
            assert hashes[later] != hashes[earlier]


@pytest.mark.parametrize("dtype", daft_numeric_types)
@pytest.mark.parametrize("seed", [None, 123])
def test_hash_struct_nested(dtype, seed):
    """Check hashing of structs that contain another struct as a field.

    Same expectations as the flat case: key order of the input dicts does
    not matter, null structs hash to None, and rows with different values
    (including swapped inner values) hash differently.
    """
    rows = [
        {"a": 1, "b": {"c": 1, "d": 2}},
        {"a": 3, "b": {"c": 3, "d": 4}},
        {"a": 1, "b": {"c": 3, "d": 4}},
        {"a": 2, "b": {"c": 1, "d": 2}},
        None,
        {"b": {"c": 1, "d": 2}, "a": 1},
        {"b": {"c": 1, "d": 2}, "a": 2},
        {"a": 1, "b": {"c": 2, "d": 1}},
        None,
    ]
    series = Series.from_pylist(rows)
    series = series.cast(DataType.struct({"a": dtype, "b": DataType.struct({"c": dtype, "d": dtype})}))

    # One seed per row when a seed is requested, otherwise no seed series.
    if seed is None:
        seed_series = None
    else:
        seed_series = Series.from_pylist([seed] * len(rows)).cast(DataType.uint64())

    hashes = series.hash(seed_series).to_pylist()

    # Rows 5 and 6 repeat rows 0 and 3 with the outer keys in the other order.
    assert hashes[0] == hashes[5]
    assert hashes[3] == hashes[6]
    # Null structs produce null hashes.
    assert hashes[4] is None and hashes[-1] is None

    # Row 7 swaps the inner values of row 0, so it must not collide with it.
    distinct_rows = [0, 1, 2, 3, 7]
    for pos, later in enumerate(distinct_rows):
        for earlier in distinct_rows[:pos]:
            assert hashes[later] != hashes[earlier]


@pytest.mark.parametrize("dtype", daft_numeric_types)
@pytest.mark.parametrize("seed", [None, 123])
def test_hash_struct_sublist(dtype, seed):
    """Check hashing of structs that contain a list field.

    Key order of the input dicts must not matter, null structs hash to
    None, and rows with different values (including a reordered list)
    hash differently.
    """
    rows = [
        {"a": 1, "b": [2, 3, 4]},
        {"a": 3, "b": [5, 6, 7]},
        {"a": 1, "b": [5, 6, 7]},
        {"a": 2, "b": [2, 3, 4]},
        None,
        {"b": [2, 3, 4], "a": 1},
        {"b": [2, 3, 4], "a": 2},
        {"a": 1, "b": [2, 4, 3]},
        None,
    ]
    series = Series.from_pylist(rows)
    series = series.cast(DataType.struct({"a": dtype, "b": DataType.list(dtype)}))

    # One seed per row when a seed is requested, otherwise no seed series.
    if seed is None:
        seed_series = None
    else:
        seed_series = Series.from_pylist([seed] * len(rows)).cast(DataType.uint64())

    hashes = series.hash(seed_series).to_pylist()

    # Rows 5 and 6 repeat rows 0 and 3 with the dict keys in the other order.
    assert hashes[0] == hashes[5]
    assert hashes[3] == hashes[6]
    # Null structs produce null hashes.
    assert hashes[4] is None and hashes[-1] is None

    # Row 7 holds the same list elements as row 0 in a different order.
    distinct_rows = [0, 1, 2, 3, 7]
    for pos, later in enumerate(distinct_rows):
        for earlier in distinct_rows[:pos]:
            assert hashes[later] != hashes[earlier]


@pytest.mark.parametrize("dtype", daft_numeric_types)
@pytest.mark.parametrize("seed", [None, 123])
def test_hash_struct_with_nones(dtype, seed):
    """Check hashing of structs whose fields may be None.

    A None field must hash differently from a zero field, identical rows
    with None fields must hash equally, and a null struct must hash to
    None rather than to the hash of an all-None struct.
    """
    rows = [
        {"a": 1, "b": 2},
        {"a": None, "b": 2},
        {"a": 0, "b": 2},
        {"a": None, "b": None},
        {"a": 0, "b": 0},
        None,
        {"a": None, "b": 2},
        None,
    ]
    series = Series.from_pylist(rows)
    series = series.cast(DataType.struct({"a": dtype, "b": dtype}))

    # One seed per row when a seed is requested, otherwise no seed series.
    if seed is None:
        seed_series = None
    else:
        seed_series = Series.from_pylist([seed] * len(rows)).cast(DataType.uint64())

    hashes = series.hash(seed_series).to_pylist()

    # Row 6 is an exact repeat of row 1.
    assert hashes[1] == hashes[6]
    # Null structs produce null hashes.
    assert hashes[5] is None and hashes[-1] is None

    # None fields, zero fields and non-zero fields must all be told apart.
    distinct_rows = [0, 1, 2, 3, 4]
    for pos, later in enumerate(distinct_rows):
        for earlier in distinct_rows[:pos]:
            assert hashes[later] != hashes[earlier]


@pytest.mark.parametrize(
"dtype",
[
Expand Down
22 changes: 22 additions & 0 deletions tests/table/test_table_aggs.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,3 +852,25 @@ def test_groupby_fixed_size_list(dtype) -> None:
expected = [[0, 1, 4], [2, 6], [3, 5]]
for lt in expected:
assert lt in res["b"]


@pytest.mark.parametrize("dtype", daft_numeric_types)
def test_groupby_struct(dtype) -> None:
    """Group by a struct column and check which rows land in each group.

    Column "b" holds the row index, so the list aggregated for each group
    names the rows that shared one struct key.
    """
    struct_keys = [
        {"c": 1, "d": "hi"},
        {"c": 1, "d": "hi"},
        {"c": 1, "d": "hello"},
        {"c": 2, "d": "hello"},
        {"c": 1, "d": "hi"},
        {"c": 2, "d": "hello"},
        {"c": 1, "d": "hello"},
    ]
    table = from_pydict({"a": struct_keys, "b": [0, 1, 2, 3, 4, 5, 6]})
    table = table.with_column("a", col("a").cast(DataType.struct({"c": dtype, "d": DataType.string()})))

    grouped = table.groupby("a").agg_list("b").to_pydict()

    # Only membership is checked, so the order of the groups does not matter.
    for members in ([0, 1, 4], [2, 6], [3, 5]):
        assert members in grouped["b"]

0 comments on commit bdf8aca

Please sign in to comment.