From 0b98dfc5904069ebc621d091b8c18f0f28b92d0a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 16 Dec 2024 15:19:57 +0100 Subject: [PATCH] ensure that NULL and 0 hashes to different numbers --- .../arrow/compute/kernels/scalar_hash_test.cc | 18 ++++++++++++++++++ cpp/src/arrow/compute/key_hash_internal.cc | 14 ++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_hash_test.cc b/cpp/src/arrow/compute/kernels/scalar_hash_test.cc index 49ce80839dd7c..a2a2ce1ab69ab 100644 --- a/cpp/src/arrow/compute/kernels/scalar_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_hash_test.cc @@ -172,6 +172,24 @@ TEST_F(TestScalarHash, Null) { CheckDeterminisic("hash64", arr); } +TEST_F(TestScalarHash, NullHashIsZero) { + auto arr1 = ArrayFromJSON(int32(), R"([null, 0, 1])"); + ASSERT_OK_AND_ASSIGN(auto res1, CallFunction("hash64", {arr1})); + auto buf1 = res1.array()->GetValues(1); + ASSERT_EQ(buf1[0], 0); + ASSERT_NE(buf1[1], 0); + ASSERT_NE(buf1[2], 0); + ASSERT_NE(buf1[1], buf1[2]); + + auto arr2 = ArrayFromJSON(int8(), R"([null, 0, 1])"); + ASSERT_OK_AND_ASSIGN(auto res2, CallFunction("hash32", {arr2})); + auto buf2 = res2.array()->GetValues(1); + ASSERT_EQ(buf2[0], 0); + ASSERT_NE(buf2[1], 0); + ASSERT_NE(buf2[2], 0); + ASSERT_NE(buf2[1], buf2[2]); +} + TEST_F(TestScalarHash, Boolean) { auto arr = ArrayFromJSON(boolean(), R"([true, false, null, true, null])"); CheckDeterminisic("hash32", arr); diff --git a/cpp/src/arrow/compute/key_hash_internal.cc b/cpp/src/arrow/compute/key_hash_internal.cc index a0002efb3faf3..f1b034acbf86a 100644 --- a/cpp/src/arrow/compute/key_hash_internal.cc +++ b/cpp/src/arrow/compute/key_hash_internal.cc @@ -298,10 +298,11 @@ void Hashing32::HashBit(bool combine_hashes, int64_t bit_offset, uint32_t num_ke template void Hashing32::HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes) { - constexpr uint64_t multiplier = 11400714785074694791ULL; + constexpr uint64_t kMultiplier = 11400714785074694791ULL; + constexpr uint64_t kAddend = 9756277977048271785ULL; for (uint32_t ikey = 0; ikey < num_keys; ++ikey) { - uint64_t x = static_cast(keys[ikey]); - uint32_t hash = static_cast(BYTESWAP(x * multiplier)); + uint64_t x = static_cast(keys[ikey]) + kAddend; + uint32_t hash = static_cast(BYTESWAP(x * kMultiplier)); if (T_COMBINE_HASHES) { hashes[ikey] = CombineHashesImp(hashes[ikey], hash); @@ -751,10 +752,11 @@ void Hashing64::HashBit(bool combine_hashes, int64_t bit_offset, uint32_t num_ke template void Hashing64::HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes) { - constexpr uint64_t multiplier = 11400714785074694791ULL; + constexpr uint64_t kMultiplier = 11400714785074694791ULL; + constexpr uint64_t kAddend = 9756277977048271785ULL; for (uint32_t ikey = 0; ikey < num_keys; ++ikey) { - uint64_t x = static_cast(keys[ikey]); - uint64_t hash = static_cast(BYTESWAP(x * multiplier)); + uint64_t x = static_cast(keys[ikey]) + kAddend; + uint64_t hash = static_cast(BYTESWAP(x * kMultiplier)); if (T_COMBINE_HASHES) { hashes[ikey] = CombineHashesImp(hashes[ikey], hash);