From 01f46fedd0bf64a0fe4cd4ab176e3b1e4fff3951 Mon Sep 17 00:00:00 2001
From: Pindikura Ravindra
Date: Fri, 20 Jul 2018 08:27:06 +0530
Subject: [PATCH] GDV-28: [C++] Add hash functions on all data types (#69)
* GDV-28: [C++] Add hash functions on all data types
* GDV-28: Fix stylecheck in travis to print diff
* GDV-28: pick clang-format from llvm-binary dir
* GDV-28: handle case when seed is null
* GDV-28: [C++] Fix a style check
---
cpp/src/gandiva/codegen/function_registry.cc | 57 +++-
cpp/src/gandiva/integ/CMakeLists.txt | 1 +
cpp/src/gandiva/integ/hash_test.cc | 142 ++++++++++
cpp/src/gandiva/precompiled/CMakeLists.txt | 5 +-
cpp/src/gandiva/precompiled/hash.cc | 280 +++++++++++++++++++
cpp/src/gandiva/precompiled/hash_test.cc | 119 ++++++++
cpp/src/gandiva/precompiled/types.h | 6 +-
7 files changed, 601 insertions(+), 9 deletions(-)
create mode 100644 cpp/src/gandiva/integ/hash_test.cc
create mode 100644 cpp/src/gandiva/precompiled/hash.cc
create mode 100644 cpp/src/gandiva/precompiled/hash_test.cc
diff --git a/cpp/src/gandiva/codegen/function_registry.cc b/cpp/src/gandiva/codegen/function_registry.cc
index d67369aecb500..4107f55801737 100644
--- a/cpp/src/gandiva/codegen/function_registry.cc
+++ b/cpp/src/gandiva/codegen/function_registry.cc
@@ -90,6 +90,38 @@ using std::vector;
NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_IF_NULL, \
STRINGIFY(NAME##_##TYPE))
+// Hash32 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32_int8
+#define HASH32_SAFE_NULL_NEVER(NAME, TYPE) \
+ NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), true, RESULT_NULL_NEVER, \
+ STRINGIFY(NAME##_##TYPE))
+
+// Hash64 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64_int8
+#define HASH64_SAFE_NULL_NEVER(NAME, TYPE) \
+ NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_NEVER, \
+ STRINGIFY(NAME##_##TYPE))
+
+// Hash32 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8
+#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE) \
+ NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), true, \
+ RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))
+
+// Hash64 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64WithSeed_int8
+#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE) \
+ NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), true, \
+ RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))
+
// Iterate the inner macro over all numeric types
#define NUMERIC_TYPES(INNER, NAME) \
INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64), \
@@ -100,11 +132,6 @@ using std::vector;
#define NUMERIC_DATE_TYPES(INNER, NAME) \
NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME)
-// Iterate the inner macro over all numeric types and bool type
-#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
- NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME), \
- INNER(NAME, boolean)
-
// Iterate the inner macro over all date types
#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp)
@@ -114,6 +141,14 @@ using std::vector;
// Iterate the inner macro over all data types
#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary)
+// Iterate the inner macro over all numeric types, date types and bool type
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+ NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean)
+
+// Iterate the inner macro over all numeric types, date types, bool and varlen types
+#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \
+ NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME)
+
// list of registered native functions.
NativeFunction FunctionRegistry::pc_registry_[] = {
// Arithmetic operations
@@ -174,6 +209,18 @@ NativeFunction FunctionRegistry::pc_registry_[] = {
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffQuarter, timestamp, timestamp, int32),
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffYear, timestamp, timestamp, int32),
+ // hash functions
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32AsDouble),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32AsDouble),
+
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64AsDouble),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64),
+ NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64AsDouble),
+
// utf8/binary operations
UNARY_SAFE_NULL_IF_NULL(octet_length, utf8, int32),
UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32),
diff --git a/cpp/src/gandiva/integ/CMakeLists.txt b/cpp/src/gandiva/integ/CMakeLists.txt
index 7fb4d36272eff..8cd4c9192b255 100644
--- a/cpp/src/gandiva/integ/CMakeLists.txt
+++ b/cpp/src/gandiva/integ/CMakeLists.txt
@@ -25,4 +25,5 @@ foreach(lib_type "shared" "static")
add_gandiva_integ_test(date_time_test.cc gandiva_${lib_type})
add_gandiva_integ_test(micro_benchmarks.cc gandiva_${lib_type})
add_gandiva_integ_test(to_string_test.cc gandiva_${lib_type})
+ add_gandiva_integ_test(hash_test.cc gandiva_${lib_type})
endforeach(lib_type)
diff --git a/cpp/src/gandiva/integ/hash_test.cc b/cpp/src/gandiva/integ/hash_test.cc
new file mode 100644
index 0000000000000..da517a33e5bed
--- /dev/null
+++ b/cpp/src/gandiva/integ/hash_test.cc
@@ -0,0 +1,142 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "arrow/memory_pool.h"
+#include "gandiva/projector.h"
+#include "gandiva/status.h"
+#include "gandiva/tree_expr_builder.h"
+#include "integ/test_util.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+using arrow::int64;
+using arrow::utf8;
+
+// Fixture for the hash projector tests: provides the Arrow memory pool that is
+// handed to Projector::Make.
+class TestHash : public ::testing::Test {
+ public:
+  // Runs before every test; `override` makes the compiler check the signature
+  // against ::testing::Test::SetUp.
+  void SetUp() override { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  // Assigned in SetUp(); nullptr until then rather than indeterminate.
+  arrow::MemoryPool* pool_ = nullptr;
+};
+
+// Projects hash32(a, 10) and hash64(a) over a four-row int32 column and checks
+// that no output is null, that row 0 hashes to 0, and that each row's hash
+// differs from the previous row's.
+TEST_F(TestHash, TestSimple) {
+  // schema for input fields
+  auto field_a = field("a", int32());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", int32());
+  auto res_1 = field("res1", int64());
+
+  // build expressions.
+  // hash32(a, 10)
+  // hash64(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a, literal_10}, int32());
+  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a}, int64());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
+
+  // Build a projector for the expressions.
+  // ASSERT (not EXPECT): continuing with an unset projector would dereference
+  // a null pointer below.
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data.
+  // NOTE(review): the second list is presumably the validity of each slot, i.e.
+  // row 0 is null - confirm against MakeArrowArrayInt32 in test_util.h.
+  int num_records = 4;
+  auto array_a = MakeArrowArrayInt32({1, 2, 3, 4}, {false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression
+  // ASSERT again: outputs.at(...) below needs the evaluation to have succeeded.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  ASSERT_NE(int32_arr, nullptr);
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
+  }
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  ASSERT_NE(int64_arr, nullptr);
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
+  }
+}
+
+// Projects hash32(a) and hash64(a, 10) over a four-row utf8 column and checks
+// that no output is null, that row 0 hashes to 0, and that each row's hash
+// differs from the previous row's.
+TEST_F(TestHash, TestBuf) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", int32());
+  auto res_1 = field("res1", int64());
+
+  // build expressions.
+  // hash32(a)
+  // hash64(a, 10)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto literal_10 = TreeExprBuilder::MakeLiteral((int64_t)10);
+  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a}, int32());
+  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a, literal_10}, int64());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
+
+  // Build a projector for the expressions.
+  // ASSERT (not EXPECT): continuing with an unset projector would dereference
+  // a null pointer below.
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data.
+  // NOTE(review): the second list is presumably the validity of each slot, i.e.
+  // row 0 is null - confirm against MakeArrowArrayUtf8 in test_util.h.
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression
+  // ASSERT again: outputs.at(...) below needs the evaluation to have succeeded.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  ASSERT_NE(int32_arr, nullptr);
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
+  }
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  ASSERT_NE(int64_arr, nullptr);
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
+  }
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt
index 32289566ba32e..2fc9230297773 100644
--- a/cpp/src/gandiva/precompiled/CMakeLists.txt
+++ b/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -14,12 +14,10 @@
project(gandiva)
-set(CLANG_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/clang)
-set(LINK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/llvm-link)
-
set(PRECOMPILED_SRCS
arithmetic_ops.cc
bitmap.cc
+ hash.cc
print.cc
sample.cc
string_ops.cc
@@ -51,6 +49,7 @@ add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH})
# testing
add_precompiled_unit_test(bitmap_test.cc bitmap.cc)
+add_precompiled_unit_test(hash_test.cc hash.cc)
add_precompiled_unit_test(time_test.cc time.cc)
add_precompiled_unit_test(sample_test.cc sample.cc)
add_precompiled_unit_test(string_ops_test.cc string_ops.cc)
diff --git a/cpp/src/gandiva/precompiled/hash.cc b/cpp/src/gandiva/precompiled/hash.cc
new file mode 100644
index 0000000000000..72d52d0b043ec
--- /dev/null
+++ b/cpp/src/gandiva/precompiled/hash.cc
@@ -0,0 +1,280 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+extern "C" {
+
+#include
+#include "./types.h"
+
+// Rotates the 64-bit `val` left by `distance` bits.
+//
+// Both shift counts are reduced modulo 64. Without that, a distance of 0 would
+// shift right by the full word width (64), which is undefined behaviour. For
+// the distances used in this file (27, 31, 33) the result is unchanged.
+static inline uint64 rotate_left(uint64 val, int distance) {
+  const unsigned int left = static_cast<unsigned int>(distance) & 63u;
+  const unsigned int right = (64u - left) & 63u;
+  return (val << left) | (val >> right);
+}
+
+//
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain.
+// See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp
+// MurmurHash3_x64_128
+//
+// fmix64 scrambles one 64-bit word: three "xor with self shifted right by 33"
+// steps, with a multiplication by a fixed constant after each of the first two.
+static inline uint64 fmix64(uint64 k) {
+  const uint64 mul_a = 0xff51afd7ed558ccduLL;
+  const uint64 mul_b = 0xc4ceb9fe1a85ec53uLL;
+
+  k = (k ^ (k >> 33)) * mul_a;
+  k = (k ^ (k >> 33)) * mul_b;
+  return k ^ (k >> 33);
+}
+
+// Hashes the single 64-bit word `val` with the given seed, treating the word
+// as one 8-byte key, and returns 64 bits of the result.
+static inline uint64 murmur3_64(uint64 val, int32 seed) {
+  const uint64 c1 = 0x87c37b91114253d5ull;
+  const uint64 c2 = 0x4cf5ad432745937full;
+  const int length = 8;
+
+  // scramble the input word.
+  const uint64 k1 = rotate_left(val * c1, 31) * c2;
+
+  // both halves of the state start out as the seed converted to uint64.
+  uint64 h1 = seed;
+  uint64 h2 = seed;
+  h1 ^= k1;
+
+  h1 ^= length;
+  h2 ^= length;
+
+  h1 += h2;
+  h2 += h1;
+
+  // murmur3_128 should return the 128-bit pair (h1, h2); only 64 bits are
+  // returned here, so the trailing "h2 += h1" step is left out.
+  return fmix64(h1) + fmix64(h2);
+}
+
+// Returns the bytes of `value` reinterpreted as a 64-bit unsigned integer.
+// The bytes are copied with memcpy rather than read through a pointer cast.
+static inline uint64 double_to_long_bits(double value) {
+  uint64 bits;
+  memcpy(&bits, &value, sizeof(bits));
+  return bits;
+}
+
+// 64-bit hash of the bit pattern of `val`.
+// The seed is narrowed to int32 before it is handed to murmur3_64.
+FORCE_INLINE int64 hash64(double val, int64 seed) {
+  const uint64 bits = double_to_long_bits(val);
+  const int32 narrowed_seed = (int32)seed;
+  return (int64)murmur3_64(bits, narrowed_seed);
+}
+
+// 32-bit hash of the bit pattern of `val`: the 64-bit murmur3_64 result
+// converted to int32.
+FORCE_INLINE int32 hash32(double val, int32 seed) {
+  const uint64 wide = murmur3_64(double_to_long_bits(val), seed);
+  return (int32)wide;
+}
+
+// Wrappers for all the numeric/date/time arrow types
+
+// Each macro below defines a function called NAME_TYPE for one input TYPE.
+// The input is converted to double before it is hashed, and 0 is returned when
+// the input (or, for the seeded variants, the seed) is not valid.
+
+// Seeded, 64-bit result.
+#define HASH64_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int64 NAME##_##TYPE(TYPE in, boolean is_valid, int64 seed, boolean seed_isvalid) { \
+    if (!(is_valid && seed_isvalid)) return 0; \
+    return hash64((double)in, seed); \
+  }
+
+// Seeded, 32-bit result.
+#define HASH32_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int32 NAME##_##TYPE(TYPE in, boolean is_valid, int32 seed, boolean seed_isvalid) { \
+    if (!(is_valid && seed_isvalid)) return 0; \
+    return hash32((double)in, seed); \
+  }
+
+// Seed fixed at 0, 64-bit result.
+#define HASH64_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int64 NAME##_##TYPE(TYPE in, boolean is_valid) { \
+    if (!is_valid) return 0; \
+    return hash64((double)in, 0); \
+  }
+
+// Seed fixed at 0, 32-bit result.
+#define HASH32_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int32 NAME##_##TYPE(TYPE in, boolean is_valid) { \
+    if (!is_valid) return 0; \
+    return hash32((double)in, 0); \
+  }
+
+// Expand inner macro for all numeric, boolean, date and time types.
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+  INNER(NAME, int8) \
+  INNER(NAME, int16) \
+  INNER(NAME, int32) \
+  INNER(NAME, int64) \
+  INNER(NAME, uint8) \
+  INNER(NAME, uint16) \
+  INNER(NAME, uint32) \
+  INNER(NAME, uint64) \
+  INNER(NAME, float32) \
+  INNER(NAME, float64) \
+  INNER(NAME, boolean) \
+  INNER(NAME, date64) \
+  INNER(NAME, time32) \
+  INNER(NAME, timestamp)
+
+// 32-bit variants.
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+// 64-bit variants.
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64)
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+// Hashes the `len` bytes starting at `key` with the given seed and returns
+// 64 bits of the 128-bit state.
+//
+// The input is consumed in 16-byte blocks, followed by a tail of up to 15
+// bytes. Blocks are read with memcpy: `key` is an arbitrary byte pointer, so
+// dereferencing it through a uint64 pointer would be an unaligned, type-punned
+// access. The bytes read, and therefore the hash values, are the same as with
+// a direct load.
+static inline uint64 murmur3_64_buf(const uint8 *key, int32 len, int32 seed) {
+  uint64 h1 = seed;
+  uint64 h2 = seed;
+  uint64 c1 = 0x87c37b91114253d5ull;
+  uint64 c2 = 0x4cf5ad432745937full;
+
+  // body: one iteration per complete 16-byte block.
+  int nblocks = len / 16;
+  for (int i = 0; i < nblocks; i++) {
+    const uint8 *block = key + i * 16;
+    uint64 k1;
+    uint64 k2;
+    memcpy(&k1, block, sizeof(k1));
+    memcpy(&k2, block + 8, sizeof(k2));
+
+    k1 *= c1;
+    k1 = rotate_left(k1, 31);
+    k1 *= c2;
+    h1 ^= k1;
+    h1 = rotate_left(h1, 27);
+    h1 += h2;
+    h1 = h1 * 5 + 0x52dce729;
+    k2 *= c2;
+    k2 = rotate_left(k2, 33);
+    k2 *= c1;
+    h2 ^= k2;
+    h2 = rotate_left(h2, 31);
+    h2 += h1;
+    h2 = h2 * 5 + 0x38495ab5;
+  }
+
+  // tail: the remaining (len & 15) bytes. Every case deliberately falls
+  // through to the next, so that all bytes below the entry point are folded in.
+  uint64 k1 = 0;
+  uint64 k2 = 0;
+
+  const uint8 *tail = key + nblocks * 16;
+  switch (len & 15) {
+    case 15:
+      k2 ^= (uint64)(tail[14]) << 48;
+      // fallthrough
+    case 14:
+      k2 ^= (uint64)(tail[13]) << 40;
+      // fallthrough
+    case 13:
+      k2 ^= (uint64)(tail[12]) << 32;
+      // fallthrough
+    case 12:
+      k2 ^= (uint64)(tail[11]) << 24;
+      // fallthrough
+    case 11:
+      k2 ^= (uint64)(tail[10]) << 16;
+      // fallthrough
+    case 10:
+      k2 ^= (uint64)(tail[9]) << 8;
+      // fallthrough
+    case 9:
+      k2 ^= (uint64)(tail[8]);
+      k2 *= c2;
+      k2 = rotate_left(k2, 33);
+      k2 *= c1;
+      h2 ^= k2;
+      // fallthrough
+    case 8:
+      k1 ^= (uint64)(tail[7]) << 56;
+      // fallthrough
+    case 7:
+      k1 ^= (uint64)(tail[6]) << 48;
+      // fallthrough
+    case 6:
+      k1 ^= (uint64)(tail[5]) << 40;
+      // fallthrough
+    case 5:
+      k1 ^= (uint64)(tail[4]) << 32;
+      // fallthrough
+    case 4:
+      k1 ^= (uint64)(tail[3]) << 24;
+      // fallthrough
+    case 3:
+      k1 ^= (uint64)(tail[2]) << 16;
+      // fallthrough
+    case 2:
+      k1 ^= (uint64)(tail[1]) << 8;
+      // fallthrough
+    case 1:
+      k1 ^= (uint64)(tail[0]) << 0;
+      k1 *= c1;
+      k1 = rotate_left(k1, 31);
+      k1 *= c2;
+      h1 ^= k1;
+  }
+
+  // finalization
+  h1 ^= len;
+  h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  // h2 += h1;
+  // returning 64-bits of the 128-bit hash.
+  return h1;
+}
+
+// 64-bit hash of the `len` bytes at `buf`.
+// The seed is narrowed to int32 before it is handed to murmur3_64_buf.
+FORCE_INLINE int64 hash64_buf(const uint8 *buf, int len, int64 seed) {
+  const int32 narrowed_seed = (int32)seed;
+  const uint64 wide = murmur3_64_buf(buf, len, narrowed_seed);
+  return (int64)wide;
+}
+
+// 32-bit hash of the `len` bytes at `buf`: the 64-bit murmur3_64_buf result
+// converted to int32.
+FORCE_INLINE int32 hash32_buf(const uint8 *buf, int len, int32 seed) {
+  const uint64 wide = murmur3_64_buf(buf, len, seed);
+  return (int32)wide;
+}
+
+// Wrappers for the varlen types
+
+// Each macro below defines a function called NAME_TYPE for one variable-length
+// TYPE. The `len` bytes at `in` are hashed, and 0 is returned when the input
+// (or, for the seeded variants, the seed) is not valid.
+
+// Seeded, 64-bit result.
+#define HASH64_BUF_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int64 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid, int64 seed, \
+                      boolean seed_isvalid) { \
+    if (!(is_valid && seed_isvalid)) return 0; \
+    return hash64_buf((const uint8 *)in, len, seed); \
+  }
+
+// Seeded, 32-bit result.
+#define HASH32_BUF_WITH_SEED_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int32 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid, int32 seed, \
+                      boolean seed_isvalid) { \
+    if (!(is_valid && seed_isvalid)) return 0; \
+    return hash32_buf((const uint8 *)in, len, seed); \
+  }
+
+// Seed fixed at 0, 64-bit result.
+#define HASH64_BUF_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int64 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid) { \
+    if (!is_valid) return 0; \
+    return hash64_buf((const uint8 *)in, len, 0); \
+  }
+
+// Seed fixed at 0, 32-bit result.
+#define HASH32_BUF_OP(NAME, TYPE) \
+  FORCE_INLINE \
+  int32 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid) { \
+    if (!is_valid) return 0; \
+    return hash32_buf((const uint8 *)in, len, 0); \
+  }
+
+// Expand inner macro for all variable-length types.
+#define VAR_LEN_TYPES(INNER, NAME) \
+  INNER(NAME, utf8) \
+  INNER(NAME, binary)
+
+// 32-bit variants.
+VAR_LEN_TYPES(HASH32_BUF_OP, hash)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32AsDouble)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32WithSeed)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+// 64-bit variants.
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64)
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64AsDouble)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64WithSeed)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+} // extern "C"
diff --git a/cpp/src/gandiva/precompiled/hash_test.cc b/cpp/src/gandiva/precompiled/hash_test.cc
new file mode 100644
index 0000000000000..7e45b19694970
--- /dev/null
+++ b/cpp/src/gandiva/precompiled/hash_test.cc
@@ -0,0 +1,119 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+#include
+#include "precompiled/types.h"
+
+namespace gandiva {
+
+// hash32 on the value 0: the result is non-zero, is the same for every numeric
+// type, and changes with the seed.
+TEST(TestHash, TestHash32) {
+  // one zero-valued variable per numeric type.
+  int8 s8 = 0;
+  int16 s16 = 0;
+  int32 s32 = 0;
+  int64 s64 = 0;
+  uint8 u8 = 0;
+  uint16 u16 = 0;
+  uint32 u32 = 0;
+  uint64 u64 = 0;
+  float32 f32 = 0;
+  float64 f64 = 0;
+
+  // zero is the hash value for nulls, so the hash of 0 must be non-zero.
+  const int32 unseeded = hash32(s8, 0);
+  EXPECT_NE(unseeded, 0);
+
+  // the hash of a given value must not depend on its numeric type.
+  EXPECT_EQ(hash32(u8, 0), unseeded);
+  EXPECT_EQ(hash32(s16, 0), unseeded);
+  EXPECT_EQ(hash32(u16, 0), unseeded);
+  EXPECT_EQ(hash32(s32, 0), unseeded);
+  EXPECT_EQ(hash32(u32, 0), unseeded);
+  EXPECT_EQ(hash32(s64, 0), unseeded);
+  EXPECT_EQ(hash32(u64, 0), unseeded);
+  EXPECT_EQ(hash32(f32, 0), unseeded);
+  EXPECT_EQ(hash32(f64, 0), unseeded);
+
+  // a different seed must give a different hash.
+  const int32 seeded = hash32(s8, 1);
+  EXPECT_NE(seeded, unseeded);
+
+  // type independence must also hold for a given value and seed.
+  EXPECT_EQ(seeded, hash32(s16, 1));
+  EXPECT_EQ(seeded, hash32(u32, 1));
+  EXPECT_EQ(seeded, hash32(f32, 1));
+  EXPECT_EQ(seeded, hash32(f64, 1));
+}
+
+// hash64 on the value 0: the result is non-zero, differs from the hash32
+// result, is the same for every numeric type, and changes with the seed.
+TEST(TestHash, TestHash64) {
+  int8 s8 = 0;
+  uint8 u8 = 0;
+  int16 s16 = 0;
+  uint16 u16 = 0;
+  int32 s32 = 0;
+  uint32 u32 = 0;
+  int64 s64 = 0;
+  uint64 u64 = 0;
+  float32 f32 = 0;
+  float64 f64 = 0;
+
+  // hash of 0 should be non-zero (zero is the hash value for nulls).
+  int64 zero_hash = hash64(s8, 0);
+  EXPECT_NE(zero_hash, 0);
+  EXPECT_NE(hash64(u8, 0), hash32(u8, 0));
+
+  // for a given value, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(u8, 0), zero_hash);
+  EXPECT_EQ(hash64(s16, 0), zero_hash);
+  EXPECT_EQ(hash64(u16, 0), zero_hash);
+  EXPECT_EQ(hash64(s32, 0), zero_hash);
+  EXPECT_EQ(hash64(u32, 0), zero_hash);
+  EXPECT_EQ(hash64(s64, 0), zero_hash);
+  EXPECT_EQ(hash64(u64, 0), zero_hash);
+  EXPECT_EQ(hash64(f32, 0), zero_hash);
+  EXPECT_EQ(hash64(f64, 0), zero_hash);
+
+  // hash must change with a change in seed.
+  EXPECT_NE(hash64(s8, 1), zero_hash);
+
+  // for a given value and seed, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(s8, 1), hash64(s16, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(u32, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(f32, 1));
+  // float64 with a seed, matching the set of checks in TestHash32.
+  EXPECT_EQ(hash64(s8, 1), hash64(f64, 1));
+}
+
+// hash32_buf / hash64_buf on "hello": the result is non-zero and changes when
+// either the hashed length or the seed changes.
+TEST(TestHash, TestHashBuf) {
+  const char *buf = "hello";
+  int buf_len = 5;
+  // the hash functions take the bytes as uint8.
+  const uint8 *bytes = (const uint8 *)buf;
+
+  // zero is the hash value for nulls, so a real hash must be non-zero.
+  EXPECT_NE(hash32_buf(bytes, buf_len, 0), 0);
+  EXPECT_NE(hash64_buf(bytes, buf_len, 0), 0);
+
+  // hashing one byte fewer ("hell") must give a different hash.
+  EXPECT_NE(hash32_buf(bytes, buf_len, 0), hash32_buf(bytes, buf_len - 1, 0));
+  EXPECT_NE(hash64_buf(bytes, buf_len, 0), hash64_buf(bytes, buf_len - 1, 0));
+
+  // a different seed must give a different hash.
+  EXPECT_NE(hash32_buf(bytes, buf_len, 0), hash32_buf(bytes, buf_len, 1));
+  EXPECT_NE(hash64_buf(bytes, buf_len, 0), hash64_buf(bytes, buf_len, 1));
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 2f782a8edd29e..5e2f8ecf72185 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -63,11 +63,15 @@ int64 extractDay_timestamp(timestamp millis);
int64 extractHour_timestamp(timestamp millis);
int64 extractMinute_timestamp(timestamp millis);
int64 extractSecond_timestamp(timestamp millis);
-
int64 extractHour_time32(int32 millis_in_day);
int64 extractMinute_time32(int32 millis_in_day);
int64 extractSecond_time32(int32 millis_in_day);
+int32 hash32(double val, int32 seed);
+int32 hash32_buf(const uint8 *buf, int len, int32 seed);
+int64 hash64(double val, int64 seed);
+int64 hash64_buf(const uint8 *buf, int len, int64 seed);
+
int32 mem_compare(const char *left, int32 left_len, const char *right, int32 right_len);
} // extern "C"