diff --git a/cmake/BuildUtils.cmake b/cmake/BuildUtils.cmake
index ba8b4bdf2bd8c..46e3c9803dd06 100644
--- a/cmake/BuildUtils.cmake
+++ b/cmake/BuildUtils.cmake
@@ -128,9 +128,6 @@ function(build_external PROJ)
     EXCLUDE_FROM_ALL)
 endfunction(build_external PROJ)
 
-find_program(CLANG_FORMAT_BIN NAMES clang-format)
-message(STATUS "Found clang-format executable at ${CLANG_FORMAT_BIN}")
-
 file(GLOB_RECURSE LINT_FILES
   "${CMAKE_CURRENT_SOURCE_DIR}/include/*.h"
   "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h"
@@ -146,7 +143,7 @@ function(add_stylecheck)
       COMMENT "Performing stylecheck on all .cpp/.h files"
       # use ! to check for no replacement
       COMMAND !
-      ${CLANG_FORMAT_BIN}
+      ${CLANG_FORMAT_EXECUTABLE}
       -style=file
       -output-replacements-xml
       ${LINT_FILES}
@@ -161,7 +158,7 @@ function(add_stylefix)
     add_custom_target(stylefix
       COMMENT "Performing stylefix on all .cpp/.h files"
       COMMAND
-      echo ${LINT_FILES} | xargs ${CLANG_FORMAT_BIN} -style=file -i
+      echo ${LINT_FILES} | xargs ${CLANG_FORMAT_EXECUTABLE} -style=file -i
     )
   endif (UNIX)
 endfunction(add_stylefix)
diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake
index 0b66510e57497..b3c411c7b4bc4 100644
--- a/cmake/FindLLVM.cmake
+++ b/cmake/FindLLVM.cmake
@@ -27,6 +27,10 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 # Find the libraries that correspond to the LLVM components
 llvm_map_components_to_libnames(LLVM_LIBS core mcjit native ipo bitreader target linker analysis debuginfodwarf)
 
+set(CLANG_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/clang CACHE STRING "clang")
+set(LINK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/llvm-link CACHE STRING "link")
+set(CLANG_FORMAT_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/clang-format CACHE STRING "clang-format")
+
 add_library(LLVM::LLVM_INTERFACE INTERFACE IMPORTED)
 
 set_target_properties(LLVM::LLVM_INTERFACE PROPERTIES
diff --git a/integ/CMakeLists.txt b/integ/CMakeLists.txt
index 7fb4d36272eff..8cd4c9192b255 100644
--- a/integ/CMakeLists.txt
+++ b/integ/CMakeLists.txt
@@ -25,4 +25,5 @@ foreach(lib_type "shared" "static")
   add_gandiva_integ_test(date_time_test.cc gandiva_${lib_type})
   add_gandiva_integ_test(micro_benchmarks.cc gandiva_${lib_type})
   add_gandiva_integ_test(to_string_test.cc gandiva_${lib_type})
+  add_gandiva_integ_test(hash_test.cc gandiva_${lib_type})
 endforeach(lib_type)
diff --git a/integ/hash_test.cc b/integ/hash_test.cc
new file mode 100644
index 0000000000000..da517a33e5bed
--- /dev/null
+++ b/integ/hash_test.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "gandiva/projector.h"
+#include "gandiva/status.h"
+#include "gandiva/tree_expr_builder.h"
+#include "integ/test_util.h"
+
+namespace gandiva {
+
+using arrow::boolean;
+using arrow::int32;
+using arrow::int64;
+using arrow::utf8;
+
+// Fixture that hands every hash test the default Arrow memory pool.
+class TestHash : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// hash32 (with seed) and hash64 over an int32 column whose first row is null.
+TEST_F(TestHash, TestSimple) {
+  // schema for input fields
+  auto field_a = field("a", int32());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", int32());
+  auto res_1 = field("res1", int64());
+
+  // build expression.
+  // hash32(a, 10)
+  // hash64(a)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
+  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a, literal_10}, int32());
+  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a}, int64());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
+
+  // Build a projector for the expression. Abort on failure: projector is used below.
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array_a = MakeArrowArrayInt32({1, 2, 3, 4}, {false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression. Abort on failure: the outputs are indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results: no output is null, the null input row hashes to 0.
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
+  }
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
+  }
+}
+
+// hash32 and hash64 (with seed) over a utf8 column whose first row is null.
+TEST_F(TestHash, TestBuf) {
+  // schema for input fields
+  auto field_a = field("a", utf8());
+  auto schema = arrow::schema({field_a});
+
+  // output fields
+  auto res_0 = field("res0", int32());
+  auto res_1 = field("res1", int64());
+
+  // build expressions.
+  // hash32(a)
+  // hash64(a, 10)
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto literal_10 = TreeExprBuilder::MakeLiteral((int64_t)10);
+  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a}, int32());
+  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a, literal_10}, int64());
+  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
+  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
+
+  // Build a projector for the expressions. Abort on failure: projector is used below.
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array_a =
+      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {false, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+  // Evaluate expression. Abort on failure: the outputs are indexed below.
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Validate results: no output is null, the null input row hashes to 0.
+  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
+  EXPECT_EQ(int32_arr->null_count(), 0);
+  EXPECT_EQ(int32_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
+  }
+
+  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
+  EXPECT_EQ(int64_arr->null_count(), 0);
+  EXPECT_EQ(int64_arr->Value(0), 0);
+  for (int i = 1; i < num_records; ++i) {
+    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
+  }
+}
+
+}  // namespace gandiva
diff --git a/src/codegen/function_registry.cc b/src/codegen/function_registry.cc
index d67369aecb500..4107f55801737 100644
--- a/src/codegen/function_registry.cc
+++ b/src/codegen/function_registry.cc
@@ -90,6 +90,38 @@ using std::vector;
   NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_IF_NULL, \
                  STRINGIFY(NAME##_##TYPE))
 
+// Hash32 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32_int8
+#define HASH32_SAFE_NULL_NEVER(NAME, TYPE)                                        \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), true, RESULT_NULL_NEVER, \
+                 STRINGIFY(NAME##_##TYPE))
+
+// Hash64 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64_int8
+#define HASH64_SAFE_NULL_NEVER(NAME, TYPE)                                        \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_NEVER, \
+                 STRINGIFY(NAME##_##TYPE))
+
+// Hash32 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8
+#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE)                         \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), true, \
+                 RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))
+
+// Hash64 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. hash64WithSeed_int8
+#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE)                         \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), true, \
+                 RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))
+
 // Iterate the inner macro over all numeric types
 #define NUMERIC_TYPES(INNER, NAME)                                                 \
   INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64),   \
@@ -100,11 +132,6 @@ using std::vector;
 #define NUMERIC_DATE_TYPES(INNER, NAME) \
   NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME)
 
-// Iterate the inner macro over all numeric types and bool type
-#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME)                                    \
-  NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME), \
-      INNER(NAME, boolean)
-
 // Iterate the inner macro over all date types
 #define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp)
 
@@ -114,6 +141,14 @@ using std::vector;
 // Iterate the inner macro over all data types
 #define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary)
 
+// Iterate the inner macro over all numeric types, date types and bool type
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+  NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean)
+
+// Iterate the inner macro over all numeric types, date types, bool and varlen types
+#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \
+  NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME)
+
 // list of registered native functions.
NativeFunction FunctionRegistry::pc_registry_[] = { // Arithmetic operations @@ -174,6 +209,18 @@ NativeFunction FunctionRegistry::pc_registry_[] = { BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffQuarter, timestamp, timestamp, int32), BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffYear, timestamp, timestamp, int32), + // hash functions + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32AsDouble), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32AsDouble), + + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64AsDouble), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64), + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64AsDouble), + // utf8/binary operations UNARY_SAFE_NULL_IF_NULL(octet_length, utf8, int32), UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32), diff --git a/src/precompiled/CMakeLists.txt b/src/precompiled/CMakeLists.txt index 32289566ba32e..2fc9230297773 100644 --- a/src/precompiled/CMakeLists.txt +++ b/src/precompiled/CMakeLists.txt @@ -14,12 +14,10 @@ project(gandiva) -set(CLANG_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/clang) -set(LINK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/llvm-link) - set(PRECOMPILED_SRCS arithmetic_ops.cc bitmap.cc + hash.cc print.cc sample.cc string_ops.cc @@ -51,6 +49,7 @@ add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH}) # testing add_precompiled_unit_test(bitmap_test.cc bitmap.cc) +add_precompiled_unit_test(hash_test.cc hash.cc) add_precompiled_unit_test(time_test.cc time.cc) add_precompiled_unit_test(sample_test.cc sample.cc) add_precompiled_unit_test(string_ops_test.cc string_ops.cc) diff --git a/src/precompiled/hash.cc b/src/precompiled/hash.cc new file mode 
100644 index 0000000000000..72d52d0b043ec --- /dev/null +++ b/src/precompiled/hash.cc @@ -0,0 +1,280 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +extern "C" { + +#include +#include "./types.h" + +static inline uint64 rotate_left(uint64 val, int distance) { + return (val << distance) | (val >> (64 - distance)); +} + +// +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. +// See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp +// MurmurHash3_x64_128 +// +static inline uint64 fmix64(uint64 k) { + k ^= k >> 33; + k *= 0xff51afd7ed558ccduLL; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53uLL; + k ^= k >> 33; + return k; +} + +static inline uint64 murmur3_64(uint64 val, int32 seed) { + uint64 h1 = seed; + uint64 h2 = seed; + + uint64 c1 = 0x87c37b91114253d5ull; + uint64 c2 = 0x4cf5ad432745937full; + + int length = 8; + uint64 k1 = 0; + + k1 = val; + k1 *= c1; + k1 = rotate_left(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + + // h2 += h1; + // murmur3_128 should return 128 bit (h1,h2), now we return only 64bits, + return h1; +} + +static inline uint64 double_to_long_bits(double value) { + uint64 result; + memcpy(&result, &value, sizeof(result)); + return result; +} + +FORCE_INLINE int64 hash64(double val, int64 seed) { + return 
(int64)murmur3_64(double_to_long_bits(val), (int32)seed); +} + +FORCE_INLINE int32 hash32(double val, int32 seed) { + return (int32)murmur3_64(double_to_long_bits(val), seed); +} + +// Wrappers for all the numeric/data/time arrow types + +#define HASH64_WITH_SEED_OP(NAME, TYPE) \ + FORCE_INLINE \ + int64 NAME##_##TYPE(TYPE in, boolean is_valid, int64 seed, boolean seed_isvalid) { \ + return is_valid && seed_isvalid ? hash64((double)in, seed) : 0; \ + } + +#define HASH32_WITH_SEED_OP(NAME, TYPE) \ + FORCE_INLINE \ + int32 NAME##_##TYPE(TYPE in, boolean is_valid, int32 seed, boolean seed_isvalid) { \ + return is_valid && seed_isvalid ? hash32((double)in, seed) : 0; \ + } + +#define HASH64_OP(NAME, TYPE) \ + FORCE_INLINE \ + int64 NAME##_##TYPE(TYPE in, boolean is_valid) { \ + return is_valid ? hash64((double)in, 0) : 0; \ + } + +#define HASH32_OP(NAME, TYPE) \ + FORCE_INLINE \ + int32 NAME##_##TYPE(TYPE in, boolean is_valid) { \ + return is_valid ? hash32((double)in, 0) : 0; \ + } + +// Expand inner macro for all numeric types. 
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+  INNER(NAME, int8)                          \
+  INNER(NAME, int16)                         \
+  INNER(NAME, int32)                         \
+  INNER(NAME, int64)                         \
+  INNER(NAME, uint8)                         \
+  INNER(NAME, uint16)                        \
+  INNER(NAME, uint32)                        \
+  INNER(NAME, uint64)                        \
+  INNER(NAME, float32)                       \
+  INNER(NAME, float64)                       \
+  INNER(NAME, boolean)                       \
+  INNER(NAME, date64)                        \
+  INNER(NAME, time32)                        \
+  INNER(NAME, timestamp)
+
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32)
+NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64)
+NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64AsDouble)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64WithSeed)
+NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+static inline uint64 murmur3_64_buf(const uint8 *key, int32 len, int32 seed) {
+  uint64 h1 = seed;
+  uint64 h2 = seed;
+  uint64 c1 = 0x87c37b91114253d5ull;
+  uint64 c2 = 0x4cf5ad432745937full;
+
+  int nblocks = len / 16;
+  for (int i = 0; i < nblocks; i++) {
+    uint64 k1, k2;  // loaded with memcpy: 'key' is a byte pointer, alignment unknown
+    memcpy(&k1, key + i * 16, sizeof(k1));
+    memcpy(&k2, key + i * 16 + 8, sizeof(k2));
+
+    k1 *= c1;
+    k1 = rotate_left(k1, 31);
+    k1 *= c2;
+    h1 ^= k1;
+    h1 = rotate_left(h1, 27);
+    h1 += h2;
+    h1 = h1 * 5 + 0x52dce729;
+    k2 *= c2;
+    k2 = rotate_left(k2, 33);
+    k2 *= c1;
+    h2 ^= k2;
+    h2 = rotate_left(h2, 31);
+    h2 += h1;
+    h2 = h2 * 5 + 0x38495ab5;
+  }
+
+  // tail: mix in the remaining (len & 15) bytes; every case falls through on purpose
+  uint64 k1 = 0;
+  uint64 k2 = 0;
+
+  const uint8 *tail = (const uint8 *)(key + nblocks * 16);
+  switch (len & 15) {
+    case 15:
+      k2 ^= (uint64)(tail[14]) << 48;
+    case 14:
+      k2 ^= (uint64)(tail[13]) << 40;
+    case 13:
+      k2 ^= (uint64)(tail[12]) << 32;
+    case 12:
+      k2 ^= (uint64)(tail[11]) << 24;
+    case 11:
+      k2 ^= (uint64)(tail[10]) << 16;
+    case 10:
+      k2 ^= (uint64)(tail[9]) << 8;
+    case 9:
+      k2 ^= (uint64)(tail[8]);
+      k2 *= c2;
+      k2 = rotate_left(k2, 33);
+      k2 *= c1;
+      h2 ^= k2;
+    case 8:
+      k1 ^= (uint64)(tail[7]) << 56;
+    case 7:
+      k1 ^= (uint64)(tail[6]) << 48;
+    case 6:
+      k1 ^= (uint64)(tail[5]) << 40;
+    case 5:
+      k1 ^= (uint64)(tail[4]) << 32;
+    case 4:
+      k1 ^= (uint64)(tail[3]) << 24;
+    case 3:
+      k1 ^= (uint64)(tail[2]) << 16;
+    case 2:
+      k1 ^= (uint64)(tail[1]) << 8;
+    case 1:
+      k1 ^= (uint64)(tail[0]) << 0;
+      k1 *= c1;
+      k1 = rotate_left(k1, 31);
+      k1 *= c2;
+      h1 ^= k1;
+  }
+
+  h1 ^= len;
+  h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  // h2 += h1;
+  // returning 64-bits of the 128-bit hash.
+  return h1;
+}
+
+FORCE_INLINE int64 hash64_buf(const uint8 *buf, int len, int64 seed) {
+  return (int64)murmur3_64_buf(buf, len, (int32)seed);
+}
+
+FORCE_INLINE int32 hash32_buf(const uint8 *buf, int len, int32 seed) {
+  return (int32)murmur3_64_buf(buf, len, seed);
+}
+
+// Wrappers for the varlen types (result is 0 when the input or the seed is not valid)
+
+#define HASH64_BUF_WITH_SEED_OP(NAME, TYPE)                                         \
+  FORCE_INLINE                                                                      \
+  int64 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid, int64 seed,             \
+                      boolean seed_isvalid) {                                       \
+    return is_valid && seed_isvalid ? hash64_buf((const uint8 *)in, len, seed) : 0; \
+  }
+
+#define HASH32_BUF_WITH_SEED_OP(NAME, TYPE)                                         \
+  FORCE_INLINE                                                                      \
+  int32 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid, int32 seed,             \
+                      boolean seed_isvalid) {                                       \
+    return is_valid && seed_isvalid ? hash32_buf((const uint8 *)in, len, seed) : 0; \
+  }
+
+#define HASH64_BUF_OP(NAME, TYPE)                                \
+  FORCE_INLINE                                                   \
+  int64 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid) {    \
+    return is_valid ? hash64_buf((const uint8 *)in, len, 0) : 0; \
+  }
+
+#define HASH32_BUF_OP(NAME, TYPE)                                \
+  FORCE_INLINE                                                   \
+  int32 NAME##_##TYPE(TYPE in, int32 len, boolean is_valid) {    \
+    return is_valid ? hash32_buf((const uint8 *)in, len, 0) : 0; \
+  }
+
+// Expand inner macro for all varlen types.
+#define VAR_LEN_TYPES(INNER, NAME) \
+  INNER(NAME, utf8)                \
+  INNER(NAME, binary)
+
+VAR_LEN_TYPES(HASH32_BUF_OP, hash)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32)
+VAR_LEN_TYPES(HASH32_BUF_OP, hash32AsDouble)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32WithSeed)
+VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32AsDoubleWithSeed)
+
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64)
+VAR_LEN_TYPES(HASH64_BUF_OP, hash64AsDouble)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64WithSeed)
+VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64AsDoubleWithSeed)
+
+}  // extern "C"
diff --git a/src/precompiled/hash_test.cc b/src/precompiled/hash_test.cc
new file mode 100644
index 0000000000000..7e45b19694970
--- /dev/null
+++ b/src/precompiled/hash_test.cc
@@ -0,0 +1,122 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>  // NOTE(review): header name was lost in extraction; confirm
+
+#include <gtest/gtest.h>
+#include "precompiled/types.h"
+
+namespace gandiva {
+
+// hash32 of 0 is non-zero, is the same for all numeric types, and depends on the seed.
+TEST(TestHash, TestHash32) {
+  int8 s8 = 0;
+  uint8 u8 = 0;
+  int16 s16 = 0;
+  uint16 u16 = 0;
+  int32 s32 = 0;
+  uint32 u32 = 0;
+  int64 s64 = 0;
+  uint64 u64 = 0;
+  float32 f32 = 0;
+  float64 f64 = 0;
+
+  // hash of 0 should be non-zero (zero is the hash value for nulls).
+  int32 zero_hash = hash32(s8, 0);
+  EXPECT_NE(zero_hash, 0);
+
+  // for a given value, all numeric types must have the same hash.
+  EXPECT_EQ(hash32(u8, 0), zero_hash);
+  EXPECT_EQ(hash32(s16, 0), zero_hash);
+  EXPECT_EQ(hash32(u16, 0), zero_hash);
+  EXPECT_EQ(hash32(s32, 0), zero_hash);
+  EXPECT_EQ(hash32(u32, 0), zero_hash);
+  EXPECT_EQ(hash32(s64, 0), zero_hash);
+  EXPECT_EQ(hash32(u64, 0), zero_hash);
+  EXPECT_EQ(hash32(f32, 0), zero_hash);
+  EXPECT_EQ(hash32(f64, 0), zero_hash);
+
+  // hash must change with a change in seed.
+  EXPECT_NE(hash32(s8, 1), zero_hash);
+
+  // for a given value and seed, all numeric types must have the same hash.
+  EXPECT_EQ(hash32(s8, 1), hash32(s16, 1));
+  EXPECT_EQ(hash32(s8, 1), hash32(u32, 1));
+  EXPECT_EQ(hash32(s8, 1), hash32(f32, 1));
+  EXPECT_EQ(hash32(s8, 1), hash32(f64, 1));
+}
+
+// hash64 of 0 is non-zero, is the same for all numeric types, and depends on the seed.
+TEST(TestHash, TestHash64) {
+  int8 s8 = 0;
+  uint8 u8 = 0;
+  int16 s16 = 0;
+  uint16 u16 = 0;
+  int32 s32 = 0;
+  uint32 u32 = 0;
+  int64 s64 = 0;
+  uint64 u64 = 0;
+  float32 f32 = 0;
+  float64 f64 = 0;
+
+  // hash of 0 should be non-zero (zero is the hash value for nulls).
+  int64 zero_hash = hash64(s8, 0);
+  EXPECT_NE(zero_hash, 0);
+  EXPECT_NE(hash64(u8, 0), hash32(u8, 0));
+
+  // for a given value, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(u8, 0), zero_hash);
+  EXPECT_EQ(hash64(s16, 0), zero_hash);
+  EXPECT_EQ(hash64(u16, 0), zero_hash);
+  EXPECT_EQ(hash64(s32, 0), zero_hash);
+  EXPECT_EQ(hash64(u32, 0), zero_hash);
+  EXPECT_EQ(hash64(s64, 0), zero_hash);
+  EXPECT_EQ(hash64(u64, 0), zero_hash);
+  EXPECT_EQ(hash64(f32, 0), zero_hash);
+  EXPECT_EQ(hash64(f64, 0), zero_hash);
+
+  // hash must change with a change in seed.
+  EXPECT_NE(hash64(s8, 1), zero_hash);
+
+  // for a given value and seed, all numeric types must have the same hash.
+  EXPECT_EQ(hash64(s8, 1), hash64(s16, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(u32, 1));
+  EXPECT_EQ(hash64(s8, 1), hash64(f32, 1));
+}
+
+// Buffer hashes are non-zero and change with either the buffer length or the seed.
+TEST(TestHash, TestHashBuf) {
+  const char *buf = "hello";
+  int buf_len = 5;
+
+  // hash should be non-zero (zero is the hash value for nulls).
+  EXPECT_NE(hash32_buf((const uint8 *)buf, buf_len, 0), 0);
+  EXPECT_NE(hash64_buf((const uint8 *)buf, buf_len, 0), 0);
+
+  // hash must change if the string is changed.
+  EXPECT_NE(hash32_buf((const uint8 *)buf, buf_len, 0),
+            hash32_buf((const uint8 *)buf, buf_len - 1, 0));
+
+  EXPECT_NE(hash64_buf((const uint8 *)buf, buf_len, 0),
+            hash64_buf((const uint8 *)buf, buf_len - 1, 0));
+
+  // hash must change if the seed is changed.
+  EXPECT_NE(hash32_buf((const uint8 *)buf, buf_len, 0),
+            hash32_buf((const uint8 *)buf, buf_len, 1));
+
+  EXPECT_NE(hash64_buf((const uint8 *)buf, buf_len, 0),
+            hash64_buf((const uint8 *)buf, buf_len, 1));
+}
+
+}  // namespace gandiva
diff --git a/src/precompiled/types.h b/src/precompiled/types.h
index 2f782a8edd29e..5e2f8ecf72185 100644
--- a/src/precompiled/types.h
+++ b/src/precompiled/types.h
@@ -63,11 +63,15 @@ int64 extractDay_timestamp(timestamp millis);
 int64 extractHour_timestamp(timestamp millis);
 int64 extractMinute_timestamp(timestamp millis);
 int64 extractSecond_timestamp(timestamp millis);
-
 int64 extractHour_time32(int32 millis_in_day);
 int64 extractMinute_time32(int32 millis_in_day);
 int64 extractSecond_time32(int32 millis_in_day);
 
+int32 hash32(double val, int32 seed);
+int32 hash32_buf(const uint8 *buf, int len, int32 seed);
+int64 hash64(double val, int64 seed);
+int64 hash64_buf(const uint8 *buf, int len, int64 seed);
+
 int32 mem_compare(const char *left, int32 left_len, const char *right, int32 right_len);
 
 }  // extern "C"