Skip to content

Commit

Permalink
GDV-28: [C++] Add hash functions on all data types (apache#69)
Browse files Browse the repository at this point in the history
* GDV-28: [C++] Add hash functions on all data types

* GDV-28: Fix stylecheck in travis to print diff

* GDV-28: pick clang-format from llvm-binary dir

* GDV-28: handle case when seed is null

* GDV-28: [C++] Fix a style check
  • Loading branch information
pravindra authored Jul 20, 2018
1 parent 770d2bb commit 01f46fe
Show file tree
Hide file tree
Showing 7 changed files with 601 additions and 9 deletions.
57 changes: 52 additions & 5 deletions cpp/src/gandiva/codegen/function_registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,38 @@ using std::vector;
NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_IF_NULL, \
STRINGIFY(NAME##_##TYPE))

// Hash32 functions (without seed) that :
// - take a single argument of type TYPE and return int32
// - NULL handling is of type NULL_NEVER
//
// The pre-compiled fn name includes the base name & input type name. eg. hash32_int8
#define HASH32_SAFE_NULL_NEVER(NAME, TYPE) \
NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), true, RESULT_NULL_NEVER, \
STRINGIFY(NAME##_##TYPE))

// Hash64 functions (without seed) that :
// - take a single argument of type TYPE and return int64
// - NULL handling is of type NULL_NEVER
//
// The pre-compiled fn name includes the base name & input type name. eg. hash64_int8
#define HASH64_SAFE_NULL_NEVER(NAME, TYPE) \
NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), true, RESULT_NULL_NEVER, \
STRINGIFY(NAME##_##TYPE))

// Hash32 functions with seed that :
// - take two arguments (value of type TYPE, int32 seed) and return int32
// - NULL handling is of type NULL_NEVER
//
// The pre-compiled fn name includes the base name, the "WithSeed" suffix & input type
// name. eg. hash32WithSeed_int8
#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE) \
NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), true, \
RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))

// Hash64 functions with seed that :
// - take two arguments (value of type TYPE, int64 seed) and return int64
// - NULL handling is of type NULL_NEVER
//
// The pre-compiled fn name includes the base name, the "WithSeed" suffix & input type
// name. eg. hash64WithSeed_int8
#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE) \
NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), true, \
RESULT_NULL_NEVER, STRINGIFY(NAME##WithSeed_##TYPE))

// Iterate the inner macro over all numeric types
#define NUMERIC_TYPES(INNER, NAME) \
INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64), \
Expand All @@ -100,11 +132,6 @@ using std::vector;
#define NUMERIC_DATE_TYPES(INNER, NAME) \
NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME)

// Iterate the inner macro over all numeric types and bool type
#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME), \
INNER(NAME, boolean)

// Iterate the inner macro over all date types
#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp)

Expand All @@ -114,6 +141,14 @@ using std::vector;
// Iterate the inner macro over all variable-length types (utf8, binary)
#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary)

// Iterate the inner macro over all numeric types, date types and bool type
#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean)

// Iterate the inner macro over all numeric types, date types, bool and varlen types
#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \
NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME)

// list of registered native functions.
NativeFunction FunctionRegistry::pc_registry_[] = {
// Arithmetic operations
Expand Down Expand Up @@ -174,6 +209,18 @@ NativeFunction FunctionRegistry::pc_registry_[] = {
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffQuarter, timestamp, timestamp, int32),
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffYear, timestamp, timestamp, int32),

// hash functions
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32AsDouble),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32AsDouble),

NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64AsDouble),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64),
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64AsDouble),

// utf8/binary operations
UNARY_SAFE_NULL_IF_NULL(octet_length, utf8, int32),
UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32),
Expand Down
1 change: 1 addition & 0 deletions cpp/src/gandiva/integ/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ foreach(lib_type "shared" "static")
add_gandiva_integ_test(date_time_test.cc gandiva_${lib_type})
add_gandiva_integ_test(micro_benchmarks.cc gandiva_${lib_type})
add_gandiva_integ_test(to_string_test.cc gandiva_${lib_type})
add_gandiva_integ_test(hash_test.cc gandiva_${lib_type})
endforeach(lib_type)
142 changes: 142 additions & 0 deletions cpp/src/gandiva/integ/hash_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright (C) 2017-2018 Dremio Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "arrow/memory_pool.h"
#include "gandiva/projector.h"
#include "gandiva/status.h"
#include "gandiva/tree_expr_builder.h"
#include "integ/test_util.h"

namespace gandiva {

using arrow::boolean;
using arrow::int32;
using arrow::int64;
using arrow::utf8;

// Test fixture for the hash function integration tests. Each test builds a
// Projector using the memory pool captured in SetUp().
class TestHash : public ::testing::Test {
 public:
  // Called by gtest before every test body; fetches arrow's default memory pool.
  void SetUp() override { pool_ = arrow::default_memory_pool(); }

 protected:
  // Pool handed to Projector::Make. Initialised to nullptr so that it never
  // holds an indeterminate value before SetUp() has run.
  arrow::MemoryPool* pool_ = nullptr;
};

// Projects hash32(a, 10) and hash64(a) over an int32 column and checks that
// the results contain no nulls, that the first row hashes to 0, and that
// neighbouring rows produce different hash values.
TEST_F(TestHash, TestSimple) {
  // schema for input fields
  auto field_a = field("a", int32());
  auto schema = arrow::schema({field_a});

  // output fields
  auto res_0 = field("res0", int32());
  auto res_1 = field("res1", int64());

  // build expressions.
  //   hash32(a, 10)
  //   hash64(a)
  auto node_a = TreeExprBuilder::MakeField(field_a);
  auto literal_10 = TreeExprBuilder::MakeLiteral(static_cast<int32_t>(10));
  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a, literal_10}, int32());
  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a}, int64());
  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);

  // Build a projector for the expressions.
  // Fatal assertion: a failed Make leaves nothing usable behind `projector`,
  // so carrying on would dereference a null pointer.
  std::shared_ptr<Projector> projector;
  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data.
  // The second list passed to MakeArrowArrayInt32 marks the first row as not
  // valid (presumably null - see test_util.h).
  int num_records = 4;
  auto array_a = MakeArrowArrayInt32({1, 2, 3, 4}, {false, true, true, true});

  // prepare input record batch
  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});

  // Evaluate expression
  // Fatal assertion: the output arrays are only meaningful on success.
  arrow::ArrayVector outputs;
  status = projector->Evaluate(*in_batch, &outputs);
  ASSERT_TRUE(status.ok()) << status.message();

  // Validate results
  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
  ASSERT_TRUE(int32_arr != nullptr) << "hash32 output is not an Int32Array";
  EXPECT_EQ(int32_arr->null_count(), 0);
  // the row flagged as not valid is expected to hash to 0
  EXPECT_EQ(int32_arr->Value(0), 0);
  for (int i = 1; i < num_records; ++i) {
    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
  }

  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
  ASSERT_TRUE(int64_arr != nullptr) << "hash64 output is not an Int64Array";
  EXPECT_EQ(int64_arr->null_count(), 0);
  // the row flagged as not valid is expected to hash to 0
  EXPECT_EQ(int64_arr->Value(0), 0);
  for (int i = 1; i < num_records; ++i) {
    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
  }
}

// Projects hash32(a) and hash64(a, 10) over a utf8 column and checks that the
// results contain no nulls, that the first row hashes to 0, and that
// neighbouring rows produce different hash values.
TEST_F(TestHash, TestBuf) {
  // schema for input fields
  auto field_a = field("a", utf8());
  auto schema = arrow::schema({field_a});

  // output fields
  auto res_0 = field("res0", int32());
  auto res_1 = field("res1", int64());

  // build expressions.
  //   hash32(a)
  //   hash64(a, 10)
  auto node_a = TreeExprBuilder::MakeField(field_a);
  auto literal_10 = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(10));
  auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a}, int32());
  auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a, literal_10}, int64());
  auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
  auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);

  // Build a projector for the expressions.
  // Fatal assertion: a failed Make leaves nothing usable behind `projector`,
  // so carrying on would dereference a null pointer.
  std::shared_ptr<Projector> projector;
  Status status = Projector::Make(schema, {expr_0, expr_1}, pool_, &projector);
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data.
  // The second list passed to MakeArrowArrayUtf8 marks the first row as not
  // valid (presumably null - see test_util.h).
  int num_records = 4;
  auto array_a =
      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {false, true, true, true});

  // prepare input record batch
  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});

  // Evaluate expression
  // Fatal assertion: the output arrays are only meaningful on success.
  arrow::ArrayVector outputs;
  status = projector->Evaluate(*in_batch, &outputs);
  ASSERT_TRUE(status.ok()) << status.message();

  // Validate results
  auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
  ASSERT_TRUE(int32_arr != nullptr) << "hash32 output is not an Int32Array";
  EXPECT_EQ(int32_arr->null_count(), 0);
  // the row flagged as not valid is expected to hash to 0
  EXPECT_EQ(int32_arr->Value(0), 0);
  for (int i = 1; i < num_records; ++i) {
    EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
  }

  auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
  ASSERT_TRUE(int64_arr != nullptr) << "hash64 output is not an Int64Array";
  EXPECT_EQ(int64_arr->null_count(), 0);
  // the row flagged as not valid is expected to hash to 0
  EXPECT_EQ(int64_arr->Value(0), 0);
  for (int i = 1; i < num_records; ++i) {
    EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
  }
}

} // namespace gandiva
5 changes: 2 additions & 3 deletions cpp/src/gandiva/precompiled/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@

project(gandiva)

set(CLANG_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/clang)
set(LINK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/llvm-link)

set(PRECOMPILED_SRCS
arithmetic_ops.cc
bitmap.cc
hash.cc
print.cc
sample.cc
string_ops.cc
Expand Down Expand Up @@ -51,6 +49,7 @@ add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH})

# testing
add_precompiled_unit_test(bitmap_test.cc bitmap.cc)
add_precompiled_unit_test(hash_test.cc hash.cc)
add_precompiled_unit_test(time_test.cc time.cc)
add_precompiled_unit_test(sample_test.cc sample.cc)
add_precompiled_unit_test(string_ops_test.cc string_ops.cc)
Loading

0 comments on commit 01f46fe

Please sign in to comment.