From 0072c677fbbc85832fa7a90ab49daf7c1f99a373 Mon Sep 17 00:00:00 2001 From: frank400 Date: Tue, 6 Jul 2021 12:19:05 +0530 Subject: [PATCH] ARROW-12567: [C++][Gandiva] Implement ILIKE SQL function Closes #10179 from jvictorhuguenin/feature/implement-sql-ilike and squashes the following commits: f160880d2 Optimize holder constructor call 97e6e2d83 Remove unnecessary Make method c2363b10f Disable TryOptimize for ilike a48414931 Fix checkstyle on cmake file c6a8372cd Delete unnecessary holder 4be6cc611 Fix redefined function b78085a14 Fix miss include 2efd43e2b Implement ilike function Authored-by: frank400 Signed-off-by: Praveen --- cpp/src/gandiva/function_holder_registry.h | 1 + cpp/src/gandiva/function_registry_string.cc | 4 ++ cpp/src/gandiva/gdv_function_stubs.cc | 17 +++++ cpp/src/gandiva/gdv_function_stubs.h | 3 + cpp/src/gandiva/like_holder.cc | 21 ++++++ cpp/src/gandiva/like_holder.h | 6 ++ cpp/src/gandiva/like_holder_test.cc | 75 +++++++++++++++++++-- 7 files changed, 123 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index e1c5630e84191..225c73207fcc0 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -62,6 +62,7 @@ class FunctionHolderRegistry { static map_type& makers() { static map_type maker_map = { {"like", LAMBDA_MAKER(LikeHolder)}, + {"ilike", LAMBDA_MAKER(LikeHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 90e2231677357..7491e4435a9dc 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -131,6 +131,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8", NativeFunction::kNeedsFunctionHolder), +
NativeFunction("ilike", {}, DataTypeVector{utf8(), utf8()}, boolean(), + kResultNullIfNull, "gdv_fn_ilike_utf8_utf8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 38c31a8c3f505..3c278049ed6fb 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -52,6 +52,12 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len) { + gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr); + return (*holder)(std::string(data, data_len)); +} + double gdv_fn_random(int64_t ptr) { gandiva::RandomGeneratorHolder* holder = reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr); @@ -807,6 +813,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8)); + // gdv_fn_ilike_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type()}; // int pattern_len + + engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8)); + // gdv_fn_to_date_utf8_utf8 args = {types->i64_type(), // int64_t execution_context types->i64_type(), // int64_t holder_ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index ee22c3f4ece14..043e94034ed7f 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -50,6 +50,9 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, const
char* pattern, int pattern_len, const char* escape_char, int escape_char_len); +bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 5a3510e36528b..af9ac67d66ac4 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -80,6 +80,13 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h !IsArrowStringLiteral(literal_type), Status::Invalid( "'like' function requires a string literal as the second parameter")); + + RE2::Options regex_op; + if (node.descriptor()->name() == "ilike") { + regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. + + return Make(arrow::util::get<std::string>(literal->holder()), holder, regex_op); + } if (node.children().size() == 2) { return Make(arrow::util::get<std::string>(literal->holder()), holder); } else { @@ -132,4 +139,18 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap return Status::OK(); } +Status LikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr<LikeHolder>* holder, RE2::Options regex_op) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + + std::shared_ptr<LikeHolder> lholder; + lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op)); + + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index c7982e9143748..73e58017de19f 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -42,6 +42,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static
Status Make(const std::string& sql_pattern, const std::string& escape_char, std::shared_ptr<LikeHolder>* holder); + static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder, + RE2::Options regex_op); + // Try and optimise a function node with a "like" pattern. static const FunctionNode TryOptimize(const FunctionNode& node); @@ -51,6 +54,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { private: explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {} + LikeHolder(const std::string& pattern, RE2::Options regex_op) + : pattern_(pattern), regex_(pattern, regex_op) {} + std::string pattern_; // posix pattern string, to help debugging RE2 regex_; // compiled regex for the pattern diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 18e585fc502d4..a52533a113836 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -27,6 +27,7 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: + RE2::Options regex_op; FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -48,7 +49,7 @@ class TestLikeHolder : public ::testing::Test { TEST_F(TestLikeHolder, TestMatchAny) { std::shared_ptr<LikeHolder> like_holder; - auto status = LikeHolder::Make("ab%", &like_holder); + auto status = LikeHolder::Make("ab%", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -63,7 +64,7 @@ TEST_F(TestLikeHolder, TestMatchAny) { TEST_F(TestLikeHolder, TestMatchOne) { std::shared_ptr<LikeHolder> like_holder; - auto status = LikeHolder::Make("ab_", &like_holder); + auto status = LikeHolder::Make("ab_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -78,7 +79,7 @@ TEST_F(TestLikeHolder, TestMatchOne) { TEST_F(TestLikeHolder, TestPcreSpecial) { std::shared_ptr<LikeHolder> like_holder; - auto status =
LikeHolder::Make(".*ab_", &like_holder); + auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -97,7 +98,7 @@ TEST_F(TestLikeHolder, TestRegexEscape) { TEST_F(TestLikeHolder, TestDot) { std::shared_ptr<LikeHolder> like_holder; - auto status = LikeHolder::Make("abc.", &like_holder); + auto status = LikeHolder::Make("abc.", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -211,4 +212,70 @@ TEST_F(TestLikeHolder, TestMultipleEscapeChar) { auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); EXPECT_EQ(status.ok(), false) << status.message(); } +class TestILikeHolder : public ::testing::Test { + public: + RE2::Options regex_op; + FunctionNode BuildILike(std::string pattern) { + auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false); + return FunctionNode("ilike", {field, pattern_node}, arrow::boolean()); + } +}; + +TEST_F(TestILikeHolder, TestMatchAny) { + std::shared_ptr<LikeHolder> like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("ab%", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like("ab")); + EXPECT_TRUE(like("aBc")); + EXPECT_TRUE(like("ABCD")); + + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("cab")); +} + +TEST_F(TestILikeHolder, TestMatchOne) { + std::shared_ptr<LikeHolder> like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("Ab_", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc")); + EXPECT_TRUE(like("aBd")); + + EXPECT_FALSE(like("A")); + EXPECT_FALSE(like("Abcd")); + EXPECT_FALSE(like("DaBc")); +} + +TEST_F(TestILikeHolder, TestPcreSpecial) { + std::shared_ptr<LikeHolder> like_holder; + + regex_op.set_case_sensitive(false); +
auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex + EXPECT_FALSE(like("xxAbc")); +} + +TEST_F(TestILikeHolder, TestDot) { + std::shared_ptr<LikeHolder> like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("aBc.", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_FALSE(like("abcd")); +} + } // namespace gandiva