Skip to content

Commit

Permalink
Implement ilike function
Browse files Browse the repository at this point in the history
  • Loading branch information
jvictorhuguenin committed Jun 7, 2021
1 parent 4fb9de2 commit 2efd43e
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cpp/src/gandiva/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ set(SRC_FILES
llvm_generator.cc
llvm_types.cc
like_holder.cc
ilike_holder.cc
literal_holder.cc
projector.cc
regex_util.cc
Expand Down Expand Up @@ -230,6 +231,7 @@ add_gandiva_test(internals-test
to_date_holder_test.cc
simple_arena_test.cc
like_holder_test.cc
ilike_holder_test.cc
decimal_type_util_test.cc
random_generator_holder_test.cc
hash_utils_test.cc
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/gandiva/expr_decomposer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ Status ExprDecomposer::Visit(const FieldNode& node) {
const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) {
if (node.descriptor()->name() == "like") {
return LikeHolder::TryOptimize(node);
} else if (node.descriptor()->name() == "ilike") {
return IlikeHolder::TryOptimize(node);
} else {
return node;
}
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/gandiva/function_holder_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "arrow/status.h"

#include "gandiva/function_holder.h"
#include "gandiva/ilike_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/node.h"
#include "gandiva/random_generator_holder.h"
Expand Down Expand Up @@ -62,6 +63,7 @@ class FunctionHolderRegistry {
static map_type& makers() {
static map_type maker_map = {
{"like", LAMBDA_MAKER(LikeHolder)},
{"ilike", LAMBDA_MAKER(IlikeHolder)},
{"to_date", LAMBDA_MAKER(ToDateHolder)},
{"random", LAMBDA_MAKER(RandomGeneratorHolder)},
{"rand", LAMBDA_MAKER(RandomGeneratorHolder)},
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),

NativeFunction("ilike", {}, DataTypeVector{utf8(), utf8()}, boolean(),
kResultNullIfNull, "gdv_fn_ilike_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),

NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),

Expand Down
18 changes: 18 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "gandiva/exported_funcs.h"
#include "gandiva/formatting_utils.h"
#include "gandiva/hash_utils.h"
#include "gandiva/ilike_holder.h"
#include "gandiva/in_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/precompiled/types.h"
Expand All @@ -52,6 +53,12 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
return (*holder)(std::string(data, data_len));
}

// Stub for 'ilike(utf8, utf8)'.
//
// ptr                 : address of a gandiva::IlikeHolder, passed as an integer.
// data, data_len      : bytes of the value to test.
// pattern, pattern_len: not read here; only the holder behind `ptr` is consulted.
//
// Returns the holder's verdict for the value.
bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
                            const char* pattern, int pattern_len) {
  auto& matcher = *reinterpret_cast<gandiva::IlikeHolder*>(ptr);
  // The holder's call operator takes a std::string, so copy the bytes into one.
  const std::string value(data, data_len);
  return matcher(value);
}

double gdv_fn_random(int64_t ptr) {
gandiva::RandomGeneratorHolder* holder =
reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
Expand Down Expand Up @@ -752,6 +759,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
types->i1_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));

// gdv_fn_ilike_utf8_utf8
args = {types->i64_type(), // int64_t ptr
types->i8_ptr_type(), // const char* data
types->i32_type(), // int data_len
types->i8_ptr_type(), // const char* pattern
types->i32_type()}; // int pattern_len

engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8",
types->i1_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8));

// gdv_fn_to_date_utf8_utf8
args = {types->i64_type(), // int64_t execution_context
types->i64_type(), // int64_t holder_ptr
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len,
const char* escape_char, int escape_char_len);

// Stub for 'ilike(utf8, utf8)'.
//   ptr                  - address of the function holder, passed as an integer
//   data, data_len       - bytes of the value to test against the holder
//   pattern, pattern_len - pattern argument of the call
// Returns true when the holder reports a match for the value.
bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len);

int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data,
int data_len, bool in1_validity,
const char* pattern, int pattern_len,
Expand Down
101 changes: 101 additions & 0 deletions cpp/src/gandiva/ilike_holder.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "gandiva/ilike_holder.h"

#include <regex>
#include "gandiva/node.h"
#include "gandiva/regex_util.h"

namespace gandiva {
// Options shared by every holder's regex. Default-constructed here; Make() sets
// case-insensitivity on it before constructing a holder.
RE2::Options IlikeHolder::regex_op_ = RE2::Options();

// Pre-compiled shape detectors used by TryOptimize(). They are matched against the
// holder's converted pattern text (not against column data):
//   starts_with_regex_ : word/whitespace characters followed by a literal ".*"
//   ends_with_regex_   : a literal ".*" followed by word/whitespace characters
//   is_substr_regex_   : word/whitespace characters enclosed by literal ".*" on both sides
RE2 IlikeHolder::starts_with_regex_(R"((?i)(\w|\s)*\.\*)");
RE2 IlikeHolder::ends_with_regex_(R"((?i)\.\*(\w|\s)*)");
RE2 IlikeHolder::is_substr_regex_(R"((?i)\.\*(\w|\s)*\.\*)");

// Replaces an 'ilike' call by a simpler string function when the converted pattern has
// one of three shapes (<text> being word and whitespace characters only):
//   <text>.*    -> starts_with(<input>, <text>)
//   .*<text>    -> ends_with(<input>, <text>)
//   .*<text>.*  -> is_substr(<input>, <text>)
// Any other pattern, or a node from which no holder can be built, is returned as is.
//
// NOTE(review): the literal is forwarded verbatim to the replacement function. Confirm
// that starts_with / ends_with / is_substr compare case-insensitively; if they do not,
// this rewrite changes the result of 'ilike' for inputs that differ only in case.
const FunctionNode IlikeHolder::TryOptimize(const FunctionNode& node) {
  std::shared_ptr<IlikeHolder> built;
  if (!Make(node, &built).ok()) {
    // Could not optimize, return original node.
    return node;
  }

  const std::string& pcre = built->pattern_;
  auto input = node.children().at(0);
  auto literal_type = node.children().at(1)->return_type();

  // Creates `fn_name(<input>, text)` with the same return type as the original call.
  auto rewrite = [&](const char* fn_name, const std::string& text) {
    auto text_node =
        std::make_shared<LiteralNode>(literal_type, LiteralHolder(text), false);
    return FunctionNode(fn_name, {input, text_node}, node.return_type());
  };

  if (RE2::FullMatch(pcre, starts_with_regex_)) {
    // drop the trailing ".*"
    return rewrite("starts_with", pcre.substr(0, pcre.length() - 2));
  }
  if (RE2::FullMatch(pcre, ends_with_regex_)) {
    // drop the leading ".*"
    return rewrite("ends_with", pcre.substr(2));
  }
  if (RE2::FullMatch(pcre, is_substr_regex_)) {
    // drop the ".*" on both ends
    return rewrite("is_substr", pcre.substr(2, pcre.length() - 4));
  }

  // Could not optimize, return original node.
  return node;
}

// True for the two Arrow type ids accepted as an 'ilike' pattern literal:
// STRING and BINARY.
static bool IsArrowStringLiteral(arrow::Type::type type) {
  switch (type) {
    case arrow::Type::STRING:
    case arrow::Type::BINARY:
      return true;
    default:
      return false;
  }
}

// Builds a holder from an 'ilike' function node.
//
// The node must have exactly two children and the second one must be a literal of
// string or binary type; its value is used as the SQL pattern. Returns
// Status::Invalid when any of these conditions does not hold, otherwise the status of
// the pattern-based Make() overload.
Status IlikeHolder::Make(const FunctionNode& node, std::shared_ptr<IlikeHolder>* holder) {
  const auto& args = node.children();
  ARROW_RETURN_IF(args.size() != 2,
                  Status::Invalid("'ilike' function requires two parameters"));

  // A null result means the second argument is not a LiteralNode.
  auto pattern_node = dynamic_cast<LiteralNode*>(args.at(1).get());
  ARROW_RETURN_IF(
      pattern_node == nullptr,
      Status::Invalid("'ilike' function requires a literal as the second parameter"));

  const bool is_string_literal =
      IsArrowStringLiteral(pattern_node->return_type()->id());
  ARROW_RETURN_IF(
      !is_string_literal,
      Status::Invalid(
          "'ilike' function requires a string literal as the second parameter"));

  return Make(arrow::util::get<std::string>(pattern_node->holder()), holder);
}

// Builds a holder from a SQL 'ilike' pattern.
//
// The pattern is first converted with RegexUtil::SqlLikePatternToPcre; a conversion
// error is returned unchanged. Status::Invalid is returned when RE2 rejects the
// converted pattern. On success `*holder` receives the new holder.
Status IlikeHolder::Make(const std::string& sql_pattern,
                         std::shared_ptr<IlikeHolder>* holder) {
  std::string converted;
  ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, converted));

  // The constructor compiles its regex with regex_op_, so the option has to be set
  // before the holder is created.
  // NOTE(review): regex_op_ is a static shared by all holders and is written on every
  // call; confirm Make() is not expected to run concurrently from several threads.
  regex_op_.set_case_sensitive(false);

  std::shared_ptr<IlikeHolder> candidate(new IlikeHolder(converted));
  ARROW_RETURN_IF(!candidate->regex_.ok(),
                  Status::Invalid("Building RE2 pattern '", converted, "' failed"));

  *holder = candidate;
  return Status::OK();
}
} // namespace gandiva
61 changes: 61 additions & 0 deletions cpp/src/gandiva/ilike_holder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>

#include <re2/re2.h>

#include "arrow/status.h"

#include "gandiva/function_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/node.h"
#include "gandiva/visibility.h"

namespace gandiva {

/// Function Holder for SQL 'ilike'
///
/// Keeps the regex compiled for one 'ilike' pattern and answers whether a value
/// matches that pattern in full.
class GANDIVA_EXPORT IlikeHolder : public FunctionHolder {
 public:
  ~IlikeHolder() override = default;

  /// Build a holder from an 'ilike' function node.
  static Status Make(const FunctionNode& node, std::shared_ptr<IlikeHolder>* holder);

  /// Build a holder from a SQL pattern string.
  static Status Make(const std::string& sql_pattern,
                     std::shared_ptr<IlikeHolder>* holder);

  // Try and optimise a function node with a "ilike" pattern.
  static const FunctionNode TryOptimize(const FunctionNode& node);

  /// Return true if the data matches the pattern.
  /// Matching only reads the compiled regex, so the operator is const.
  bool operator()(const std::string& data) const {
    return RE2::FullMatch(data, regex_);
  }

 private:
  // Compiles `pattern` with the shared regex options; callers check regex_.ok().
  explicit IlikeHolder(const std::string& pattern)
      : pattern_(pattern), regex_(pattern, regex_op_) {}

  std::string pattern_;  // pattern text given to the regex, to help debugging
  RE2 regex_;            // compiled regex for the pattern

  static RE2 starts_with_regex_;  // pre-compiled pattern for matching starts_with
  static RE2 ends_with_regex_;    // pre-compiled pattern for matching ends_with
  static RE2 is_substr_regex_;    // pre-compiled pattern for matching is_substr
  static RE2::Options regex_op_;  // regex option for insensitive case
};
} // namespace gandiva
122 changes: 122 additions & 0 deletions cpp/src/gandiva/ilike_holder_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "gandiva/ilike_holder.h"
#include "gandiva/regex_util.h"

#include <memory>
#include <vector>

#include <gtest/gtest.h>

namespace gandiva {

// Fixture for the IlikeHolder tests.
class TestILikeHolder : public ::testing::Test {
 public:
  // Returns the node `ilike(in, <pattern>)`, where `in` is a utf8 field and the
  // pattern is a non-null utf8 literal.
  FunctionNode BuildILike(std::string pattern) {
    auto input_field = arrow::field("in", arrow::utf8());
    auto input_node = std::make_shared<FieldNode>(input_field);

    auto literal_node =
        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);

    return FunctionNode("ilike", {input_node, literal_node}, arrow::boolean());
  }
};

// '%' matches any run of characters (including none); letters match in either case.
TEST_F(TestILikeHolder, TestMatchAny) {
  std::shared_ptr<IlikeHolder> ilike_holder;

  auto status = IlikeHolder::Make("ab%", &ilike_holder);
  // Fatal on failure: the holder is dereferenced right below.
  ASSERT_TRUE(status.ok()) << status.message();

  auto& like = *ilike_holder;
  EXPECT_TRUE(like("ab"));
  EXPECT_TRUE(like("aBc"));
  EXPECT_TRUE(like("ABCD"));

  EXPECT_FALSE(like("a"));
  EXPECT_FALSE(like("cab"));
}

// '_' matches exactly one character; letters match in either case.
TEST_F(TestILikeHolder, TestMatchOne) {
  std::shared_ptr<IlikeHolder> ilike_holder;

  auto status = IlikeHolder::Make("Ab_", &ilike_holder);
  // Fatal on failure: the holder is dereferenced right below.
  ASSERT_TRUE(status.ok()) << status.message();

  auto& like = *ilike_holder;
  EXPECT_TRUE(like("abc"));
  EXPECT_TRUE(like("aBd"));

  EXPECT_FALSE(like("A"));
  EXPECT_FALSE(like("Abcd"));
  EXPECT_FALSE(like("DaBc"));
}

// Regex metacharacters in the SQL pattern are matched literally.
TEST_F(TestILikeHolder, TestPcreSpecial) {
  std::shared_ptr<IlikeHolder> ilike_holder;

  auto status = IlikeHolder::Make(".*aB_", &ilike_holder);
  // Fatal on failure: the holder is dereferenced right below.
  ASSERT_TRUE(status.ok()) << status.message();

  auto& like = *ilike_holder;
  EXPECT_TRUE(like(".*Abc"));  // . and * aren't special in sql regex
  EXPECT_FALSE(like("xxAbc"));
}

// A '.' in the SQL pattern stands for a literal dot, not for "any character".
TEST_F(TestILikeHolder, TestDot) {
  std::shared_ptr<IlikeHolder> ilike_holder;

  auto status = IlikeHolder::Make("aBc.", &ilike_holder);
  // Fatal on failure: the holder is dereferenced right below.
  ASSERT_TRUE(status.ok()) << status.message();

  auto& like = *ilike_holder;
  // Positive cases guard against a holder that simply rejects everything.
  EXPECT_TRUE(like("abc."));
  EXPECT_TRUE(like("ABC."));

  EXPECT_FALSE(like("abcd"));
}

// Checks which 'ilike' patterns TryOptimize() rewrites and which it leaves alone.
//
// NOTE(review): these expectations pin the rewrite to starts_with / ends_with /
// is_substr with the literal passed through unchanged; confirm those functions are
// case-insensitive, otherwise the rewritten expression no longer behaves like 'ilike'.
TEST_F(TestILikeHolder, TestOptimise) {
  auto optimise = [this](const std::string& sql_pattern) {
    return IlikeHolder::TryOptimize(BuildILike(sql_pattern));
  };

  // optimise for 'starts_with'
  const auto prefix_node = optimise("xy 123z%");
  EXPECT_EQ(prefix_node.descriptor()->name(), "starts_with");
  EXPECT_EQ(prefix_node.ToString(),
            "bool starts_with((string) in, (const string) xy 123z)");

  // optimise for 'ends_with'
  const auto suffix_node = optimise("%xyz");
  EXPECT_EQ(suffix_node.descriptor()->name(), "ends_with");
  EXPECT_EQ(suffix_node.ToString(), "bool ends_with((string) in, (const string) xyz)");

  // optimise for 'is_substr'
  const auto substr_node = optimise("%abc%");
  EXPECT_EQ(substr_node.descriptor()->name(), "is_substr");
  EXPECT_EQ(substr_node.ToString(), "bool is_substr((string) in, (const string) abc)");

  // no optimisation for others.
  EXPECT_EQ(optimise("xyz_").descriptor()->name(), "ilike");
  EXPECT_EQ(optimise("_xyz").descriptor()->name(), "ilike");
  EXPECT_EQ(optimise("_xyz_").descriptor()->name(), "ilike");
  EXPECT_EQ(optimise("%xyz_").descriptor()->name(), "ilike");
  EXPECT_EQ(optimise("x_yz%").descriptor()->name(), "ilike");
}

} // namespace gandiva

0 comments on commit 2efd43e

Please sign in to comment.