From 2f46e8a3b519b3eb9e0b9afc9aec106f676ebbdf Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Wed, 5 Jan 2022 12:07:22 +0800 Subject: [PATCH] Add translate expression support (#68) * Initial commit * Introduce TranslateHolder * Remove unused header --- cpp/src/gandiva/CMakeLists.txt | 2 + cpp/src/gandiva/function_holder_registry.h | 2 + cpp/src/gandiva/function_registry_string.cc | 5 ++ cpp/src/gandiva/gdv_function_stubs.cc | 26 +++++++++ cpp/src/gandiva/gdv_function_stubs.h | 5 ++ cpp/src/gandiva/precompiled/string_ops.cc | 1 + cpp/src/gandiva/translate_holder.cc | 62 +++++++++++++++++++++ cpp/src/gandiva/translate_holder.h | 44 +++++++++++++++ cpp/src/gandiva/translate_holder_test.cc | 46 +++++++++++++++ 9 files changed, 193 insertions(+) create mode 100644 cpp/src/gandiva/translate_holder.cc create mode 100644 cpp/src/gandiva/translate_holder.h create mode 100644 cpp/src/gandiva/translate_holder_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 0b7a45fd30cee..78fc77705c51f 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -84,6 +84,7 @@ set(SRC_FILES llvm_types.cc like_holder.cc json_holder.cc + translate_holder.cc literal_holder.cc projector.cc regex_util.cc @@ -231,6 +232,7 @@ add_gandiva_test(internals-test to_date_holder_test.cc simple_arena_test.cc json_holder_test.cc + translate_holder_test.cc like_holder_test.cc replace_holder_test.cc decimal_type_util_test.cc diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index ed111c86dcfdf..39b6bd8be1902 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -31,6 +31,7 @@ #include "gandiva/random_generator_holder.h" #include "gandiva/replace_holder.h" #include "gandiva/to_date_holder.h" +#include "gandiva/translate_holder.h" namespace gandiva { @@ -70,6 +71,7 @@ class FunctionHolderRegistry { {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, 
{"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, {"regexp_replace", LAMBDA_MAKER(ReplaceHolder)}, + {"translate", LAMBDA_MAKER(TranslateHolder)} }; return maker_map; } diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 457b87df9949e..39130ab3a2fc4 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -172,6 +172,11 @@ std::vector GetStringFunctionRegistry() { kResultNullInternal, "gdv_fn_get_json_object_utf8_utf8", NativeFunction::kNeedsContext | NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), + + NativeFunction("translate", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(), + kResultNullIfNull, "gdv_fn_translate_utf8_utf8_utf8", + NativeFunction::kNeedsContext | NativeFunction::kNeedsFunctionHolder | + NativeFunction::kCanReturnErrors), NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 952af2f1e43e9..ea34a03569a30 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -33,6 +33,7 @@ #include "gandiva/random_generator_holder.h" #include "gandiva/replace_holder.h" #include "gandiva/to_date_holder.h" +#include "gandiva/translate_holder.h" /// Stub functions that can be accessed from LLVM or the pre-compiled library. 
@@ -57,6 +58,17 @@ const uint8_t* gdv_fn_get_json_object_utf8_utf8(int64_t ptr, int64_t holder_ptr, return res; } +const uint8_t* gdv_fn_translate_utf8_utf8_utf8(int64_t ptr, int64_t holder_ptr, const char* text, + int text_len, const char* matching_str, + int matching_str_len, const char* replace_str, + int replace_str_len, int32_t* out_len) { + gandiva::ExecutionContext* context = reinterpret_cast(ptr); + gandiva::TranslateHolder* holder = reinterpret_cast(holder_ptr); + auto res = (*holder)(context, std::string(text, text_len), std::string(matching_str, matching_str_len), + std::string(replace_str, replace_str_len), out_len); + return res; +} + bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len) { gandiva::LikeHolder* holder = reinterpret_cast(ptr); @@ -517,6 +529,20 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("gdv_fn_get_json_object_utf8_utf8", types->i8_ptr_type() /*return_type*/, args, reinterpret_cast(gdv_fn_get_json_object_utf8_utf8)); + + // gdv_fn_translate_utf8_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i64_type(), // int64_t holder_ptr + types->i8_ptr_type(), // const char* text + types->i32_type(), // int text_len + types->i8_ptr_type(), // const char* matching_str + types->i32_type(), // int matching_str_len + types->i8_ptr_type(), // const char* replace_str + types->i32_type(), // int replace_str_len + types->i32_ptr_type()}; // int* out_len + engine->AddGlobalMappingForFunc("gdv_fn_translate_utf8_utf8_utf8", + types->i8_ptr_type() /*return types*/, args, + reinterpret_cast(gdv_fn_translate_utf8_utf8_utf8)); // gdv_fn_like_utf8_utf8 args = {types->i64_type(), // int64_t ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 2a92f25493640..9416a5bb523c0 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -57,6 +57,11 @@ bool 
gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, const uint8_t* gdv_fn_get_json_object_utf8_utf8(int64_t ptr, int64_t holder_ptr, const char* data, int data_len, bool in1_valid, const char* pattern, int pattern_len, bool in2_valid, bool* out_valid, int32_t* out_len); +const uint8_t* gdv_fn_translate_utf8_utf8_utf8(int64_t ptr, int64_t holder_ptr, const char* text, + int text_len, const char* matching_str, + int matching_str_len, const char* replace_str, + int replace_str_len, int32_t* out_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index fa9164bd1396c..168ddf29ccf0f 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -17,6 +17,7 @@ // String functions #include "arrow/util/value_parsing.h" + extern "C" { #include diff --git a/cpp/src/gandiva/translate_holder.cc b/cpp/src/gandiva/translate_holder.cc new file mode 100644 index 0000000000000..a8b0d4bfb2e97 --- /dev/null +++ b/cpp/src/gandiva/translate_holder.cc @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/translate_holder.h"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+/// Creates a TranslateHolder for a function node. The node is not inspected;
+/// the call forwards to the node-less overload.
+Status TranslateHolder::Make(const FunctionNode& node,
+                             std::shared_ptr<TranslateHolder>* holder) {
+  return Make(holder);
+}
+
+/// Creates a TranslateHolder and stores it in `*holder`. Always returns OK.
+Status TranslateHolder::Make(std::shared_ptr<TranslateHolder>* holder) {
+  *holder = std::make_shared<TranslateHolder>();
+  return Status::OK();
+}
+
+/// Translates `text` byte by byte: every byte that occurs in `matching_str` is
+/// replaced by the byte at the same index in `replace_str`, or removed when
+/// `replace_str` has no byte at that index (or holds a NUL byte there). Bytes
+/// that do not occur in `matching_str` are copied unchanged.
+///
+/// \param[in] ctx execution context; its arena provides the output buffer
+/// \param[in] text input string
+/// \param[in] matching_str bytes to look for
+/// \param[in] replace_str replacement bytes, paired with matching_str by position
+/// \param[out] out_len number of bytes in the returned buffer
+/// \return the translated bytes (not NUL-terminated), allocated from the arena
+///         of `ctx`; "" when the result is empty or when the allocation fails
+///         (in the latter case an error message is set on `ctx`)
+///
+/// NOTE(review): when a byte is repeated in matching_str the last occurrence
+/// wins, exactly as before. Oracle's TRANSLATE documents first-occurrence-wins;
+/// confirm which behaviour is intended.
+/// NOTE(review): the translation works on bytes, so multi-byte UTF-8 characters
+/// in any argument are not treated as single characters.
+const uint8_t* TranslateHolder::operator()(gandiva::ExecutionContext* ctx,
+                                           std::string text, std::string matching_str,
+                                           std::string replace_str, int32_t* out_len) {
+  // Per-byte action table: kKeep copies the byte through, kDrop removes it and
+  // any other value is the replacement byte.
+  constexpr int16_t kKeep = -1;
+  constexpr int16_t kDrop = 0;
+  std::array<int16_t, 256> action;
+  action.fill(kKeep);
+  for (size_t i = 0; i < matching_str.size(); ++i) {
+    const auto from = static_cast<unsigned char>(matching_str[i]);
+    // Bytes of matching_str past the end of replace_str are deleted. A NUL byte
+    // in replace_str also lands on kDrop, which keeps the former '\0' sentinel
+    // behaviour.
+    action[from] =
+        i < replace_str.size()
+            ? static_cast<int16_t>(static_cast<unsigned char>(replace_str[i]))
+            : kDrop;
+  }
+
+  // `text` is this function's own copy, so the translated bytes are compacted
+  // into it in place instead of into a stack array sized by the input.
+  size_t kept = 0;
+  for (size_t i = 0; i < text.size(); ++i) {
+    const int16_t mapped = action[static_cast<unsigned char>(text[i])];
+    if (mapped == kKeep) {
+      text[kept++] = text[i];
+    } else if (mapped != kDrop) {
+      text[kept++] = static_cast<char>(mapped);
+    }
+  }
+
+  *out_len = static_cast<int32_t>(kept);
+  if (kept == 0) {
+    // Nothing to copy; do not ask the arena for a zero-sized block.
+    return reinterpret_cast<const uint8_t*>("");
+  }
+  auto* result_buffer = reinterpret_cast<uint8_t*>(ctx->arena()->Allocate(*out_len));
+  if (result_buffer == nullptr) {
+    ctx->set_error_msg("Could not allocate memory for output string");
+    *out_len = 0;
+    return reinterpret_cast<const uint8_t*>("");
+  }
+  memcpy(result_buffer, text.data(), kept);
+  return result_buffer;
+}
+
+} // namespace gandiva
\ No newline at end of file
diff --git a/cpp/src/gandiva/translate_holder.h b/cpp/src/gandiva/translate_holder.h
new file mode 100644
index 0000000000000..bbe4668c37d7e
--- /dev/null
+++ b/cpp/src/gandiva/translate_holder.h
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "gandiva/execution_context.h" +#include "gandiva/function_holder.h" +#include "gandiva/node.h" +#include "gandiva/visibility.h" + +namespace gandiva { + +/// Function Holder for SQL 'translate'. +/// +/// Declares no data members of its own. +class GANDIVA_EXPORT TranslateHolder : public FunctionHolder { + public: + TranslateHolder() {} + ~TranslateHolder() override = default; + + /// Creates a holder for the function node `node` and stores it in `*holder`. + /// The node itself is not inspected. + static Status Make(const FunctionNode& node, std::shared_ptr* holder); + /// Creates a holder and stores it in `*holder`. + static Status Make(std::shared_ptr* holder); + + /// Returns `text` with each character that occurs in `matching_str` replaced by + /// the character at the same position in `replace_str`; characters of + /// `matching_str` that have no counterpart in `replace_str` are removed. The + /// length of the result is written to `*out_len`, and the result buffer is + /// allocated from the arena of `ctx`. + const uint8_t* operator()(gandiva::ExecutionContext* ctx, std::string text, + std::string matching_str, std::string replace_str, int32_t* out_len); +}; + +} // namespace gandiva diff --git a/cpp/src/gandiva/translate_holder_test.cc b/cpp/src/gandiva/translate_holder_test.cc new file mode 100644 index 0000000000000..966f0046a980d --- /dev/null +++ b/cpp/src/gandiva/translate_holder_test.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/translate_holder.h" + +#include + +namespace gandiva { +// Fixture providing the ExecutionContext that is passed to the holder in each call. +class TestTranslateHolder : public ::testing::Test { + protected: + ExecutionContext ctx_; +}; + +// Checks character removal and single-character replacement through a holder built by Make(). +TEST_F(TestTranslateHolder, TestTranslate) { + std::shared_ptr translate_holder; + + auto status = TranslateHolder::Make(&translate_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + // Copy of the holder, so that it can be invoked with plain call syntax below. + auto translate = *translate_holder; + + int32_t out_len; + const uint8_t* out_str; + + // An empty replace string removes every character listed in the matching string. + out_str = translate(&ctx_, "ab[cd]", "[]", "", &out_len); + EXPECT_EQ(std::string((char*)out_str, out_len), "abcd"); + + // '[' maps to '#'; ']' has no counterpart in the replace string and is removed. + out_str = translate(&ctx_, "ab[cd]", "[]", "#", &out_len); + EXPECT_EQ(std::string((char*)out_str, out_len), "ab#cd"); +} + +} // namespace gandiva \ No newline at end of file