-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-12410: [C++][Gandiva] Implement regexp_replace function on Gandiva
Closes #10059 from rodrigojdebem/feature/implement-regexp-replace and squashes the following commits: baf2778 <rodrigojdebem> Add implementation for REGEXP_REPLACE Authored-by: rodrigojdebem <rodrigodebem1@gmail.com> Signed-off-by: Praveen <praveen@dremio.com>
- Loading branch information
1 parent
87e0252
commit 5f0641b
Showing
9 changed files
with
394 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "gandiva/replace_holder.h" | ||
|
||
#include "gandiva/node.h" | ||
#include "gandiva/regex_util.h" | ||
|
||
namespace gandiva { | ||
|
||
/// Reports whether `type` is one of the two Arrow type ids accepted for the
/// pattern literal: STRING or BINARY.
static bool IsArrowStringLiteral(arrow::Type::type type) {
  switch (type) {
    case arrow::Type::STRING:
    case arrow::Type::BINARY:
      return true;
    default:
      return false;
  }
}
|
||
/// Builds a ReplaceHolder from an expression's function node.
///
/// The node must have exactly three children, and the second child must be a
/// literal whose type id is STRING or BINARY. The value of that literal is
/// forwarded as the pattern to the string-based Make overload.
///
/// \param[in] node function node to inspect
/// \param[out] holder receives the new holder on success
/// \return Status::Invalid when any of the checks above fails, otherwise the
///         status of the string-based overload
Status ReplaceHolder::Make(const FunctionNode& node,
                           std::shared_ptr<ReplaceHolder>* holder) {
  const auto& args = node.children();
  ARROW_RETURN_IF(args.size() != 3,
                  Status::Invalid("'replace' function requires three parameters"));

  // Index 1 is in range: the arity was verified just above.
  auto pattern_node = dynamic_cast<LiteralNode*>(args[1].get());
  ARROW_RETURN_IF(
      pattern_node == nullptr,
      Status::Invalid("'replace' function requires a literal as the second parameter"));

  const bool is_string_literal =
      IsArrowStringLiteral(pattern_node->return_type()->id());
  ARROW_RETURN_IF(
      !is_string_literal,
      Status::Invalid(
          "'replace' function requires a string literal as the second parameter"));

  const auto& pattern = arrow::util::get<std::string>(pattern_node->holder());
  return Make(pattern, holder);
}
|
||
/// Builds a ReplaceHolder by compiling `sql_pattern` with RE2.
///
/// \param[in] sql_pattern regular expression to compile
/// \param[out] holder receives the new holder; left untouched on failure
/// \return Status::Invalid, including RE2's own diagnostic, when the pattern
///         does not compile; Status::OK() otherwise
Status ReplaceHolder::Make(const std::string& sql_pattern,
                           std::shared_ptr<ReplaceHolder>* holder) {
  // The constructor is private, so std::make_shared cannot be used here.
  auto lholder = std::shared_ptr<ReplaceHolder>(new ReplaceHolder(sql_pattern));

  // Surface RE2's explanation so the caller can see why the pattern was
  // rejected instead of only learning that it was.
  ARROW_RETURN_IF(!lholder->regex_.ok(),
                  Status::Invalid("Building RE2 pattern '", sql_pattern,
                                  "' failed with: ", lholder->regex_.error()));

  *holder = lholder;
  return Status::OK();
}
|
||
/// Records a replacement failure on the execution context.
///
/// The message names the replacement string, the input string and the pattern
/// stored in this holder.
///
/// \param[in] context execution context that receives the error message
/// \param[in] data input string the replacement was attempted on
/// \param[in] replace_string replacement string that was supplied
void ReplaceHolder::return_error(ExecutionContext* context, std::string& data,
                                 std::string& replace_string) {
  std::string message("Error replacing '");
  message.append(replace_string);
  message.append("' on the given string '");
  message.append(data);
  message.append("' for the given pattern: ");
  message.append(pattern_);
  context->set_error_msg(message.c_str());
}
|
||
} // namespace gandiva |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include <re2/re2.h>

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

#include "arrow/status.h"
#include "gandiva/execution_context.h"
#include "gandiva/function_holder.h"
#include "gandiva/node.h"
#include "gandiva/visibility.h"
|
||
namespace gandiva { | ||
|
||
/// Function Holder for 'replace'
///
/// Holds the RE2 regex compiled from the pattern argument and applies it to
/// input strings, replacing every match with a caller-supplied string.
class GANDIVA_EXPORT ReplaceHolder : public FunctionHolder {
 public:
  ~ReplaceHolder() override = default;

  /// Build a holder from the function node of an expression.
  static Status Make(const FunctionNode& node, std::shared_ptr<ReplaceHolder>* holder);

  /// Build a holder directly from a pattern string.
  static Status Make(const std::string& sql_pattern,
                     std::shared_ptr<ReplaceHolder>* holder);

  /// Return a new string with the pattern that matched the regex replaced for
  /// the replace_input parameter.
  ///
  /// \param[in] ctx execution context; receives error messages and provides
  ///            the arena the result is allocated from
  /// \param[in] user_input bytes of the string to search
  /// \param[in] user_input_len number of bytes in user_input
  /// \param[in] replace_input bytes of the rewrite string
  /// \param[in] replace_input_len number of bytes in replace_input
  /// \param[out] out_length number of bytes in the returned string
  /// \return `user_input` itself when nothing matched, an arena-allocated copy
  ///         of the rewritten string otherwise, or "" (with *out_length == 0)
  ///         when the result is empty or an error was recorded on ctx
  const char* operator()(ExecutionContext* ctx, const char* user_input,
                         int32_t user_input_len, const char* replace_input,
                         int32_t replace_input_len, int32_t* out_length) {
    std::string user_input_as_str(user_input, user_input_len);
    std::string replace_input_as_str(replace_input, replace_input_len);

    // RE2::GlobalReplace returns the number of replacements, which is never
    // negative, so a malformed rewrite string (dangling backslash, reference
    // to a group the pattern does not have) cannot be told apart from
    // "no match" afterwards. Validate the rewrite string up front instead.
    std::string rewrite_error;
    if (!regex_.CheckRewriteString(replace_input_as_str, &rewrite_error)) {
      return_error(ctx, user_input_as_str, replace_input_as_str);
      *out_length = 0;
      return "";
    }

    const int total_replaces =
        RE2::GlobalReplace(&user_input_as_str, regex_, replace_input_as_str);

    // Nothing matched: hand back the caller's buffer without copying.
    if (total_replaces == 0) {
      *out_length = user_input_len;
      return user_input;
    }

    *out_length = static_cast<int32_t>(user_input_as_str.size());

    // This condition treats the case where the whole string is replaced by an empty
    // string
    if (*out_length == 0) {
      return "";
    }

    // The rewritten text lives in a local std::string, so it is copied into
    // memory obtained from the context's arena before being returned.
    char* result_buffer = reinterpret_cast<char*>(ctx->arena()->Allocate(*out_length));

    if (result_buffer == NULLPTR) {
      ctx->set_error_msg("Could not allocate memory for result");
      *out_length = 0;
      return "";
    }

    std::memcpy(result_buffer, user_input_as_str.data(), *out_length);

    return result_buffer;
  }

 private:
  /// Compiles `pattern`; callers are expected to check regex_.ok() afterwards.
  explicit ReplaceHolder(const std::string& pattern)
      : pattern_(pattern), regex_(pattern) {}

  /// Sets an error message on `context` describing the failed replacement.
  void return_error(ExecutionContext* context, std::string& data,
                    std::string& replace_string);

  std::string pattern_;  // original pattern string, used in error messages
  RE2 regex_;            // compiled regex for the pattern
};
|
||
} // namespace gandiva |
Oops, something went wrong.