Skip to content

Commit

Permalink
[feature](json)support explode_json_object func (#36887)
Browse files Browse the repository at this point in the history
after this pr we support
expand json object into two column with key column (string) ,and value
column(json value)
  • Loading branch information
amorynan authored Jul 2, 2024
1 parent 3716fb5 commit d60dbb5
Show file tree
Hide file tree
Showing 11 changed files with 477 additions and 0 deletions.
2 changes: 2 additions & 0 deletions be/src/vec/exprs/table_function/table_function_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "vec/exprs/table_function/vexplode.h"
#include "vec/exprs/table_function/vexplode_bitmap.h"
#include "vec/exprs/table_function/vexplode_json_array.h"
#include "vec/exprs/table_function/vexplode_json_object.h"
#include "vec/exprs/table_function/vexplode_map.h"
#include "vec/exprs/table_function/vexplode_numbers.h"
#include "vec/exprs/table_function/vexplode_split.h"
Expand Down Expand Up @@ -58,6 +59,7 @@ const std::unordered_map<std::string, std::function<std::unique_ptr<TableFunctio
{"explode_json_array_json", VExplodeJsonArrayCreator<ParsedDataJSON>()},
{"explode_bitmap", TableFunctionCreator<VExplodeBitmapTableFunction>()},
{"explode_map", TableFunctionCreator<VExplodeMapTableFunction> {}},
{"explode_json_object", TableFunctionCreator<VExplodeJsonObjectTableFunction> {}},
{"explode", TableFunctionCreator<VExplodeTableFunction> {}}};

Status TableFunctionFactory::get_fn(const TFunction& t_fn, ObjectPool* pool, TableFunction** fn) {
Expand Down
160 changes: 160 additions & 0 deletions be/src/vec/exprs/table_function/vexplode_json_object.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/exprs/table_function/vexplode_json_object.h"

#include <glog/logging.h>

#include <ostream>
#include <vector>

#include "common/status.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"

namespace doris::vectorized {

VExplodeJsonObjectTableFunction::VExplodeJsonObjectTableFunction() {
_fn_name = "vexplode_json_object";
}

Status VExplodeJsonObjectTableFunction::process_init(Block* block, RuntimeState* state) {
CHECK(_expr_context->root()->children().size() == 1)
<< "VExplodeJsonObjectTableFunction only support 1 child but has "
<< _expr_context->root()->children().size();

int text_column_idx = -1;
RETURN_IF_ERROR(_expr_context->root()->children()[0]->execute(_expr_context.get(), block,
&text_column_idx));

_json_object_column = block->get_by_position(text_column_idx).column;
return Status::OK();
}

void VExplodeJsonObjectTableFunction::process_row(size_t row_idx) {
TableFunction::process_row(row_idx);

StringRef text = _json_object_column->get_data_at(row_idx);
if (text.data != nullptr) {
JsonbDocument* doc = JsonbDocument::createDocument(text.data, text.size);
if (UNLIKELY(!doc || !doc->getValue())) {
// error jsonb, put null into output, cur_size = 0 , we will insert_default
return;
}
// value is NOT necessary to be deleted since JsonbValue will not allocate memory
JsonbValue* value = doc->getValue();
auto writer = std::make_unique<JsonbWriter>();
if (value->isObject()) {
_cur_size = value->length();
ObjectVal* obj = (ObjectVal*)value;
_object_pairs.first =
ColumnNullable::create(ColumnString::create(), ColumnUInt8::create());
_object_pairs.second =
ColumnNullable::create(ColumnString::create(), ColumnUInt8::create());
_object_pairs.first->reserve(_cur_size);
_object_pairs.second->reserve(_cur_size);
for (auto it = obj->begin(); it != obj->end(); ++it) {
_object_pairs.first->insert_data(it->getKeyStr(), it->klen());
writer->reset();
writer->writeValue(it->value());
if (it->value()->isNull()) {
_object_pairs.second->insert_default();
} else {
const std::string_view& jsonb_value = std::string_view(
writer->getOutput()->getBuffer(), writer->getOutput()->getSize());
_object_pairs.second->insert_data(jsonb_value.data(), jsonb_value.size());
}
}
}
// we do not support other json type except object
}
}

void VExplodeJsonObjectTableFunction::process_close() {
_json_object_column = nullptr;
_object_pairs.first = nullptr;
_object_pairs.second = nullptr;
}

void VExplodeJsonObjectTableFunction::get_same_many_values(MutableColumnPtr& column, int length) {
// if current is empty map row, also append a default value
if (current_empty()) {
column->insert_many_defaults(length);
return;
}
ColumnStruct* ret = nullptr;
// this _is_nullable is whole output column's nullable
if (_is_nullable) {
// make map kv value into struct
ret = assert_cast<ColumnStruct*>(
assert_cast<ColumnNullable*>(column.get())->get_nested_column_ptr().get());
assert_cast<ColumnUInt8*>(
assert_cast<ColumnNullable*>(column.get())->get_null_map_column_ptr().get())
->insert_many_defaults(length);
} else if (column->is_column_struct()) {
ret = assert_cast<ColumnStruct*>(column.get());
} else {
throw Exception(ErrorCode::INTERNAL_ERROR,
"only support expand json object int to struct(kv pair), but given: ",
column->get_name());
}
if (!ret || ret->tuple_size() != 2) {
throw Exception(ErrorCode::INTERNAL_ERROR,
"only support expand json object int to kv pair column, but given: ",
ret->tuple_size());
}
ret->get_column(0).insert_many_from(*_object_pairs.first, _cur_offset, length);
ret->get_column(1).insert_many_from(*_object_pairs.second, _cur_offset, length);
}

int VExplodeJsonObjectTableFunction::get_value(MutableColumnPtr& column, int max_step) {
max_step = std::min(max_step, (int)(_cur_size - _cur_offset));
if (current_empty()) {
column->insert_default();
max_step = 1;
} else {
ColumnStruct* struct_column = nullptr;
if (_is_nullable) {
auto* nullable_column = assert_cast<ColumnNullable*>(column.get());
struct_column =
assert_cast<ColumnStruct*>(nullable_column->get_nested_column_ptr().get());
auto* nullmap_column =
assert_cast<ColumnUInt8*>(nullable_column->get_null_map_column_ptr().get());
// here nullmap_column insert max_step many defaults as if MAP[row_idx] is NULL
// will be not update value, _cur_size = 0, means current_empty;
// so here could insert directly
nullmap_column->insert_many_defaults(max_step);
} else {
struct_column = assert_cast<ColumnStruct*>(column.get());
}
if (!struct_column || struct_column->tuple_size() != 2) {
throw Exception(ErrorCode::INTERNAL_ERROR,
"only support expand json object int to kv pair column, but given: ",
struct_column->tuple_size());
}
struct_column->get_column(0).insert_range_from(*_object_pairs.first, _cur_offset, max_step);
struct_column->get_column(1).insert_range_from(*_object_pairs.second, _cur_offset,
max_step);
}
forward(max_step);
return max_step;
}
} // namespace doris::vectorized
59 changes: 59 additions & 0 deletions be/src/vec/exprs/table_function/vexplode_json_object.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <stddef.h>

#include "common/status.h"
#include "vec/columns/column_map.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_map.h"
#include "vec/exprs/table_function/table_function.h"
#include "vec/functions/array/function_array_utils.h"

namespace doris::vectorized {
class Block;
} // namespace doris::vectorized

namespace doris::vectorized {

// explode_json_object("{\"a\": 1, \"b\": 2}") ->
// | key | value |
// | a | 1 |
// | b | 2 |
class VExplodeJsonObjectTableFunction : public TableFunction {
ENABLE_FACTORY_CREATOR(VExplodeJsonObjectTableFunction);

public:
VExplodeJsonObjectTableFunction();

~VExplodeJsonObjectTableFunction() override = default;

Status process_init(Block* block, RuntimeState* state) override;
void process_row(size_t row_idx) override;
void process_close() override;
void get_same_many_values(MutableColumnPtr& column, int length) override;
int get_value(MutableColumnPtr& column, int max_step) override;

private:
ColumnPtr _json_object_column;
std::pair<MutableColumnPtr, MutableColumnPtr> _object_pairs; // ColumnNullable<ColumnString>
};

} // namespace doris::vectorized
14 changes: 14 additions & 0 deletions be/src/vec/functions/function_fake.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ struct FunctionExplodeMap {
static std::string get_error_msg() { return "Fake function do not support execute"; }
};

// explode json-object: expands json-object to struct with a pair of key and value in column string
struct FunctionExplodeJsonObject {
static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
DCHECK(WhichDataType(arguments[0]).is_json())
<< " explode json object " << arguments[0]->get_name() << " not supported";
DataTypes fieldTypes(2);
fieldTypes[0] = make_nullable(std::make_shared<DataTypeString>());
fieldTypes[1] = make_nullable(std::make_shared<DataTypeJsonb>());
return make_nullable(std::make_shared<vectorized::DataTypeStruct>(fieldTypes));
}
static std::string get_error_msg() { return "Fake function do not support execute"; }
};

struct FunctionEsquery {
static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
return FunctionFakeBaseImpl<DataTypeUInt8>::get_return_type_impl(arguments);
Expand Down Expand Up @@ -113,6 +126,7 @@ void register_function_fake(SimpleFunctionFactory& factory) {
register_table_function_expand_outer<FunctionExplode>(factory, "explode");
register_table_function_expand_outer<FunctionExplodeMap>(factory, "explode_map");

register_table_function_expand_outer<FunctionExplodeJsonObject>(factory, "explode_json_object");
register_table_function_expand_outer_default<DataTypeString>(factory, "explode_split");
register_table_function_expand_outer_default<DataTypeInt32>(factory, "explode_numbers");
register_table_function_expand_outer_default<DataTypeInt64>(factory, "explode_json_array_int");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeJsonArrayJsonOuter;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeJsonArrayString;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeJsonArrayStringOuter;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeJsonObject;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeJsonObjectOuter;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeMap;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeMapOuter;
import org.apache.doris.nereids.trees.expressions.functions.generator.ExplodeNumbers;
Expand All @@ -52,6 +54,8 @@ public class BuiltinTableGeneratingFunctions implements FunctionHelper {
tableGenerating(ExplodeOuter.class, "explode_outer"),
tableGenerating(ExplodeMap.class, "explode_map"),
tableGenerating(ExplodeMapOuter.class, "explode_map_outer"),
tableGenerating(ExplodeJsonObject.class, "explode_json_object"),
tableGenerating(ExplodeJsonObjectOuter.class, "explode_json_object_outer"),
tableGenerating(ExplodeNumbers.class, "explode_numbers"),
tableGenerating(ExplodeNumbersOuter.class, "explode_numbers_outer"),
tableGenerating(ExplodeBitmap.class, "explode_bitmap"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.generator;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
import org.apache.doris.nereids.trees.expressions.literal.StructLiteral;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.JsonType;
import org.apache.doris.nereids.types.StringType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* explode_json_object("{"amory":1, "doris": 2}") generate two column and two lines with:
* key column: amory, doris
* value column: 1, 2
*/
public class ExplodeJsonObject extends TableGeneratingFunction implements UnaryExpression, AlwaysNullable {

public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
FunctionSignature.ret(StructLiteral.constructStructType(
ImmutableList.of(StringType.INSTANCE, JsonType.INSTANCE))).args(JsonType.INSTANCE));

/**
* constructor with 1 argument.
*/
public ExplodeJsonObject(Expression arg) {
super("explode_json_object", arg);
}

/**
* withChildren.
*/
@Override
public ExplodeJsonObject withChildren(List<Expression> children) {
Preconditions.checkArgument(children.size() == 1);
return new ExplodeJsonObject(children.get(0));
}

@Override
public List<FunctionSignature> getSignatures() {
return SIGNATURES;
}

@Override
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
return visitor.visitExplodeJsonObject(this, context);
}
}
Loading

0 comments on commit d60dbb5

Please sign in to comment.