diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index b870b61d64f371..91ed4dbb4135c9 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -73,19 +73,59 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { auto result = check_column_const_set_readability(column, row_num); ColumnPtr ptr = result.first; row_num = result.second; + const auto& value = assert_cast<const ColumnType&>(*ptr).get_data_at(row_num); if (_nesting_level > 1) { bw.write('"'); } - - const auto& value = assert_cast<const ColumnType&>(*ptr).get_data_at(row_num); - bw.write(value.data, value.size); + if constexpr (std::is_same_v<ColumnType, ColumnString>) { + if (options.escape_char != 0) { + // we should deal with some special characters in the json str if we have an escape_char + StringRef str_ref = value; + write_with_escaped_char_to_json(str_ref, bw); + } else { + bw.write(value.data, value.size); + } + } else { + bw.write(value.data, value.size); + } if (_nesting_level > 1) { bw.write('"'); } + return Status::OK(); } + inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) const { + for (char it : value) { + switch (it) { + case '\b': + bw.write("\\b", 2); + break; + case '\f': + bw.write("\\f", 2); + break; + case '\n': + bw.write("\\n", 2); + break; + case '\r': + bw.write("\\r", 2); + break; + case '\t': + bw.write("\\t", 2); + break; + case '\\': + bw.write("\\\\", 2); + break; + case '"': + bw.write("\\\"", 2); + break; + default: + bw.write(it); + } + } + } + Status serialize_column_to_json(const IColumn& column, int start_idx, int end_idx, BufferWritable& bw, FormatOptions& options) const override { SERIALIZE_COLUMN_TO_JSON(); diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 17250e10fd77d0..d4b21aacc5c148 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -766,6 +766,7 @@ struct
ConvertImplGenericToJsonb { auto tmp_col = ColumnString::create(); vectorized::DataTypeSerDe::FormatOptions options; + options.escape_char = '\\'; for (size_t i = 0; i < input_rows_count; i++) { // convert to string tmp_col->clear(); diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv b/regression-test/data/jsonb_p0/test_jsonb_cast.csv new file mode 100644 index 00000000000000..08b694ddea822f --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv @@ -0,0 +1,4 @@ +1 \N +2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}'] +3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out new file mode 100644 index 00000000000000..2ab4174c746d6a --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -0,0 +1,27 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_1 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] + +-- !select_2 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] +27 ["{"k1":"v1", "k2": 200}"] +28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"] +29 [" \n\r", " \n\r"] +30 ["f\r\n", "f\r\n""] + +-- !select_json -- +1 \N +2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] +3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] +4 ["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] +28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] +29 ["\\f\\n\\r","\\f\\n\\r"] +30 ["f\\b\\r\\n","f\\b\\r\\n\\""] + diff --git a/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy new file mode 100644 index 00000000000000..4d1b2aa7181923 --- /dev/null +++ b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_jsonb_cast", "p0") { + + // define a sql table with an array column that has some escape characters and should also cast to json + def testTable = "tbl_test_array_text_cast_jsonb" + def dataFile = "test_jsonb_cast.csv" + + sql """ set experimental_enable_nereids_planner = true """ + sql """ set enable_fallback_to_original_planner = true """ + + sql "DROP TABLE IF EXISTS ${testTable}" + + sql """ + CREATE TABLE IF NOT EXISTS ${testTable} ( + id INT, + a ARRAY<TEXT> + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // load the jsonb data from csv file + streamLoad { + table testTable + + file dataFile // import csv file + time 10000 // limit inflight 10s + set 'strict_mode', 'true' + + // if declared a check callback, the default check condition will ignore.
+ // So you must check all condition + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals(4, json.NumberTotalRows) + assertEquals(4, json.NumberLoadedRows) + assertTrue(json.LoadBytes > 0) + } + } + + sql """ sync; """ + + // check result + qt_select_1 "SELECT * FROM ${testTable} ORDER BY id" + + + // insert into valid json rows + sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])""" + sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}'])""" + sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])""" + sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n", '\\f\\b\\r\\n"'])""" + + // check result + qt_select_2 "SELECT * FROM ${testTable} ORDER BY id" + // check cast as json + qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id" +} \ No newline at end of file