Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FLASH-37] Json support #159

Merged
merged 21 commits into from
Aug 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion dbms/src/Storages/Transaction/Codec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <DataTypes/DataTypeDecimal.h>
#include <Storages/Transaction/TiDB.h>
#include <Storages/Transaction/TiKVVarInt.h>
#include <Storages/Transaction/JSONCodec.h>

namespace DB
{
Expand Down Expand Up @@ -290,9 +291,11 @@ Field DecodeDatum(size_t & cursor, const String & raw_value)
case TiDB::CodecFlagVarInt:
return DecodeVarInt(cursor, raw_value);
case TiDB::CodecFlagDuration:
throw Exception("Not implented yet. DecodeDatum: CodecFlagDuration", ErrorCodes::LOGICAL_ERROR);
throw Exception("Not implemented yet. DecodeDatum: CodecFlagDuration", ErrorCodes::LOGICAL_ERROR);
case TiDB::CodecFlagDecimal:
return DecodeDecimal(cursor, raw_value);
case TiDB::CodecFlagJson:
return DecodeJsonAsBinary(cursor, raw_value);
default:
throw Exception("Unknown Type:" + std::to_string(raw_value[cursor - 1]), ErrorCodes::LOGICAL_ERROR);
}
Expand Down Expand Up @@ -330,6 +333,9 @@ void SkipDatum(size_t & cursor, const String & raw_value)
case TiDB::CodecFlagDecimal:
SkipDecimal(cursor, raw_value);
return;
case TiDB::CodecFlagJson:
SkipJson(cursor, raw_value);
return;
default:
throw Exception("Unknown Type:" + std::to_string(raw_value[cursor - 1]), ErrorCodes::LOGICAL_ERROR);
}
Expand Down
276 changes: 276 additions & 0 deletions dbms/src/Storages/Transaction/JSONCodec.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
#include <Storages/Transaction/Codec.h>
#include <Storages/Transaction/JSONCodec.h>

#include <cstring>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <Poco/JSON/Array.h>
#include <Poco/JSON/Object.h>
#pragma GCC diagnostic pop

/**
* https://github.com/pingcap/tidb/blob/release-3.0/types/json/binary.go
The binary JSON format from MySQL 5.7 is as follows:
JSON doc ::= type value
type ::=
0x01 | // large JSON object
0x03 | // large JSON array
0x04 | // literal (true/false/null)
0x05 | // int16
0x06 | // uint16
0x07 | // int32
0x08 | // uint32
0x09 | // int64
0x0a | // uint64
0x0b | // double
0x0c | // utf8mb4 string
value ::=
object |
array |
literal |
number |
string |
object ::= element-count size key-entry* value-entry* key* value*
array ::= element-count size value-entry* value*
// number of members in object or number of elements in array
element-count ::= uint32
// number of bytes in the binary representation of the object or array
size ::= uint32
key-entry ::= key-offset key-length
key-offset ::= uint32
key-length ::= uint16 // key length must be less than 64KB
value-entry ::= type offset-or-inlined-value
// This field holds either the offset to where the value is stored,
// or the value itself if it is small enough to be inlined (that is,
// if it is a JSON literal or a small enough [u]int).
offset-or-inlined-value ::= uint32
key ::= utf8mb4-data
literal ::=
0x00 | // JSON null literal
0x01 | // JSON true literal
0x02 | // JSON false literal
number ::= .... // little-endian format for [u]int(16|32|64), whereas
// double is stored in a platform-independent, eight-byte
// format using float8store()
string ::= data-length utf8mb4-data
data-length ::= uint8* // If the high bit of a byte is 1, the length
// field is continued in the next byte,
// otherwise it is the last byte of the length
// field. So we need 1 byte to represent
// lengths up to 127, 2 bytes to represent
// lengths up to 16383, and so on...
*/
namespace DB
{

namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}

// Dynamically typed holder used for every decoded JSON value.
using JsonVar = Poco::Dynamic::Var;

// Type codes: the first byte of a JSON doc / of a value-entry (see the grammar above).
constexpr UInt8 TYPE_CODE_OBJECT = 0x01; // TypeCodeObject indicates the JSON is an object.
constexpr UInt8 TYPE_CODE_ARRAY = 0x03; // TypeCodeArray indicates the JSON is an array.
constexpr UInt8 TYPE_CODE_LITERAL = 0x04; // TypeCodeLiteral indicates the JSON is a literal.
constexpr UInt8 TYPE_CODE_INT64 = 0x09; // TypeCodeInt64 indicates the JSON is a signed integer.
constexpr UInt8 TYPE_CODE_UINT64 = 0x0a; // TypeCodeUint64 indicates the JSON is an unsigned integer.
constexpr UInt8 TYPE_CODE_FLOAT64 = 0x0b; // TypeCodeFloat64 indicates the JSON is a double float number.
constexpr UInt8 TYPE_CODE_STRING = 0x0c; // TypeCodeString indicates the JSON is a string.
// Literal codes: the single payload byte of a TYPE_CODE_LITERAL value.
constexpr UInt8 LITERAL_NIL = 0x00; // LiteralNil represents JSON null.
constexpr UInt8 LITERAL_TRUE = 0x01; // LiteralTrue represents JSON true.
constexpr UInt8 LITERAL_FALSE = 0x02; // LiteralFalse represents JSON false.

// value-entry ::= type (1 byte) + offset-or-inlined-value (uint32) => 5 bytes.
constexpr size_t VALUE_ENTRY_SIZE = 5;
// key-entry ::= key-offset (uint32) + key-length (uint16) => 6 bytes.
constexpr size_t KEY_ENTRY_LENGTH = 6;
// Object/array prefix: element-count (uint32) + size (uint32) => 8 bytes.
constexpr size_t PREFIX_LENGTH = 8;

// Reference-counted Poco containers produced by decodeArray / decodeObject.
using JsonArrayPtr = Poco::JSON::Array::Ptr;
using JsonObjectPtr = Poco::JSON::Object::Ptr;

// Cursor-based decoders: each one reads the value that starts at raw_value[cursor]
// (the type byte, if any, has already been consumed by the caller) and advances
// `cursor` past what it read.
JsonArrayPtr decodeArray(size_t & cursor, const String & raw_value);
JsonObjectPtr decodeObject(size_t & cursor, const String & raw_value);
inline JsonVar decodeLiteral(size_t & cursor, const String & raw_value);
inline String decodeString(size_t & cursor, const String & raw_value);
JsonVar decodeValue(UInt8 type, size_t & cursor, const String & raw_value);

// Offset-based decoders: `base` is passed by value, so the caller's position does not move.
// For decodeValueEntry, `value_entry_offset` is the position of the value-entry itself,
// relative to `base` (it is NOT the offset of the value the entry points to).
JsonVar decodeValueEntry(size_t base, const String & raw_value, size_t value_entry_offset);
inline String decodeString(size_t base, const String & raw_value, size_t length);

template <typename T>
inline T decodeNumeric(size_t & cursor, const String & raw_value)
{
T res = *(reinterpret_cast<const T *>(raw_value.data() + cursor));
cursor += sizeof(T);
return res;
}

/// Decodes a binary JSON object into a Poco JSON object.
///
/// On entry `cursor` points at the element-count field (the type byte has already been
/// consumed). Layout: element-count size key-entry* value-entry* key* value*.
/// On return `cursor` points just past the object, i.e. `size` bytes after the
/// element-count field.
///
/// Throws Exception(LOGICAL_ERROR) if the stored size is smaller than the 8-byte prefix,
/// and propagates whatever the nested decoders throw.
JsonObjectPtr decodeObject(size_t & cursor, const String & raw_value)
{
    const UInt32 element_count = decodeNumeric<UInt32>(cursor, raw_value);
    const size_t size = decodeNumeric<UInt32>(cursor, raw_value);
    // `size` counts the prefix as well; anything smaller would make the final cursor
    // adjustment wrap around.
    if (size < PREFIX_LENGTH)
        throw Exception("decodeObject: Invalid JSON object size:" + std::to_string(size), ErrorCodes::LOGICAL_ERROR);

    // `base` is the first byte after the prefix. Offsets stored in the entries are counted
    // from the start of the prefix, hence the "- PREFIX_LENGTH" whenever one is resolved.
    const size_t base = cursor;
    // Value entries are laid out right after all key entries.
    const size_t value_entries_offset = element_count * KEY_ENTRY_LENGTH;
    JsonObjectPtr obj_ptr = new Poco::JSON::Object();

    for (UInt32 i = 0; i < element_count; i++)
    {
        // key-entry ::= key-offset (uint32) key-length (uint16); the offset points directly
        // at the key bytes, the length comes from the entry rather than from a length prefix.
        size_t entry_base = base + i * KEY_ENTRY_LENGTH;
        const size_t key_offset = base + decodeNumeric<UInt32>(entry_base, raw_value) - PREFIX_LENGTH;
        const size_t key_length = decodeNumeric<UInt16>(entry_base, raw_value);
        String key = decodeString(key_offset, raw_value, key_length);

        JsonVar val = decodeValueEntry(base, raw_value, value_entries_offset + i * VALUE_ENTRY_SIZE);
        obj_ptr->set(key, val);
    }
    // The prefix has already been consumed by the two reads above.
    cursor = base + (size - PREFIX_LENGTH);

    return obj_ptr;
}

/// Decodes a binary JSON array into a Poco JSON array.
///
/// On entry `cursor` points at the element-count field (the type byte has already been
/// consumed). Layout: element-count size value-entry* value*.
/// On return `cursor` points just past the array, i.e. `size` bytes after the
/// element-count field.
///
/// Throws Exception(LOGICAL_ERROR) if the stored size is smaller than the 8-byte prefix,
/// and propagates whatever the nested decoders throw.
JsonArrayPtr decodeArray(size_t & cursor, const String & raw_value)
{
    const UInt32 element_count = decodeNumeric<UInt32>(cursor, raw_value);
    const size_t size = decodeNumeric<UInt32>(cursor, raw_value);
    // `size` counts the prefix as well; anything smaller would make the final cursor
    // adjustment wrap around.
    if (size < PREFIX_LENGTH)
        throw Exception("decodeArray: Invalid JSON array size:" + std::to_string(size), ErrorCodes::LOGICAL_ERROR);

    // First byte after the prefix: the value entries start here.
    const size_t base = cursor;

    JsonArrayPtr array_ptr = new Poco::JSON::Array();
    for (UInt32 i = 0; i < element_count; i++)
    {
        JsonVar val = decodeValueEntry(base, raw_value, VALUE_ENTRY_SIZE * i);
        array_ptr->add(val);
    }
    // The prefix has already been consumed by the two reads above.
    cursor = base + (size - PREFIX_LENGTH);
    return array_ptr;
}

/// Decodes the value described by the value-entry located at `base + value_entry_offset`.
/// A value-entry is one type byte followed by a uint32 that is either the inlined literal
/// or the offset of the value; stored offsets are rebased by PREFIX_LENGTH against `base`.
/// `base` is taken by value, so the caller's position is not modified.
JsonVar decodeValueEntry(size_t base, const String & raw_value, size_t value_entry_offset)
{
    size_t entry_cursor = base + value_entry_offset;
    const UInt8 type_code = raw_value[entry_cursor];
    ++entry_cursor; // now at the offset-or-inlined-value field

    if (type_code != TYPE_CODE_LITERAL)
    {
        // Not inlined: follow the stored offset to the actual value.
        const UInt32 stored_offset = decodeNumeric<UInt32>(entry_cursor, raw_value);
        size_t value_cursor = base + stored_offset - PREFIX_LENGTH;
        return decodeValue(type_code, value_cursor, raw_value);
    }

    // Literals live directly inside the entry.
    return decodeLiteral(entry_cursor, raw_value);
}

/// Dispatches on the JSON type code and decodes the value that starts at `cursor`
/// (the type byte itself is not read here). `cursor` is advanced by the selected decoder.
/// Throws Exception(LOGICAL_ERROR) for a type code that is not handled.
JsonVar decodeValue(UInt8 type, size_t & cursor, const String & raw_value)
{
    // Containers.
    if (type == TYPE_CODE_OBJECT)
        return decodeObject(cursor, raw_value);
    if (type == TYPE_CODE_ARRAY)
        return decodeArray(cursor, raw_value);

    // Scalars.
    if (type == TYPE_CODE_LITERAL)
        return decodeLiteral(cursor, raw_value);
    if (type == TYPE_CODE_INT64)
        return JsonVar(decodeNumeric<Int64>(cursor, raw_value));
    if (type == TYPE_CODE_UINT64)
        return JsonVar(decodeNumeric<UInt64>(cursor, raw_value));
    if (type == TYPE_CODE_FLOAT64)
        return JsonVar(decodeNumeric<Float64>(cursor, raw_value));
    if (type == TYPE_CODE_STRING)
        return JsonVar(decodeString(cursor, raw_value));

    throw Exception("decodeValue: Unknown JSON Element Type:" + std::to_string(type), ErrorCodes::LOGICAL_ERROR);
}

/// Decodes the one-byte literal at raw_value[cursor] and advances `cursor` by one
/// (also when the byte turns out to be invalid).
/// null -> empty JsonVar, true/false -> boolean JsonVar.
/// Throws Exception(LOGICAL_ERROR) for any other byte.
inline JsonVar decodeLiteral(size_t & cursor, const String & raw_value)
{
    const UInt8 literal = raw_value[cursor];
    ++cursor;

    if (literal == LITERAL_NIL)
        return JsonVar();
    if (literal == LITERAL_TRUE)
        return JsonVar(true);
    if (literal == LITERAL_FALSE)
        return JsonVar(false);

    throw Exception("decodeLiteral: Unknown JSON Literal Type:" + std::to_string(literal), ErrorCodes::LOGICAL_ERROR);
}

/// Copies `length` bytes of raw_value starting at absolute position `base`.
/// No length prefix is read; the caller supplies the length.
inline String decodeString(size_t base, const String & raw_value, size_t length)
{
    return raw_value.substr(base, length);
}

/// Decodes a length-prefixed string: a variable-length unsigned integer (read with
/// DecodeVarUInt) followed by that many bytes of data. `cursor` ends just past the data.
inline String decodeString(size_t & cursor, const String & raw_value)
{
    const size_t data_length = DecodeVarUInt(cursor, raw_value);
    // Copy first, advance afterwards, so `cursor` is unchanged if the copy throws.
    String data = raw_value.substr(cursor, data_length);
    cursor += data_length;
    return data;
}

/// Decodes the binary JSON document starting at raw_value[cursor] (type byte first) and
/// returns it as a String. `cursor` is advanced by the decoders that are invoked.
/// The decoded value is a JsonVar; the returned String is produced by JsonVar's
/// implicit conversion.
String DecodeJsonAsString(size_t & cursor, const String & raw_value)
{
    const UInt8 root_type = raw_value[cursor];
    ++cursor; // step over the type byte
    return decodeValue(root_type, cursor, raw_value);
}

/// Maps the `doDecode` flag of DecodeJson to its return type:
/// need_decode<true>::type is String, need_decode<false>::type is void.
template <bool doDecode>
struct need_decode
{
};

template <>
struct need_decode<true>
{
    using type = String;
};

template <>
struct need_decode<false>
{
    using type = void;
};

/// Measures the binary JSON value that starts at raw_value[cursor] (type byte first) and
/// moves `cursor` just past it.
///
/// With doDecode == true the raw bytes of the value, including the leading type byte, are
/// returned; with doDecode == false nothing is returned and no copy is made.
///
/// Throws Exception(LOGICAL_ERROR) for an unknown type code or when the encoded length
/// does not fit into raw_value; `cursor` is not guaranteed to be restored in that case.
template <bool doDecode>
typename need_decode<doDecode>::type DecodeJson(size_t & cursor, const String & raw_value)
{
    const size_t base = cursor;
    if (base >= raw_value.size())
        throw Exception("DecodeJsonBinary: Cursor " + std::to_string(base) + " is out of range", ErrorCodes::LOGICAL_ERROR);

    const UInt8 type = raw_value[cursor++];
    const size_t body_start = cursor;                        // first byte after the type code
    const size_t available = raw_value.size() - body_start;  // bytes left for the body
    size_t body_size = 0;                                    // bytes occupied after the type code

    switch (type) // JSON Root element type
    {
        case TYPE_CODE_OBJECT:
        case TYPE_CODE_ARRAY:
            // Prefix is element-count (uint32) + size (uint32); `size` covers the whole
            // container starting at element-count.
            if (available < PREFIX_LENGTH)
                throw Exception("DecodeJsonBinary: Truncated JSON object/array header", ErrorCodes::LOGICAL_ERROR);
            cursor += sizeof(UInt32); // skip element-count
            body_size = decodeNumeric<UInt32>(cursor, raw_value);
            if (body_size < PREFIX_LENGTH)
                throw Exception("DecodeJsonBinary: Invalid JSON object/array size:" + std::to_string(body_size),
                    ErrorCodes::LOGICAL_ERROR);
            break;
        case TYPE_CODE_LITERAL:
            body_size = 1;
            break;
        case TYPE_CODE_INT64:
        case TYPE_CODE_UINT64:
        case TYPE_CODE_FLOAT64:
            body_size = 8;
            break;
        case TYPE_CODE_STRING:
        {
            const size_t data_length = DecodeVarUInt(cursor, raw_value);
            // Checked here, before the addition below, so a bogus length cannot overflow.
            if (cursor > raw_value.size() || data_length > raw_value.size() - cursor)
                throw Exception("DecodeJsonBinary: Truncated JSON string", ErrorCodes::LOGICAL_ERROR);
            // Length prefix bytes + string data.
            body_size = (cursor - body_start) + data_length;
            break;
        }
        default:
            throw Exception("DecodeJsonBinary: Unknown JSON Element Type:" + std::to_string(type), ErrorCodes::LOGICAL_ERROR);
    }

    if (body_size > available)
        throw Exception("DecodeJsonBinary: Truncated JSON value", ErrorCodes::LOGICAL_ERROR);

    const size_t size = body_size + 1; // include the type byte
    cursor = base + size;

    // Only the decoding instantiation materializes the bytes; the skipping one returns void.
    if constexpr (doDecode)
        return raw_value.substr(base, size);
}

/// Moves `cursor` past the binary JSON value starting at raw_value[cursor]
/// without returning its bytes.
void SkipJson(size_t & cursor, const String & raw_value)
{
    return DecodeJson<false>(cursor, raw_value);
}

/// Returns the raw bytes (type byte included) of the binary JSON value starting at
/// raw_value[cursor] and moves `cursor` past it.
String DecodeJsonAsBinary(size_t & cursor, const String & raw_value)
{
    String binary = DecodeJson<true>(cursor, raw_value);
    return binary;
}

} // namespace DB
12 changes: 12 additions & 0 deletions dbms/src/Storages/Transaction/JSONCodec.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <Core/Types.h>

namespace DB
{

/// Advances `cursor` past the binary JSON value that starts at raw_value[cursor]
/// (type byte first) without returning it.
/// Throws Exception(LOGICAL_ERROR) on an unknown JSON type code.
void SkipJson(size_t & cursor, const String & raw_value);
/// Returns the raw encoded bytes, including the leading type byte, of the binary JSON
/// value that starts at raw_value[cursor], and advances `cursor` past it.
/// Throws Exception(LOGICAL_ERROR) on an unknown JSON type code.
String DecodeJsonAsBinary(size_t & cursor, const String & raw_value);
/// Decodes the binary JSON value that starts at raw_value[cursor] (type byte first) into
/// a Poco dynamic value and returns that value converted to String.
/// Throws Exception(LOGICAL_ERROR) on an unknown JSON type or literal code.
String DecodeJsonAsString(size_t & cursor, const String & raw_value);

} // namespace DB