Skip to content

Commit

Permalink
Merge pull request #21415 from andrwng/iceberg-datatypes
Browse files Browse the repository at this point in the history
iceberg: initial data structures for logical data types
  • Loading branch information
dotnwat authored Jul 17, 2024
2 parents 4f699f2 + f4f231a commit e539d40
Show file tree
Hide file tree
Showing 10 changed files with 1,294 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/v/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,15 @@ endforeach()

v_cc_library(
NAME iceberg
SRCS ${avro_hdrs}
SRCS
${avro_hdrs}
datatypes.cc
datatypes_json.cc
json_utils.cc
DEPS
v::container
v::json
v::strings
)

add_subdirectory(tests)
263 changes: 263 additions & 0 deletions src/v/iceberg/datatypes.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
// Copyright 2024 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

#include "iceberg/datatypes.h"

#include <variant>

namespace iceberg {

struct primitive_type_comparison_visitor {
template<typename T, typename U>
bool operator()(const T&, const U&) const {
return false;
}
bool operator()(const decimal_type& lhs, const decimal_type& rhs) const {
return lhs.precision == rhs.precision && lhs.scale == rhs.scale;
}
bool operator()(const fixed_type& lhs, const fixed_type& rhs) const {
return lhs.length == rhs.length;
}
template<typename T>
bool operator()(const T&, const T&) const {
return true;
}
};

bool operator==(const primitive_type& lhs, const primitive_type& rhs) {
return std::visit(primitive_type_comparison_visitor{}, lhs, rhs);
}

struct field_type_comparison_visitor {
template<typename T, typename U>
bool operator()(const T&, const U&) const {
return false;
}
bool
operator()(const primitive_type& lhs, const primitive_type& rhs) const {
return lhs == rhs;
}
bool operator()(const struct_type& lhs, const struct_type& rhs) const {
return lhs == rhs;
}
bool operator()(const list_type& lhs, const list_type& rhs) const {
return lhs == rhs;
}
bool operator()(const map_type& lhs, const map_type& rhs) const {
return lhs == rhs;
}
};

bool operator==(const field_type& lhs, const field_type& rhs) {
return std::visit(field_type_comparison_visitor{}, lhs, rhs);
}
bool operator==(const nested_field& lhs, const nested_field& rhs) {
return lhs.id == rhs.id && lhs.required == rhs.required
&& lhs.name == rhs.name && lhs.type == rhs.type;
}

std::ostream& operator<<(std::ostream& o, const boolean_type&) {
o << "boolean";
return o;
}

std::ostream& operator<<(std::ostream& o, const int_type&) {
o << "int";
return o;
}

std::ostream& operator<<(std::ostream& o, const long_type&) {
o << "long";
return o;
}

std::ostream& operator<<(std::ostream& o, const float_type&) {
o << "float";
return o;
}

std::ostream& operator<<(std::ostream& o, const double_type&) {
o << "double";
return o;
}

std::ostream& operator<<(std::ostream& o, const decimal_type& t) {
o << fmt::format("decimal({}, {})", t.precision, t.scale);
return o;
}

std::ostream& operator<<(std::ostream& o, const date_type&) {
o << "date";
return o;
}

std::ostream& operator<<(std::ostream& o, const time_type&) {
o << "time";
return o;
}

std::ostream& operator<<(std::ostream& o, const timestamp_type&) {
o << "timestamp";
return o;
}

std::ostream& operator<<(std::ostream& o, const timestamptz_type&) {
o << "timestamptz";
return o;
}

std::ostream& operator<<(std::ostream& o, const string_type&) {
o << "string";
return o;
}

std::ostream& operator<<(std::ostream& o, const uuid_type&) {
o << "uuid";
return o;
}

std::ostream& operator<<(std::ostream& o, const fixed_type& t) {
// NOTE: square brackets to match how fixed type is serialized as JSON,
// though this matching isn't necessarily important for operator<<.
o << fmt::format("fixed[{}]", t.length);
return o;
}

std::ostream& operator<<(std::ostream& o, const binary_type&) {
o << "binary";
return o;
}

std::ostream& operator<<(std::ostream& o, const struct_type&) {
o << "struct";
return o;
}

std::ostream& operator<<(std::ostream& o, const list_type&) {
o << "list";
return o;
}

std::ostream& operator<<(std::ostream& o, const map_type&) {
o << "map";
return o;
}

bool operator==(const struct_type& lhs, const struct_type& rhs) {
if (lhs.fields.size() != rhs.fields.size()) {
return false;
}
for (size_t i = 0; i < lhs.fields.size(); i++) {
bool has_lhs = lhs.fields[i] != nullptr;
bool has_rhs = rhs.fields[i] != nullptr;
if (has_lhs != has_rhs) {
return false;
}
if (has_lhs == false) {
continue;
}
if (*lhs.fields[i] != *rhs.fields[i]) {
return false;
}
}
return true;
}
bool operator==(const list_type& lhs, const list_type& rhs) {
bool has_lhs = lhs.element_field != nullptr;
bool has_rhs = rhs.element_field != nullptr;
if (has_lhs != has_rhs) {
return false;
}
if (!has_lhs) {
// Both nullptr.
return true;
}
return *lhs.element_field == *rhs.element_field;
}
bool operator==(const map_type& lhs, const map_type& rhs) {
bool has_key_lhs = lhs.key_field != nullptr;
bool has_key_rhs = rhs.key_field != nullptr;
if (has_key_lhs != has_key_rhs) {
return false;
}
bool has_val_lhs = lhs.value_field != nullptr;
bool has_val_rhs = rhs.value_field != nullptr;
if (has_val_lhs != has_val_rhs) {
return false;
}
if (has_key_lhs && *lhs.key_field != *rhs.key_field) {
return false;
}
if (has_val_lhs && *lhs.value_field != *rhs.value_field) {
return false;
}
return true;
}

namespace {
struct ostream_visitor {
explicit ostream_visitor(std::ostream& o)
: os(o) {}
std::ostream& os;

template<typename T>
void operator()(const T& v) const {
os << v;
}
};
} // namespace

std::ostream& operator<<(std::ostream& o, const primitive_type& t) {
std::visit(ostream_visitor{o}, t);
return o;
}

std::ostream& operator<<(std::ostream& o, const field_type& t) {
std::visit(ostream_visitor{o}, t);
return o;
}

list_type list_type::create(
int32_t element_id, field_required element_required, field_type element) {
// NOTE: the element field doesn't have a name. Functionally, the list type
// is represented as:
// - element-id
// - element-type
// - element-required
// Despite the missing name though, many Iceberg implementations represent
// the list with a nested_field.
return list_type{
.element_field = nested_field::create(
element_id, "element", element_required, std::move(element))};
}

map_type map_type::create(
int32_t key_id,
field_type key_type,
int32_t val_id,
field_required val_req,
field_type val_type) {
// NOTE: the keys and values don't have names, and the key is always
// required. Functionally, a map type is represented as:
// - key-id
// - key-type
// - value-id
// - value-required
// - value-type
// Despite the missing names though, many Iceberg implementations represent
// the map with two nested_fields.
return map_type{
.key_field = nested_field::create(
key_id, "key", field_required::yes, std::move(key_type)),
.value_field = nested_field::create(
val_id, "value", val_req, std::move(val_type)),
};
}

} // namespace iceberg
Loading

0 comments on commit e539d40

Please sign in to comment.