Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-261: Refactor String/Binary code paths to reflect unnested (non-list-based) structure #176

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,6 @@ set(ARROW_SRCS

src/arrow/types/construct.cc
src/arrow/types/decimal.cc
src/arrow/types/json.cc
src/arrow/types/list.cc
src/arrow/types/primitive.cc
src/arrow/types/string.cc
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder {

// Creates new array object to hold the contents of the builder and transfers
// ownership of the data. This resets all variables on the builder.
virtual std::shared_ptr<Array> Finish() = 0;
virtual Status Finish(std::shared_ptr<Array>* out) = 0;

const std::shared_ptr<DataType>& type() const { return type_; }

Expand Down
47 changes: 23 additions & 24 deletions cpp/src/arrow/ipc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,22 +78,6 @@ static bool IsPrimitive(const DataType* type) {
}
}

static bool IsListType(const DataType* type) {
DCHECK(type != nullptr);
switch (type->type) {
// TODO(emkornfield) grouping like this are used in a few places in the
// code consider using pattern like:
// http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type
//
case Type::BINARY:
case Type::LIST:
case Type::STRING:
return true;
default:
return false;
}
}

// ----------------------------------------------------------------------
// Record batch write path

Expand All @@ -115,7 +99,11 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
if (IsPrimitive(arr_type)) {
const auto prim_arr = static_cast<const PrimitiveArray*>(arr);
buffers->push_back(prim_arr->data());
} else if (IsListType(arr_type)) {
} else if (arr->type_enum() == Type::STRING || arr->type_enum() == Type::BINARY) {
const auto binary_arr = static_cast<const BinaryArray*>(arr);
buffers->push_back(binary_arr->offsets());
buffers->push_back(binary_arr->data());
} else if (arr->type_enum() == Type::LIST) {
const auto list_arr = static_cast<const ListArray*>(arr);
buffers->push_back(list_arr->offset_buffer());
RETURN_NOT_OK(VisitArray(
Expand Down Expand Up @@ -331,9 +319,21 @@ class RecordBatchReader::RecordBatchReaderImpl {
}
return MakePrimitiveArray(
type, field_meta.length, data, field_meta.null_count, null_bitmap, out);
}
} else if (type->type == Type::STRING || type->type == Type::BINARY) {
std::shared_ptr<Buffer> offsets;
std::shared_ptr<Buffer> values;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
RETURN_NOT_OK(GetBuffer(buffer_index_++, &values));

if (IsListType(type.get())) {
if (type->type == Type::STRING) {
*out = std::make_shared<StringArray>(
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
} else {
*out = std::make_shared<BinaryArray>(
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
}
return Status::OK();
} else if (type->type == Type::LIST) {
std::shared_ptr<Buffer> offsets;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
const int num_children = type->num_children();
Expand All @@ -346,11 +346,10 @@ class RecordBatchReader::RecordBatchReaderImpl {
std::shared_ptr<Array> values_array;
RETURN_NOT_OK(
NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array));
return MakeListArray(type, field_meta.length, offsets, values_array,
field_meta.null_count, null_bitmap, out);
}

if (type->type == Type::STRUCT) {
*out = std::make_shared<ListArray>(type, field_meta.length, offsets, values_array,
field_meta.null_count, null_bitmap);
return Status::OK();
} else if (type->type == Type::STRUCT) {
const int num_children = type->num_children();
std::vector<ArrayPtr> fields;
fields.reserve(num_children);
Expand Down
19 changes: 8 additions & 11 deletions cpp/src/arrow/ipc/test-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared<ListType>(kInt32);
const auto kListListInt32 = std::make_shared<ListType>(kListInt32);

Status MakeRandomInt32Array(
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
std::shared_ptr<PoolBuffer> data;
test::MakeRandomInt32PoolBuffer(length, pool, &data);
const auto kInt32 = std::make_shared<Int32Type>();
Expand All @@ -52,16 +52,14 @@ Status MakeRandomInt32Array(
test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes);
RETURN_NOT_OK(builder.Append(
reinterpret_cast<const int32_t*>(data->data()), length, valid_bytes->data()));
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}
RETURN_NOT_OK(builder.Append(reinterpret_cast<const int32_t*>(data->data()), length));
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}

Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
// Create the null list values
std::vector<uint8_t> valid_lists(num_lists);
const double null_percent = include_nulls ? 0.1 : 0;
Expand Down Expand Up @@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
}
ListBuilder builder(pool, child_array);
RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
*array = builder.Finish();
return (*array)->Validate();
RETURN_NOT_OK(builder.Finish(out));
return (*out)->Validate();
}

typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);
Expand All @@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) {

template <class Builder, class RawType>
Status MakeRandomBinaryArray(
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) {
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) {
const std::vector<std::string> values = {
"", "", "abc", "123", "efg", "456!@#!@#", "12312"};
Builder builder(pool, type);
Expand All @@ -130,8 +128,7 @@ Status MakeRandomBinaryArray(
builder.Append(reinterpret_cast<const RawType*>(value.data()), value.size()));
}
}
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}

Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
Expand Down
20 changes: 4 additions & 16 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType<DoubleType> {
struct ARROW_EXPORT ListType : public DataType {
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
: ListType(value_type, Type::LIST) {}
: ListType(std::make_shared<Field>("item", value_type)) {}

explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) {
children_ = {value_field};
Expand All @@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType {
static char const* name() { return "list"; }

std::string ToString() const override;

protected:
// Constructor for classes that are implemented as List Arrays.
ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
: DataType(logical_type) {
// TODO ARROW-187 this can technically fail, make a constructor method ?
children_ = {std::make_shared<Field>("item", value_type)};
}
};

// BinaryType type is reprsents lists of 1-byte values.
struct ARROW_EXPORT BinaryType : public ListType {
struct ARROW_EXPORT BinaryType : public DataType {
BinaryType() : BinaryType(Type::BINARY) {}
static char const* name() { return "binary"; }
std::string ToString() const override;

protected:
// Allow subclasses to change the logical type.
explicit BinaryType(Type::type logical_type)
: ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {}
explicit BinaryType(Type::type logical_type) : DataType(logical_type) {}
};

// UTF encoded strings
Expand All @@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType {
static char const* name() { return "string"; }

std::string ToString() const override;

protected:
explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
};

struct ARROW_EXPORT StructType : public DataType {
Expand All @@ -300,7 +288,7 @@ struct ARROW_EXPORT StructType : public DataType {

// These will be defined elsewhere
template <typename T>
struct type_traits {};
struct TypeTraits {};

} // namespace arrow

Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/types/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ install(FILES
construct.h
datetime.h
decimal.h
json.h
list.h
primitive.h
string.h
Expand Down
28 changes: 0 additions & 28 deletions cpp/src/arrow/types/binary.h

This file was deleted.

31 changes: 3 additions & 28 deletions cpp/src/arrow/types/construct.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(DOUBLE, DoubleBuilder);

BUILDER_CASE(STRING, StringBuilder);
BUILDER_CASE(BINARY, BinaryBuilder);

case Type::LIST: {
std::shared_ptr<ArrayBuilder> value_builder;
Expand Down Expand Up @@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array);
MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array);
MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray);
MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);
default:
return Status::NotImplemented(type->ToString());
Expand All @@ -120,30 +121,4 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
#endif
}

Status MakeListArray(const TypePtr& type, int32_t length,
const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count,
const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) {
switch (type->type) {
case Type::BINARY:
out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap));
break;

case Type::LIST:
out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
break;

case Type::DECIMAL_TEXT:
case Type::STRING:
out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
break;
default:
return Status::NotImplemented(type->ToString());
}
#ifdef NDEBUG
return Status::OK();
#else
return (*out)->Validate();
#endif
}

} // namespace arrow
8 changes: 0 additions & 8 deletions cpp/src/arrow/types/construct.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,6 @@ Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count,
const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out);

// Create new list arrays for logical types that are backed by ListArrays (e.g. list of
// primitives and strings)
// TODO(emkornfield) split up string vs list?
Status ARROW_EXPORT MakeListArray(const std::shared_ptr<DataType>& type, int32_t length,
const std::shared_ptr<Buffer>& offests, const std::shared_ptr<Array>& values,
int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap,
std::shared_ptr<Array>* out);

} // namespace arrow

#endif // ARROW_BUILDER_H_
37 changes: 0 additions & 37 deletions cpp/src/arrow/types/json.cc

This file was deleted.

36 changes: 0 additions & 36 deletions cpp/src/arrow/types/json.h

This file was deleted.

Loading