Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-42247: [C++] Support casting to and from utf8_view/binary_view #43302

Merged
merged 26 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
39ce7e9
type_traits.h: Add is_binary_view_like() type predicate
felipecrv Jun 22, 2024
3790b79
type.h: Add BinaryViewTypes() including STRING_VIEW/BINARY_VIEW
felipecrv Jun 22, 2024
5fd535e
scalar_cast_internal.cc: Remove stale comments from '20
felipecrv Jun 21, 2024
daf02b5
scalar_cast_string.cc: Use a more specific (positive) type predicate
felipecrv Jun 21, 2024
5410b3c
scalar_cast_string.cc: Prepare for expanding the set string casts
felipecrv Jun 21, 2024
1458b83
scalar_cast_string.cc: Register the common casts from dictionary<utf8…
felipecrv Jun 21, 2024
986bea7
scalar_cast_string.cc: Cast from (utf8|binary)_view -> fixed_size_binary
felipecrv Jun 21, 2024
e2d1177
scalar_cast_string.cc: Cast from (utf8|binary)_view -> (large)?(utf8|…
felipecrv Jun 22, 2024
2481eee
scalar_cast_string.cc: Cast from (utf8|binary)->(utf8|binary)_view
felipecrv Jun 22, 2024
8097717
scalar_cast_string.cc: Cast from fixed_size_binary -> (utf8|binary)_view
felipecrv Jun 22, 2024
d2f95a1
scalar_cast_string.cc: Cast from (utf8|binary)_view -> (utf8|binary)_…
felipecrv Jun 22, 2024
f7e64cb
scalar_cast_string.cc: Cast from Number/Decimal/Temporal to utf8_view
felipecrv Jun 22, 2024
04ebd0d
scalar_cast_boolean.cc: Cast from (utf8|binary)_view to boolean
felipecrv Jun 22, 2024
7d330d3
scalar_cast_numeric.cc: Cast (utf8|binary)_view to numeric types
felipecrv Jun 22, 2024
d59e146
scalar_cast_test.cc: Include string and binary-views types in cast-ch…
felipecrv Jun 23, 2024
4b1c035
FIX: Remove unrelated float16() casts from tests
felipecrv Jul 21, 2024
72decf7
FIX: Don't say dictionary<string_view> casts are supported yet
felipecrv Jul 21, 2024
f0043f2
Use 'Offset String' instead of 'Span'
felipecrv Aug 5, 2024
9ad5587
Mention issue 43573
felipecrv Aug 5, 2024
ea022c7
Check against overflow of fixed-size string offsets
felipecrv Aug 5, 2024
3c418d5
Improve comment listing types that have std::string_view C-type
felipecrv Aug 6, 2024
4d51841
Calculate how much data will be needed to convert view to offset string
felipecrv Aug 6, 2024
2e47828
Fix the Offset String -> String View issue
mapleFU Aug 21, 2024
f4c8d1f
Check length for Offset String -> String View
mapleFU Aug 23, 2024
b17e170
resolve comment
mapleFU Aug 23, 2024
2416d19
apply suggestions
mapleFU Sep 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion cpp/src/arrow/compute/kernels/codegen_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ struct GetViewType<Type, enable_if_has_c_type<Type>> {

template <typename Type>
struct GetViewType<Type, enable_if_t<is_base_binary_type<Type>::value ||
is_fixed_size_binary_type<Type>::value>> {
is_fixed_size_binary_type<Type>::value ||
is_binary_view_like_type<Type>::value>> {
using T = std::string_view;
using PhysicalType = T;

Expand Down Expand Up @@ -1265,6 +1266,22 @@ ArrayKernelExec GenerateVarBinary(detail::GetTypeId get_id) {
}
}

// Generate a kernel given a templated functor for binary-view types. Generates a
// single kernel for binary/string-view.
//
// See "Numeric" above for description of the generator functor
template <template <typename...> class Generator, typename Type0, typename... Args>
ArrayKernelExec GenerateVarBinaryViewBase(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::BINARY_VIEW:
case Type::STRING_VIEW:
return Generator<Type0, BinaryViewType, Args...>::Exec;
default:
DCHECK(false);
return nullptr;
}
}

// Generate a kernel given a templated functor for temporal types
//
// See "Numeric" above for description of the generator functor
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
BooleanType, ParseBooleanString>(*ty);
DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
}
for (const auto& ty : BinaryViewTypes()) {
ArrayKernelExec exec =
GenerateVarBinaryViewBase<applicator::ScalarUnaryNotNull, BooleanType,
ParseBooleanString>(*ty);
DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
}
return {func};
}

Expand Down
7 changes: 2 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,6 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type,
// ----------------------------------------------------------------------

Status UnpackDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
// TODO: is there an implementation more friendly to the "span" data structures?

DictionaryArray dict_arr(batch[0].array.ToArrayData());
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;

Expand Down Expand Up @@ -281,6 +279,8 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty
}

static bool CanCastFromDictionary(Type::type type_id) {
/// TODO(GH-43010): add is_binary_view_like() here once array_take
/// can handle string-views
return (is_primitive(type_id) || is_base_binary_like(type_id) ||
is_fixed_size_binary(type_id));
}
Expand All @@ -297,9 +297,6 @@ void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* fun
// From dictionary to this type
if (CanCastFromDictionary(out_type_id)) {
// Dictionary unpacking not implemented for boolean or nested types.
//
// XXX: Uses Take and does its own memory allocation for the moment. We can
// fix this later.
DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
UnpackDictionary, NullHandling::COMPUTED_NO_PREALLOCATE,
MemAllocation::NO_PREALLOCATE));
Expand Down
24 changes: 21 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,9 @@ struct ParseString {

template <typename O, typename I>
struct CastFunctor<
O, I, enable_if_t<(is_number_type<O>::value && is_base_binary_type<I>::value)>> {
O, I,
enable_if_t<(is_number_type<O>::value && (is_base_binary_type<I>::value ||
is_binary_view_like_type<I>::value))>> {
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
}
Expand Down Expand Up @@ -658,11 +660,15 @@ struct DecimalCastFunctor {
};

template <typename I>
struct CastFunctor<Decimal128Type, I, enable_if_t<is_base_binary_type<I>::value>>
struct CastFunctor<
Decimal128Type, I,
enable_if_t<is_base_binary_type<I>::value || is_binary_view_like_type<I>::value>>
: public DecimalCastFunctor<Decimal128Type, I> {};

template <typename I>
struct CastFunctor<Decimal256Type, I, enable_if_t<is_base_binary_type<I>::value>>
struct CastFunctor<
Decimal256Type, I,
enable_if_t<is_base_binary_type<I>::value || is_binary_view_like_type<I>::value>>
: public DecimalCastFunctor<Decimal256Type, I> {};

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -708,6 +714,10 @@ void AddCommonNumberCasts(const std::shared_ptr<DataType>& out_ty, CastFunction*
auto exec = GenerateVarBinaryBase<CastFunctor, OutType>(*in_ty);
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, exec));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, OutType>(*in_ty);
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, exec));
}
}

template <typename OutType>
Expand Down Expand Up @@ -793,6 +803,10 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
auto exec = GenerateVarBinaryBase<CastFunctor, Decimal128Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, Decimal128Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}

// Cast from other decimal
auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
Expand Down Expand Up @@ -828,6 +842,10 @@ std::shared_ptr<CastFunction> GetCastToDecimal256() {
auto exec = GenerateVarBinaryBase<CastFunctor, Decimal256Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, Decimal256Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}

// Cast from other decimal
auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
Expand Down
Loading
Loading