Skip to content

Commit

Permalink
Support casting between BinaryView <--> Utf8 and LargeUtf8 (apache#6180)
Browse files Browse the repository at this point in the history
* support cast between binaryview and string

* update impl. and add benchmark

* Add unit tests for views

* Apply comments
  • Loading branch information
xinlifoobar committed Aug 8, 2024
1 parent e28cf44 commit 4bd737d
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 14 deletions.
102 changes: 88 additions & 14 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Timestamp(Millisecond, _)
| Timestamp(Microsecond, _)
| Timestamp(Nanosecond, _)
| Interval(_),
| Interval(_)
| BinaryView,
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
(BinaryView, Binary | LargeBinary) => true,
(BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),

Expand Down Expand Up @@ -1229,6 +1230,9 @@ pub fn cast_with_options(
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::<i32>()))),
BinaryView => Ok(Arc::new(
StringViewArray::from(array.as_string::<i32>()).to_binary_view(),
)),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
Expand Down Expand Up @@ -1282,6 +1286,7 @@ pub fn cast_with_options(
Date64 => parse_string_view::<Date64Type>(array, cast_options),
Binary => cast_view_to_byte::<StringViewType, GenericBinaryType<i32>>(array),
LargeBinary => cast_view_to_byte::<StringViewType, GenericBinaryType<i64>>(array),
BinaryView => Ok(Arc::new(array.as_string_view().clone().to_binary_view())),
Utf8 => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
LargeUtf8 => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
Time32(TimeUnit::Second) => parse_string_view::<Time32SecondType>(array, cast_options),
Expand Down Expand Up @@ -1339,6 +1344,13 @@ pub fn cast_with_options(
array.as_string::<i64>().clone(),
))),
Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::<i64>()))),
BinaryView => Ok(Arc::new(BinaryViewArray::from(
array
.as_string::<i64>()
.into_iter()
.map(|x| x.map(|x| x.as_bytes()))
.collect::<Vec<_>>(),
))),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
Expand Down Expand Up @@ -1417,6 +1429,20 @@ pub fn cast_with_options(
(BinaryView, LargeBinary) => {
cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
}
(BinaryView, Utf8) => {
let binary_arr = cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array)?;
cast_binary_to_string::<i32>(&binary_arr, cast_options)
}
(BinaryView, LargeUtf8) => {
let binary_arr = cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)?;
cast_binary_to_string::<i64>(&binary_arr, cast_options)
}
(BinaryView, Utf8View) => {
Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as ArrayRef)
}
(BinaryView, _) => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
(from_type, LargeUtf8) if from_type.is_primitive() => {
value_to_string::<i64>(array, cast_options)
}
Expand Down Expand Up @@ -2008,7 +2034,6 @@ pub fn cast_with_options(
})?,
))
}

(Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new(
array
.as_primitive::<Date64Type>()
Expand Down Expand Up @@ -5256,12 +5281,6 @@ mod tests {
}
}

/// Runs the generic string-to-view cast check for both offset widths:
/// `i32` and `i64`.
#[test]
fn test_string_to_view() {
_test_string_to_view::<i32>();
_test_string_to_view::<i64>();
}

const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("repeated"),
Expand All @@ -5270,6 +5289,44 @@ mod tests {
Some("repeated"),
];

/// Casting a `StringViewArray` to `BinaryView` must be reported as supported
/// and must yield an array equal to a `BinaryViewArray` built directly from
/// the same `VIEW_TEST_DATA`.
#[test]
fn test_string_view_to_binary_view() {
    let target_type = DataType::BinaryView;
    let source = StringViewArray::from_iter(VIEW_TEST_DATA);
    let expected = BinaryViewArray::from_iter(VIEW_TEST_DATA);

    // The capability check and the actual cast must agree.
    assert!(can_cast_types(source.data_type(), &target_type));

    let actual = cast(&source, &target_type).unwrap();
    assert_eq!(actual.data_type(), &target_type);
    assert_eq!(actual.as_ref(), &expected);
}

/// Casting a `BinaryViewArray` to `Utf8View` must be reported as supported
/// and must yield an array equal to a `StringViewArray` built directly from
/// the same `VIEW_TEST_DATA`.
#[test]
fn test_binary_view_to_string_view() {
    let target_type = DataType::Utf8View;
    let source = BinaryViewArray::from_iter(VIEW_TEST_DATA);
    let expected = StringViewArray::from_iter(VIEW_TEST_DATA);

    // The capability check and the actual cast must agree.
    assert!(can_cast_types(source.data_type(), &target_type));

    let actual = cast(&source, &target_type).unwrap();
    assert_eq!(actual.data_type(), &target_type);
    assert_eq!(actual.as_ref(), &expected);
}

/// Runs the generic string-to-view cast check for both offset widths:
/// `i32` and `i64`.
#[test]
fn test_string_to_view() {
_test_string_to_view::<i32>();
_test_string_to_view::<i64>();
}

fn _test_string_to_view<O>()
where
O: OffsetSizeTrait,
Expand All @@ -5281,11 +5338,22 @@ mod tests {
&DataType::Utf8View
));

assert!(can_cast_types(
string_array.data_type(),
&DataType::BinaryView
));

let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap();
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);

let binary_view_array = cast(&string_array, &DataType::BinaryView).unwrap();
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);

let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);

let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}

#[test]
Expand Down Expand Up @@ -5380,23 +5448,29 @@ mod tests {
where
O: OffsetSizeTrait,
{
let view_array = {
let string_view_array = {
let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};

let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);

let expected_string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_string_array.data_type();

assert!(can_cast_types(view_array.data_type(), expected_type));
assert!(can_cast_types(string_view_array.data_type(), expected_type));
assert!(can_cast_types(binary_view_array.data_type(), expected_type));

let string_array = cast(&view_array, expected_type).unwrap();
assert_eq!(string_array.data_type(), expected_type);
let string_view_casted_array = cast(&string_view_array, expected_type).unwrap();
assert_eq!(string_view_casted_array.data_type(), expected_type);
assert_eq!(string_view_casted_array.as_ref(), &expected_string_array);

assert_eq!(string_array.as_ref(), &expected_string_array);
let binary_view_casted_array = cast(&binary_view_array, expected_type).unwrap();
assert_eq!(binary_view_casted_array.data_type(), expected_type);
assert_eq!(binary_view_casted_array.as_ref(), &expected_string_array);
}

#[test]
Expand Down
39 changes: 39 additions & 0 deletions arrow/benches/cast_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ fn build_decimal256_array(size: usize, precision: u8, scale: i8) -> ArrayRef {
)
}

/// Builds a string array of `size` elements for the cast benchmarks.
///
/// Elements cycle with period three: a short value (`"small"`), a value
/// longer than 12 bytes, and a null.
fn build_string_array(size: usize) -> ArrayRef {
    let mut strings = StringBuilder::new();
    for index in 0..size {
        // `append_option` appends a null for `None` and the value otherwise.
        strings.append_option(match index % 3 {
            0 => Some("small"),
            1 => Some("larger string more than 12 bytes"),
            _ => None,
        });
    }
    Arc::new(strings.finish())
}

fn build_dict_array(size: usize) -> ArrayRef {
let values = StringArray::from_iter([
Some("small"),
Expand Down Expand Up @@ -148,9 +160,12 @@ fn add_benchmark(c: &mut Criterion) {

let decimal128_array = build_decimal128_array(512, 10, 3);
let decimal256_array = build_decimal256_array(512, 50, 3);
let string_array = build_string_array(512);
let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap();

let dict_array = build_dict_array(10_000);
let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap();
let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap();

c.bench_function("cast int32 to int32 512", |b| {
b.iter(|| cast_array(&i32_array, DataType::Int32))
Expand Down Expand Up @@ -262,6 +277,30 @@ fn add_benchmark(c: &mut Criterion) {
)
})
});
c.bench_function("cast string view to string", |b| {
b.iter(|| cast_array(&string_view_array, DataType::Utf8))
});
c.bench_function("cast string view to wide string", |b| {
b.iter(|| cast_array(&string_view_array, DataType::LargeUtf8))
});
c.bench_function("cast binary view to string", |b| {
b.iter(|| cast_array(&binary_view_array, DataType::Utf8))
});
c.bench_function("cast binary view to wide string", |b| {
b.iter(|| cast_array(&binary_view_array, DataType::LargeUtf8))
});
c.bench_function("cast string to binary view 512", |b| {
b.iter(|| cast_array(&string_array, DataType::BinaryView))
});
c.bench_function("cast wide string to binary view 512", |b| {
b.iter(|| cast_array(&wide_string_array, DataType::BinaryView))
});
c.bench_function("cast string view to binary view", |b| {
b.iter(|| cast_array(&string_view_array, DataType::BinaryView))
});
c.bench_function("cast binary view to string view", |b| {
b.iter(|| cast_array(&binary_view_array, DataType::Utf8View))
});
}

criterion_group!(benches, add_benchmark);
Expand Down

0 comments on commit 4bd737d

Please sign in to comment.