Skip to content

Commit

Permalink
feat: support cast decimal to utf8 (pola-rs#13829)
Browse files Browse the repository at this point in the history
  • Loading branch information
flisky authored and r-brink committed Jan 24, 2024
1 parent 24756e4 commit d927cea
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 120 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ hashbrown = { version = "0.14", features = ["rayon", "ahash"] }
hex = "0.4.3"
indexmap = { version = "2", features = ["std"] }
itoa = "1.0.6"
itoap = { version = "1", features = ["simd"] }
atoi_simd = "0.15.5"
fast-float = { version = "0.2" }
memchr = "2.6"
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ ethnum = { workspace = true }
atoi_simd = { workspace = true, optional = true }
fast-float = { workspace = true, optional = true }
itoa = { workspace = true, optional = true }
itoap = { workspace = true, optional = true }
ryu = { workspace = true, optional = true }

regex = { workspace = true, optional = true }
Expand Down Expand Up @@ -161,7 +162,7 @@ simd = []
# polars-arrow
timezones = []
dtype-array = []
dtype-decimal = ["atoi"]
dtype-decimal = ["atoi", "itoap"]
bigidx = []
nightly = []
performant = []
Expand Down
25 changes: 25 additions & 0 deletions crates/polars-arrow/src/compute/cast/decimal_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,28 @@ where
let from = from.as_any().downcast_ref().unwrap();
Ok(Box::new(decimal_to_integer::<T>(from)))
}

/// Casts a decimal [`PrimitiveArray<i128>`] into a [`Utf8ViewArray`] holding the textual
/// representation of each value.
///
/// The scale is read from the array's logical data type and every slot of the values buffer
/// is rendered through `format_decimal` without trimming trailing zeros. Slots are formatted
/// regardless of validity; the validity of `from` is attached to the result afterwards.
///
/// # Panics
/// Panics if the logical data type of `from` is not `ArrowDataType::Decimal`.
#[cfg(feature = "dtype-decimal")]
pub(super) fn decimal_to_utf8view(from: &PrimitiveArray<i128>) -> Utf8ViewArray {
    // Only the scale influences the textual form; the precision is not needed here.
    let from_scale = match from.data_type().to_logical_type() {
        ArrowDataType::Decimal(_, scale) => *scale,
        _ => panic!("internal error: i128 is always a decimal"),
    };

    let mut mutable = MutableBinaryViewArray::with_capacity(from.len());

    from.values().iter().for_each(|&x| {
        let rendered = crate::legacy::compute::decimal::format_decimal(x, from_scale, false);
        mutable.push_value_ignore_validity(rendered.as_str());
    });

    let validity = from.validity().cloned();
    mutable.freeze().with_validity(validity)
}

/// Type-erased wrapper around [`decimal_to_utf8view`].
///
/// # Panics
/// Panics if `from` is not a `PrimitiveArray<i128>`.
#[cfg(feature = "dtype-decimal")]
pub(super) fn decimal_to_utf8view_dyn(from: &dyn Array) -> Utf8ViewArray {
    let decimals = from
        .as_any()
        .downcast_ref::<PrimitiveArray<i128>>()
        .unwrap();
    decimal_to_utf8view(decimals)
}
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ pub fn cast(
Utf8 => Ok(
utf8_to_utf8view(array.as_any().downcast_ref::<Utf8Array<i32>>().unwrap()).boxed(),
),
#[cfg(feature = "dtype-decimal")]
Decimal(_, _) => Ok(decimal_to_utf8view_dyn(array).boxed()),
_ => from_to_binview(array, from_type, to_type)
.map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()),
},
Expand Down
103 changes: 103 additions & 0 deletions crates/polars-arrow/src/legacy/compute/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,109 @@ pub(crate) fn deserialize_decimal(
}
}

/// Capacity, in bytes, of a [`FormatBuffer`].
const BUF_LEN: usize = 48;

/// Fixed-capacity, stack-allocated byte buffer holding formatted text.
///
/// `data[..len]` is the written text; bytes past `len` are not part of it.
#[derive(Clone, Copy)]
pub struct FormatBuffer {
    data: [u8; BUF_LEN],
    len: usize,
}

impl FormatBuffer {
    /// Creates an empty, zero-filled buffer.
    #[inline]
    pub const fn new() -> Self {
        Self {
            data: [0; BUF_LEN],
            len: 0,
        }
    }

    /// Returns the written prefix of the buffer as a string slice.
    #[inline]
    pub fn as_str(&self) -> &str {
        let written = &self.data[..self.len];
        // SAFETY: both fields are private, so only code in this module can fill the buffer;
        // that code must only ever leave valid UTF-8 in `data[..len]`.
        unsafe { std::str::from_utf8_unchecked(written) }
    }
}

impl Default for FormatBuffer {
    /// Equivalent to [`FormatBuffer::new`].
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

/// Powers of ten indexed by exponent: `POW10[i] == 10^i` for every `i` in `0..=38`.
///
/// 10^38 is the largest power of ten that fits in an `i128`, so the table covers every
/// exponent representable in that type. It is computed at compile time; an overflow in the
/// computation would be a compile error rather than a silently wrong entry.
const POW10: [i128; 39] = {
    let mut table = [1_i128; 39];
    let mut exp = 1;
    while exp < table.len() {
        table[exp] = table[exp - 1] * 10;
        exp += 1;
    }
    table
};

/// Renders the decimal with unscaled value `v` and `scale` fractional digits as ASCII text.
///
/// The integral part is always written. A `.` followed by the fraction, left-padded with
/// zeros to `scale` digits, is appended only when the fractional part is non-zero; with
/// `trim_zeros` set, trailing zeros of that fraction are dropped. Negative values carry a
/// leading `-`, including those whose integral part is zero (e.g. `-0.5`).
///
/// # Panics
/// Panics if `scale` is not a valid index into `POW10`.
pub fn format_decimal(v: i128, scale: usize, trim_zeros: bool) -> FormatBuffer {
    const ZEROS: [u8; BUF_LEN] = [b'0'; BUF_LEN];

    let mut buf = FormatBuffer::new();
    let factor = POW10[scale]; //10_i128.pow(scale as _);
    let div = v / factor;
    // Take the magnitude with `unsigned_abs`: `v.abs()` overflows for `i128::MIN`, which
    // panics in debug builds and yields a negative remainder in release builds. A negative
    // remainder would put a `-` inside the fraction and could make `scale - n_frac` underflow.
    let rem = v.unsigned_abs() % factor.unsigned_abs();

    // SAFETY: `scale` indexed `POW10` above, so it is bounded by the table length. `rem` is
    // non-negative and smaller than `10^scale`, hence it has at most `scale` digits and
    // `scale - n_frac` cannot underflow. The output is an optional sign, the integral digits,
    // a `.` and `scale` fractional digits; for a 128-bit value that is at most 41 bytes, and
    // each integer write starts with at least 40 bytes of room, so every access stays inside
    // the `BUF_LEN`-byte buffers. Only ASCII bytes are written.
    unsafe {
        let mut ptr = buf.data.as_mut_ptr();
        // The integer formatter prints "0" for a zero integral part, so the sign of a value
        // in (-1, 0) has to be emitted by hand.
        if div == 0 && v < 0 {
            *ptr = b'-';
            ptr = ptr.add(1);
            buf.len = 1;
        }
        let n_whole = itoap::write_to_ptr(ptr, div);
        buf.len += n_whole;
        if rem != 0 {
            ptr = ptr.add(n_whole);
            *ptr = b'.';
            ptr = ptr.add(1);
            // Format the fraction separately so it can be left-padded with zeros.
            let mut frac_buf = [0_u8; BUF_LEN];
            let n_frac = itoap::write_to_ptr(frac_buf.as_mut_ptr(), rem);
            std::ptr::copy_nonoverlapping(ZEROS.as_ptr(), ptr, scale - n_frac);
            ptr = ptr.add(scale - n_frac);
            std::ptr::copy_nonoverlapping(frac_buf.as_ptr(), ptr, n_frac);
            buf.len += 1 + scale;
            if trim_zeros {
                // Walk back from the last fractional digit; `rem != 0` guarantees a non-zero
                // digit is reached before the `.`.
                ptr = ptr.add(n_frac - 1);
                while *ptr == b'0' {
                    ptr = ptr.sub(1);
                    buf.len -= 1;
                }
            }
        }
    }

    buf
}

#[cfg(test)]
mod test {
use super::*;
Expand Down
3 changes: 1 addition & 2 deletions crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ comfy-table = { version = "7.0.1", default_features = false, optional = true }
either = { workspace = true }
hashbrown = { workspace = true }
indexmap = { workspace = true }
itoap = { version = "1", optional = true, features = ["simd"] }
ndarray = { workspace = true, optional = true }
num-traits = { workspace = true }
once_cell = { workspace = true }
Expand Down Expand Up @@ -109,7 +108,7 @@ dtype-time = ["temporal"]
dtype-array = ["arrow/dtype-array", "polars-compute/dtype-array"]
dtype-i8 = []
dtype-i16 = []
dtype-decimal = ["dep:itoap", "arrow/dtype-decimal"]
dtype-decimal = ["arrow/dtype-decimal"]
dtype-u8 = []
dtype-u16 = []
dtype-categorical = []
Expand Down
122 changes: 6 additions & 116 deletions crates/polars-core/src/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1153,126 +1153,16 @@ impl Series {
}
}

#[inline]
#[cfg(feature = "dtype-decimal")]
mod decimal {
use std::fmt::Formatter;
use std::{fmt, ptr, str};
pub fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result {
use arrow::legacy::compute::decimal::format_decimal;

use crate::fmt::{fmt_float_string, get_trim_decimal_zeros};

const BUF_LEN: usize = 48;

#[derive(Clone, Copy)]
pub struct FormatBuffer {
data: [u8; BUF_LEN],
len: usize,
}

impl FormatBuffer {
#[inline]
pub const fn new() -> Self {
Self {
data: [0; BUF_LEN],
len: 0,
}
}

#[inline]
pub fn as_str(&self) -> &str {
unsafe { str::from_utf8_unchecked(&self.data[..self.len]) }
}
}

const POW10: [i128; 38] = [
1,
10,
100,
1000,
10000,
100000,
1000000,
10000000,
100000000,
1000000000,
10000000000,
100000000000,
1000000000000,
10000000000000,
100000000000000,
1000000000000000,
10000000000000000,
100000000000000000,
1000000000000000000,
10000000000000000000,
100000000000000000000,
1000000000000000000000,
10000000000000000000000,
100000000000000000000000,
1000000000000000000000000,
10000000000000000000000000,
100000000000000000000000000,
1000000000000000000000000000,
10000000000000000000000000000,
100000000000000000000000000000,
1000000000000000000000000000000,
10000000000000000000000000000000,
100000000000000000000000000000000,
1000000000000000000000000000000000,
10000000000000000000000000000000000,
100000000000000000000000000000000000,
1000000000000000000000000000000000000,
10000000000000000000000000000000000000,
];

pub fn format_decimal(v: i128, scale: usize, trim_zeros: bool) -> FormatBuffer {
const ZEROS: [u8; BUF_LEN] = [b'0'; BUF_LEN];

let mut buf = FormatBuffer::new();
let factor = POW10[scale]; //10_i128.pow(scale as _);
let (div, rem) = (v / factor, v.abs() % factor);

unsafe {
let mut ptr = buf.data.as_mut_ptr();
if div == 0 && v < 0 {
*ptr = b'-';
ptr = ptr.add(1);
buf.len = 1;
}
let n_whole = itoap::write_to_ptr(ptr, div);
buf.len += n_whole;
if rem != 0 {
ptr = ptr.add(n_whole);
*ptr = b'.';
ptr = ptr.add(1);
let mut frac_buf = [0_u8; BUF_LEN];
let n_frac = itoap::write_to_ptr(frac_buf.as_mut_ptr(), rem);
ptr::copy_nonoverlapping(ZEROS.as_ptr(), ptr, scale - n_frac);
ptr = ptr.add(scale - n_frac);
ptr::copy_nonoverlapping(frac_buf.as_mut_ptr(), ptr, n_frac);
buf.len += 1 + scale;
if trim_zeros {
ptr = ptr.add(n_frac - 1);
while *ptr == b'0' {
ptr = ptr.sub(1);
buf.len -= 1;
}
}
}
}

buf
}

#[inline]
pub fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result {
let trim_zeros = get_trim_decimal_zeros();
f.write_str(fmt_float_string(format_decimal(v, scale, trim_zeros).as_str()).as_str())
}
let trim_zeros = get_trim_decimal_zeros();
let repr = format_decimal(v, scale, trim_zeros);
f.write_str(fmt_float_string(repr.as_str()).as_str())
}

#[cfg(feature = "dtype-decimal")]
pub use decimal::fmt_decimal;

#[cfg(all(
test,
feature = "temporal",
Expand Down

0 comments on commit d927cea

Please sign in to comment.