Skip to content

Commit

Permalink
Faster timestamp parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Mar 3, 2023
1 parent aab85b3 commit 10b560e
Show file tree
Hide file tree
Showing 3 changed files with 203 additions and 77 deletions.
5 changes: 5 additions & 0 deletions arrow-cast/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,10 @@ num = { version = "0.4", default-features = false, features = ["std"] }
lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] }

[dev-dependencies]
criterion = { version = "0.4", default-features = false }

[build-dependencies]

[[bench]]
name = "parse_timestamp"
harness = false
44 changes: 44 additions & 0 deletions arrow-cast/benches/parse_timestamp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_cast::parse::string_to_timestamp_nanos;
use criterion::*;

/// Benchmarks `string_to_timestamp_nanos` over a set of representative inputs.
///
/// The inputs cover a bare date, date-times with 0, 3, 6 and 9 fractional
/// digits, and the same shapes followed by a numeric offset or a trailing `Z`.
/// Each input is registered as its own benchmark, named after the input string.
fn criterion_benchmark(c: &mut Criterion) {
    const INPUTS: [&str; 10] = [
        "2020-09-08",
        "2020-09-08T13:42:29",
        "2020-09-08T13:42:29.190",
        "2020-09-08T13:42:29.190855",
        "2020-09-08T13:42:29.190855999",
        "2020-09-08T13:42:29+00:00",
        "2020-09-08T13:42:29.190+00:00",
        "2020-09-08T13:42:29.190855+00:00",
        "2020-09-08T13:42:29.190855999-05:00",
        "2020-09-08T13:42:29.190855Z",
    ];

    for raw in INPUTS {
        // Hide the literal from the optimizer so the parse is not const-folded
        let input = black_box(raw);
        c.bench_function(input, |bencher| {
            bencher.iter(|| string_to_timestamp_nanos(input).unwrap())
        });
    }
}

// Collect `criterion_benchmark` into a benchmark group named `benches`
criterion_group!(benches, criterion_benchmark);
// Generate the `main` function that runs the `benches` group; the bench
// target is declared with `harness = false`, so no default harness provides one
criterion_main!(benches);
231 changes: 154 additions & 77 deletions arrow-cast/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,94 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::timezone::Tz;
use arrow_array::types::*;
use arrow_array::ArrowPrimitiveType;
use arrow_schema::ArrowError;
use chrono::prelude::*;

/// Parse a fixed-length slice of ASCII decimal digits into an integer
///
/// Returns `None` if any byte is outside `b'0'..=b'9'`. The caller picks `T`
/// wide enough to hold the largest `N`-digit value; no overflow checking is
/// performed beyond what `T`'s arithmetic operators already do.
fn parse_fixed_integer<const N: usize, T>(b: &[u8; N]) -> Option<T>
where
    T: From<u8> + std::ops::AddAssign + std::ops::MulAssign,
{
    // Validate up front so no arithmetic is performed on malformed input
    if !b.iter().all(u8::is_ascii_digit) {
        return None;
    }

    let mut acc = T::from(0);
    for &byte in b {
        acc *= T::from(10);
        acc += T::from(byte - b'0');
    }
    Some(acc)
}

/// Parse a date of the form `2023-01-01`
fn parse_date(bytes: &[u8; 10]) -> Result<NaiveDate, ArrowError> {
let year = parse_fixed_integer::<4, u16>(bytes[0..4].try_into().unwrap());
let month = parse_fixed_integer::<2, u8>(bytes[5..7].try_into().unwrap());
let day = parse_fixed_integer::<2, u8>(bytes[8..10].try_into().unwrap());

match (year, month, day, bytes[4], bytes[7]) {
(Some(year), Some(month), Some(day), b'-', b'-') => {
NaiveDate::from_ymd_opt(year as _, month as _, day as _)
}
_ => None,
}
.ok_or_else(|| {
ArrowError::ParseError(format!(
"Error parsing date from '{}'",
String::from_utf8_lossy(bytes)
))
})
}

/// Parse a time of the form `09:26:56`
fn parse_time(bytes: &[u8; 8]) -> Result<NaiveTime, ArrowError> {
let hour = parse_fixed_integer::<2, u8>(bytes[0..2].try_into().unwrap());
let minute = parse_fixed_integer::<2, u8>(bytes[3..5].try_into().unwrap());
let second = parse_fixed_integer::<2, u8>(bytes[6..8].try_into().unwrap());
match (hour, minute, second, bytes[2], bytes[5]) {
(Some(hour), Some(minute), Some(second), b':', b':') => {
NaiveTime::from_hms_opt(hour as _, minute as _, second as _)
}
_ => None,
}
.ok_or_else(|| {
ArrowError::ParseError(format!(
"Error parsing time from '{}'",
String::from_utf8_lossy(bytes)
))
})
}

/// Parse sub-second digits from the start of `bytes`
///
/// `bytes` is the input immediately following the `.` of a timestamp. It must
/// begin with exactly 3, 6, or 9 ASCII digits (milli-, micro- or nanosecond
/// precision); anything after those digits is left for the caller to parse.
/// At most 9 digits are consumed.
///
/// Returns the value in nanoseconds and the number of digits consumed.
///
/// # Errors
///
/// Returns a `ParseError` if the number of leading digits is not 3, 6, or 9.
/// This includes an empty slice and a slice that starts with a non-digit.
fn parse_nanoseconds(bytes: &[u8]) -> Result<(u32, usize), ArrowError> {
    // Count every leading digit (capped at 9) instead of probing every third
    // byte: probing can overshoot `bytes.len()` and panic when slicing, and
    // leaves the count at 0 when there are no digits at all.
    let digits = bytes
        .iter()
        .take(9)
        .take_while(|b| b.is_ascii_digit())
        .count();

    // `digits <= bytes.len()`, so the fixed-width slices below are in bounds
    let parsed = match digits {
        3 => parse_fixed_integer::<3, u16>(bytes[0..3].try_into().unwrap())
            .map(|x| (u32::from(x) * 1_000_000, 3)),
        6 => parse_fixed_integer::<6, u32>(bytes[0..6].try_into().unwrap())
            .map(|x| (x * 1_000, 6)),
        9 => {
            parse_fixed_integer::<9, u32>(bytes[0..9].try_into().unwrap()).map(|n| (n, 9))
        }
        // Any other digit count is malformed user input, not a bug: report it
        _ => None,
    };

    parsed.ok_or_else(|| {
        ArrowError::ParseError(format!(
            "Error parsing '{}': expected 3, 6, or 9 sub-second digits'",
            String::from_utf8_lossy(bytes)
        ))
    })
}

/// Accepts a string and parses it relative to the provided `timezone`
///
/// In addition to RFC3339 / ISO8601 standard timestamps, it also
Expand All @@ -46,89 +129,69 @@ pub fn string_to_datetime<T: TimeZone>(
timezone: &T,
s: &str,
) -> Result<DateTime<T>, ArrowError> {
// Fast path: RFC3339 timestamp (with a T)
// Example: 2020-09-08T13:42:29.190855Z
if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
return Ok(ts.with_timezone(timezone));
}

// Implement quasi-RFC3339 support by trying to parse the
// timestamp with various other format specifiers to support
// separating the date and time with a space ' ' rather than 'T' to be
// (more) compatible with Apache Spark SQL

let supported_formats = vec![
"%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00
"%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30"
];

for f in supported_formats.iter() {
if let Ok(ts) = DateTime::parse_from_str(s, f) {
return Ok(ts.with_timezone(timezone));
}
}

// with an explicit Z, using ' ' as a separator
// Example: 2020-09-08 13:42:29Z
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
return Ok(ts.with_timezone(timezone));
let bytes = s.as_bytes();
if bytes.len() < 10 {
return Err(ArrowError::ParseError(format!("Error parsing timestamp from '{s}': timestamp must contain at least 10 characters")));
}

// Support timestamps without an explicit timezone offset, again
// to be compatible with what Apache Spark SQL does.

// without a timezone specifier as a local time, using T as a separator
// Example: 2020-09-08T13:42:29.190855
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
return Ok(DateTime::from_local(ts, offset));
}
}

// without a timezone specifier as a local time, using T as a
// separator, no fractional seconds
// Example: 2020-09-08T13:42:29
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
return Ok(DateTime::from_local(ts, offset));
}
}
let date = parse_date(bytes[0..10].try_into().unwrap())?;
// 1997-01-31
if bytes.len() == 10 {
let offset = timezone.offset_from_local_date(&date);
let offset = offset.single().ok_or_else(|| {
ArrowError::ParseError(format!("Error computing offset for date '{date}'"))
})?;

// without a timezone specifier as a local time, using ' ' as a separator
// Example: 2020-09-08 13:42:29.190855
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") {
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
return Ok(DateTime::from_local(ts, offset));
}
let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
return Ok(DateTime::from_local(date.and_time(time), offset));
}

// without a timezone specifier as a local time, using ' ' as a
// separator, no fractional seconds
// Example: 2020-09-08 13:42:29
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
return Ok(DateTime::from_local(ts, offset));
}
if bytes.len() < 19 {
return Err(ArrowError::ParseError(format!(
"Error parsing time from '{s}': time must contain at least 8 characters"
)));
}

// without a timezone specifier as a local time, only date
// Example: 2020-09-08
if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
if let Some(ts) = dt.and_hms_opt(0, 0, 0) {
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
return Ok(DateTime::from_local(ts, offset));
}
}
if bytes[10] != b' ' && bytes[10] != b'T' {
return Err(ArrowError::ParseError(format!(
"Error parsing '{s}': expected either ' ' or 'T' as separator, got {}",
bytes[10]
)));
}

// Note we don't pass along the error message from the underlying
// chrono parsing because we tried several different format
// strings and we don't know which the user was trying to
// match. Thus any of the specific error messages is likely to
// be more confusing than helpful
Err(ArrowError::CastError(format!(
"Error parsing '{s}' as timestamp"
)))
let time = parse_time(bytes[11..19].try_into().unwrap())?;
let (time, bytes_offset) = if bytes.len() >= 20 && bytes[19] == b'.' {
let (nanoseconds, digits) = parse_nanoseconds(&bytes[20..])?;
// Nanoseconds cannot exceed 2_000_000_000
(time.with_nanosecond(nanoseconds).unwrap(), 20 + digits)
} else {
(time, 19)
};
let ts = date.and_time(time);

if bytes.len() <= bytes_offset {
let offset = timezone.offset_from_local_datetime(&ts);
let offset = offset.single().ok_or_else(|| {
ArrowError::ParseError(format!("Error computing offset for timestamp '{ts}'"))
})?;
return Ok(DateTime::from_local(ts, offset));
}

if bytes[bytes_offset] == b'z' || bytes[bytes_offset] == b'Z' {
let offset = timezone.offset_from_local_datetime(&ts);
let offset = offset.single().ok_or_else(|| {
ArrowError::ParseError(format!("Error computing offset for timestamp '{ts}'"))
})?;
return Ok(DateTime::from_utc(ts, offset));
}

// Parse remainder of string as timezone
let parsed_tz: Tz = s[bytes_offset..].parse()?;
let offset = parsed_tz.offset_from_local_datetime(&ts);
let offset = offset.single().ok_or_else(|| {
ArrowError::ParseError(format!("Error computing offset for timestamp '{ts}'"))
})?;
Ok(DateTime::<Tz>::from_local(ts, offset).with_timezone(timezone))
}

/// Accepts a string in RFC3339 / ISO8601 standard format and some
Expand Down Expand Up @@ -464,6 +527,20 @@ mod tests {
use super::*;
use arrow_array::timezone::Tz;

#[test]
fn test_parse_fixed() {
    // Well-formed digit runs, including leading zeros
    assert_eq!(parse_fixed_integer::<3, u8>(b"001"), Some(1));
    assert_eq!(parse_fixed_integer::<3, u8>(b"011"), Some(11));
    assert_eq!(parse_fixed_integer::<3, u8>(b"111"), Some(111));
    assert_eq!(parse_fixed_integer::<1, u8>(b"9"), Some(9));
    assert_eq!(parse_fixed_integer::<2, u8>(b"23"), Some(23));
    assert_eq!(parse_fixed_integer::<3, u8>(b"124"), Some(124));
    assert_eq!(parse_fixed_integer::<4, u16>(b"1243"), Some(1243));

    // Any non-digit byte rejects the whole input
    assert!(parse_fixed_integer::<3, u8>(b"2k2").is_none());
    assert!(parse_fixed_integer::<2, u8>(b"4k").is_none());
}

#[test]
fn string_to_timestamp_timezone() {
// Explicit timezone
Expand Down Expand Up @@ -565,11 +642,11 @@ mod tests {
// Test parsing invalid formats

// It would be nice to make these messages better
expect_timestamp_parse_error("", "Error parsing '' as timestamp");
expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp");
expect_timestamp_parse_error("", "Error parsing timestamp from '': timestamp must contain at least 10 characters");
expect_timestamp_parse_error("SS", "Error parsing timestamp from 'SS': timestamp must contain at least 10 characters");
expect_timestamp_parse_error(
"Wed, 18 Feb 2015 23:16:09 GMT",
"Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp",
"Parser error: Error parsing date from 'Wed, 18 Fe'",
);
}

Expand Down

0 comments on commit 10b560e

Please sign in to comment.