Skip to content

Commit

Permalink
fix: Csv stop simd cache if eol char is hit (#20199)
Browse files (browse the repository at this point in the history)
  • Loading branch information
ritchie46 authored Dec 6, 2024
1 parent 9e32651 commit 579d8fb
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
4 changes: 2 additions & 2 deletions crates/polars-io/src/csv/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,8 @@ impl<'a> CoreReader<'a> {

let counter = CountLines::new(self.quote_char, self.eol_char);
let mut total_offset = 0;
let check_utf8 = matches!(self.encoding, CsvEncoding::Utf8)
&& self.schema.iter_fields().any(|f| f.dtype().is_string());

pool.scope(|s| {
loop {
Expand Down Expand Up @@ -432,8 +434,6 @@ impl<'a> CoreReader<'a> {
total_offset = end;
(b, count)
};
let check_utf8 = matches!(self.encoding, CsvEncoding::Utf8)
&& self.schema.iter_fields().any(|f| f.dtype().is_string());

if !b.is_empty() {
let results = results.clone();
Expand Down
34 changes: 20 additions & 14 deletions crates/polars-io/src/csv/read/splitfields.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ mod inner {
unsafe fn finish_eol(
&mut self,
need_escaping: bool,
idx: usize,
pos: usize,
) -> Option<(&'a [u8], bool)> {
self.finished = true;
debug_assert!(idx <= self.v.len());
Some((self.v.get_unchecked(..idx), need_escaping))
debug_assert!(pos <= self.v.len());
Some((self.v.get_unchecked(..pos), need_escaping))
}

#[inline]
Expand All @@ -212,7 +212,11 @@ mod inner {

#[inline]
fn next(&mut self) -> Option<(&'a [u8], bool)> {
// First check cached value as this is hot.
// This must be before we check the cached value
if self.finished {
return None;
}
// Then check cached value as this is hot.
if self.previous_valid_ends != 0 {
let pos = self.previous_valid_ends.trailing_zeros() as usize;
self.previous_valid_ends >>= (pos + 1) as u64;
Expand All @@ -221,22 +225,24 @@ mod inner {
debug_assert!(pos < self.v.len());
// SAFETY:
// we are in bounds
let needs_escaping = self
.v
.first()
.map(|c| *c == self.quote_char && self.quoting)
.unwrap_or(false);

if *self.v.get_unchecked(pos) == self.eol_char {
return self.finish_eol(needs_escaping, pos);
}

let bytes = self.v.get_unchecked(..pos);

self.v = self.v.get_unchecked(pos + 1..);
let ret = Some((
bytes,
bytes
.first()
.map(|c| *c == self.quote_char && self.quoting)
.unwrap_or(false),
));
let ret = Some((bytes, needs_escaping));

return ret;
}
}
if self.finished {
return None;
}
if self.v.is_empty() {
return self.finish(false);
}
Expand Down

0 comments on commit 579d8fb

Please sign in to comment.