Skip to content

Commit

Permalink
accelerate line iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
pascalkuthe committed Apr 8, 2024
1 parent 761f1da commit fce2e5c
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 4 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ exclude = ["/design/*", "/benches/*.txt", "/fuzz/**", "/.github/*"]

[features]
default = ["unicode_lines", "simd"]
memchr = ["dep:memchr"]
cr_lines = [] # Enable recognizing carriage returns as line breaks.
unicode_lines = ["cr_lines"] # Enable recognizing all Unicode line breaks.
simd = ["str_indices/simd"]
Expand All @@ -23,6 +24,7 @@ simd = ["str_indices/simd"]
small_chunks = []

[dependencies]
memchr = { version = "2.7.2", optional = true}
smallvec = "1.0.0"
str_indices = { version = "0.4", default-features = false }

Expand Down
8 changes: 4 additions & 4 deletions src/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ use std::sync::Arc;
use crate::slice::{RSEnum, RopeSlice};
use crate::str_utils::{
byte_to_line_idx, char_to_byte_idx, count_chars, count_utf16_surrogates, ends_with_line_break,
last_line_start_byte_idx, line_to_byte_idx, trim_line_break,
last_line_start_byte_idx, line_to_byte_idx, next_line_byte_idx, trim_line_break,
};
use crate::tree::{Count, Node, TextInfo};

Expand Down Expand Up @@ -1021,7 +1021,7 @@ impl<'a> Lines<'a> {
*line_idx += 1;

let head = &text[*leaf_byte_idx as usize..];
let mut line_len = line_to_byte_idx(head, 1);
let mut line_len = next_line_byte_idx(head);

// Check if the iterator needs to advance to the next chunk.
// During this check the number of newline (0 or 1) is yielded
Expand Down Expand Up @@ -1125,7 +1125,7 @@ impl<'a> Lines<'a> {
// This chunk contains a line break so it will contain the start of our line.
*text = node.children().nodes()[child_i].leaf_text();
// Find the end of the line within the chunk.
let mut line_end = line_to_byte_idx(text, 1);
let mut line_end = next_line_byte_idx(text);
// Check if the iterator was exhausted.
let ends_with_newline = if line_end >= available_bytes {
// Handle terminating lines without a line break properly.
Expand Down Expand Up @@ -1202,7 +1202,7 @@ impl<'a> Lines<'a> {
}

let start_idx = *byte_idx;
let end_idx = line_to_byte_idx(&text[start_idx..], 1) + start_idx;
let end_idx = next_line_byte_idx(&text[start_idx..]) + start_idx;
*byte_idx = end_idx;
*line_idx += 1;

Expand Down
63 changes: 63 additions & 0 deletions src/str_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,51 @@ pub(crate) fn utf16_code_unit_to_char_idx(text: &str, utf16_idx: usize) -> usize
str_indices::chars::from_byte_idx(text, str_indices::utf16::to_byte_idx(text, utf16_idx))
}

/// Returns the byte index just past the first line break in `text`, or
/// `text.len()` if `text` contains no line break.
///
/// memchr-accelerated variant, compiled only when `memchr` is enabled and
/// `unicode_lines` is not. With `cr_lines` enabled, `\r` and `\r\n` are
/// recognized in addition to `\n`, and a `\r\n` pair counts as one break.
#[cfg(all(not(feature = "unicode_lines"), feature = "memchr"))]
pub(crate) fn next_line_byte_idx(text: &str) -> usize {
    let bytes = text.as_bytes();

    #[cfg(not(feature = "cr_lines"))]
    let first_break = memchr::memchr(b'\n', bytes);
    #[cfg(feature = "cr_lines")]
    let first_break = memchr::memchr2(b'\n', b'\r', bytes);

    match first_break {
        // No line break at all: the "next line" starts at the end of the text.
        None => text.len(),
        // A CRLF pair is a single two-byte line break; skip both bytes.
        #[cfg(feature = "cr_lines")]
        Some(pos) if bytes[pos..].starts_with(b"\r\n") => pos + 2,
        // Any other recognized break is exactly one byte long.
        Some(pos) => pos + 1,
    }
}

/// Returns the byte index of the start of the last line of the passed text.
///
/// If the text ends in a line break, the returned index is `text.len()`
/// (the last line is empty). If the text contains no line break, returns 0.
///
/// memchr-accelerated variant, compiled only when `memchr` is enabled and
/// `unicode_lines` is not. With `cr_lines` enabled, `\r` is recognized as a
/// line break in addition to `\n`.
#[cfg(all(not(feature = "unicode_lines"), feature = "memchr"))]
pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
    // TODO: there is no accelerated path for `unicode_lines` yet. Using
    // `str::rfind` over the full set of Unicode line breaks is quite slow
    // (about 3x slower than memchr) because std just uses a naive chars
    // match loop. The packed (Teddy) algorithm from the aho-corasick crate
    // would be much faster (particularly since it reports end-of-match
    // positions), but that crate is a rather heavy dependency; maybe the
    // algorithm will be extracted one day.
    #[cfg(not(feature = "cr_lines"))]
    let offset = memchr::memrchr(b'\n', text.as_bytes());
    #[cfg(feature = "cr_lines")]
    let offset = memchr::memrchr2(b'\n', b'\r', text.as_bytes());

    // Searching backwards, the matched byte is always the final byte of a
    // line break (for `\r\n` that is the `\n`), so the last line begins on
    // the byte right after it.
    offset.map_or(0, |i| i + 1)
}

/// Fallback variant of `next_line_byte_idx`, compiled when `unicode_lines`
/// is enabled or `memchr` is disabled.
///
/// Simply asks `line_to_byte_idx` for the byte index of line 1 of `text`.
///
/// NOTE(review): presumably that is the byte index just past the first line
/// break, or `text.len()` when there is none — confirm against
/// `line_to_byte_idx`.
#[cfg(any(feature = "unicode_lines", not(feature = "memchr")))]
pub(crate) fn next_line_byte_idx(text: &str) -> usize {
line_to_byte_idx(text, 1)
}

/// Returns the byte index of the start of the last line of the passed text.
///
/// Note: if the text ends in a line break, that means the last line is
/// an empty line that starts at the end of the text.
#[cfg(any(feature = "unicode_lines", not(feature = "memchr")))]
pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
let mut itr = text.bytes().enumerate().rev();

Expand Down Expand Up @@ -106,6 +147,20 @@ pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
///
/// If the string doesn't end in a line break, returns the string unchanged.
#[inline]
#[cfg(not(any(feature = "unicode_lines", feature = "cr_lines")))]
pub(crate) fn trim_line_break(text: &str) -> &str {
    // Hack: CRLF needs no special handling here. This is only used by the
    // line iterator, which just has to avoid matching the same line
    // terminator a second time, so dropping a single trailing `\n` is enough.
    match text.strip_suffix('\n') {
        Some(trimmed) => trimmed,
        None => text,
    }
}

/// Trims a single trailing line break (if any) off the end of the passed string.
///
/// If the string doesn't end in a line break, returns the string unchanged.
#[inline]
#[cfg(any(feature = "unicode_lines", feature = "cr_lines"))]
pub(crate) fn trim_line_break(text: &str) -> &str {
if text.is_empty() {
return "";
Expand Down Expand Up @@ -147,6 +202,14 @@ pub(crate) fn trim_line_break(text: &str) -> &str {

/// Returns `true` if the given string ends in a line break.
///
/// This variant is compiled when neither `cr_lines` nor `unicode_lines` is
/// enabled, so a line feed (`\n`) is the only line break it recognizes.
#[inline]
#[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
pub(crate) fn ends_with_line_break(text: &str) -> bool {
    // `\n` is a single ASCII byte and never occurs inside a multi-byte UTF-8
    // sequence, so inspecting the final byte is sufficient.
    matches!(text.as_bytes().last(), Some(b'\n'))
}

/// Returns whether the given string ends in a line break or not.
#[inline]
#[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
pub(crate) fn ends_with_line_break(text: &str) -> bool {
if text.is_empty() {
return false;
Expand Down

0 comments on commit fce2e5c

Please sign in to comment.