Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTF-8 validation: Compute block end upfront #37926

Merged
merged 3 commits into from
Jan 12, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 26 additions & 27 deletions src/libcore/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1215,26 +1215,31 @@ fn contains_nonascii(x: usize) -> bool {
/// invalid sequence.
#[inline(always)]
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut offset = 0;
let mut index = 0;
let len = v.len();
while offset < len {
let old_offset = offset;

let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };

while index < len {
let old_offset = index;
macro_rules! err { () => {{
return Err(Utf8Error {
valid_up_to: old_offset
})
}}}

macro_rules! next { () => {{
offset += 1;
index += 1;
// we needed data, but there was none: error!
if offset >= len {
if index >= len {
err!()
}
v[offset]
v[index]
}}}

let first = v[offset];
let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
let second = next!();
Expand Down Expand Up @@ -1277,38 +1282,32 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
}
_ => err!()
}
offset += 1;
index += 1;
} else {
// Ascii case, try to skip forward quickly.
// When the pointer is aligned, read 2 words of data per iteration
// until we find a word containing a non-ascii byte.
let usize_bytes = mem::size_of::<usize>();
let bytes_per_iteration = 2 * usize_bytes;
let ptr = v.as_ptr();
let align = (ptr as usize + offset) & (usize_bytes - 1);
let align = (ptr as usize + index) & (usize_bytes - 1);
if align == 0 {
if len >= bytes_per_iteration {
while offset <= len - bytes_per_iteration {
unsafe {
let u = *(ptr.offset(offset as isize) as *const usize);
let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize);

// break if there is a nonascii byte
let zu = contains_nonascii(u);
let zv = contains_nonascii(v);
if zu || zv {
break;
}
while index < blocks_end {
unsafe {
let block = ptr.offset(index as isize) as *const usize;
// break if there is a nonascii byte
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
offset += bytes_per_iteration;
}
index += ascii_block_size;
}
// step from the point where the wordwise loop stopped
while offset < len && v[offset] < 128 {
offset += 1;
while index < len && v[index] < 128 {
index += 1;
}
} else {
offset += 1;
index += 1;
}
}
}
Expand Down