Skip to content

Commit

Permalink
Auto merge of #37926 - bluss:from-utf8-small-simplification, r=sfackler
Browse files Browse the repository at this point in the history
UTF-8 validation: Compute block end upfront

Simplify the conditional that ensures the whole-word loop is only
entered when there are at least two whole words left to read.

This makes the function slightly smaller and simpler, and gives a 0-5%
reduction in runtime for various test cases.
  • Loading branch information
bors committed Jan 12, 2017
2 parents 2782e8f + 0dffc1e commit 408c2f7
Showing 1 changed file with 26 additions and 27 deletions.
53 changes: 26 additions & 27 deletions src/libcore/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1232,26 +1232,31 @@ fn contains_nonascii(x: usize) -> bool {
/// invalid sequence.
#[inline(always)]
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut offset = 0;
let mut index = 0;
let len = v.len();
while offset < len {
let old_offset = offset;

let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };

while index < len {
let old_offset = index;
macro_rules! err { () => {{
return Err(Utf8Error {
valid_up_to: old_offset
})
}}}

macro_rules! next { () => {{
offset += 1;
index += 1;
// we needed data, but there was none: error!
if offset >= len {
if index >= len {
err!()
}
v[offset]
v[index]
}}}

let first = v[offset];
let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
let second = next!();
Expand Down Expand Up @@ -1294,38 +1299,32 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
}
_ => err!()
}
offset += 1;
index += 1;
} else {
// Ascii case, try to skip forward quickly.
// When the pointer is aligned, read 2 words of data per iteration
// until we find a word containing a non-ascii byte.
let usize_bytes = mem::size_of::<usize>();
let bytes_per_iteration = 2 * usize_bytes;
let ptr = v.as_ptr();
let align = (ptr as usize + offset) & (usize_bytes - 1);
let align = (ptr as usize + index) & (usize_bytes - 1);
if align == 0 {
if len >= bytes_per_iteration {
while offset <= len - bytes_per_iteration {
unsafe {
let u = *(ptr.offset(offset as isize) as *const usize);
let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize);

// break if there is a nonascii byte
let zu = contains_nonascii(u);
let zv = contains_nonascii(v);
if zu || zv {
break;
}
while index < blocks_end {
unsafe {
let block = ptr.offset(index as isize) as *const usize;
// break if there is a nonascii byte
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
offset += bytes_per_iteration;
}
index += ascii_block_size;
}
// step from the point where the wordwise loop stopped
while offset < len && v[offset] < 128 {
offset += 1;
while index < len && v[index] < 128 {
index += 1;
}
} else {
offset += 1;
index += 1;
}
}
}
Expand Down

0 comments on commit 408c2f7

Please sign in to comment.