Skip to content

Commit

Permalink
auto merge of rust-lang#15638 : blake2-ppc/rust/ptr-arithmetic-chars,…
Browse files Browse the repository at this point in the history
… r=huonw

Reimplement the string slice's `Iterator<char>` by wrapping the already efficient
slice iterator.

The iterator uses our guarantee that the string contains valid UTF-8, but its only unsafe
code is transmuting the decoded `u32` into `char`.

Benchmarks suggest that the runtime of `Chars` benchmarks are reduced by up to 30%,
runtime of `Chars` reversed reduced by up to 60%.

```
BEFORE
test str::bench::char_indicesator                          ... bench:       124 ns/iter (+/- 1)
test str::bench::char_indicesator_rev                      ... bench:       188 ns/iter (+/- 9)
test str::bench::char_iterator                             ... bench:       122 ns/iter (+/- 2)
test str::bench::char_iterator_ascii                       ... bench:       302 ns/iter (+/- 41)
test str::bench::char_iterator_for                         ... bench:       123 ns/iter (+/- 4)
test str::bench::char_iterator_rev                         ... bench:       189 ns/iter (+/- 14)
test str::bench::char_iterator_rev_for                     ... bench:       177 ns/iter (+/- 4)

AFTER
test str::bench::char_indicesator                          ... bench:        85 ns/iter (+/- 3)
test str::bench::char_indicesator_rev                      ... bench:        82 ns/iter (+/- 2)
test str::bench::char_iterator                             ... bench:       100 ns/iter (+/- 3)
test str::bench::char_iterator_ascii                       ... bench:       317 ns/iter (+/- 3)
test str::bench::char_iterator_for                         ... bench:        86 ns/iter (+/- 2)
test str::bench::char_iterator_rev                         ... bench:        80 ns/iter (+/- 6)
test str::bench::char_iterator_rev_for                     ... bench:        68 ns/iter (+/- 0)
```

Note: Branch name is no longer indicative of the implementation.
  • Loading branch information
bors committed Jul 19, 2014
2 parents e0a6e2b + c5e0736 commit ca38434
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 57 deletions.
54 changes: 48 additions & 6 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,7 @@ impl OwnedStr for String {
#[cfg(test)]
mod tests {
use std::iter::AdditiveIterator;
use std::iter::range;
use std::default::Default;
use std::char::Char;
use std::clone::Clone;
Expand Down Expand Up @@ -1610,6 +1611,30 @@ mod tests {
assert_eq!(pos, v.len());
}

#[test]
fn test_chars_decoding() {
let mut bytes = [0u8, ..4];
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
let len = c.encode_utf8(bytes);
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
if Some(c) != s.chars().next() {
fail!("character {:x}={} does not decode correctly", c as u32, c);
}
}
}

#[test]
fn test_chars_rev_decoding() {
let mut bytes = [0u8, ..4];
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
let len = c.encode_utf8(bytes);
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
if Some(c) != s.chars().rev().next() {
fail!("character {:x}={} does not decode correctly", c as u32, c);
}
}
}

#[test]
fn test_iterator_clone() {
let s = "ศไทย中华Việt Nam";
Expand Down Expand Up @@ -2240,16 +2265,26 @@ mod tests {
#[cfg(test)]
mod bench {
use test::Bencher;
use test::black_box;
use super::*;
use std::option::{None, Some};
use std::iter::{Iterator, DoubleEndedIterator};
use std::collections::Collection;

#[bench]
fn char_iterator(b: &mut Bencher) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();

b.iter(|| assert_eq!(s.chars().count(), len));
b.iter(|| s.chars().count());
}

#[bench]
fn char_iterator_for(b: &mut Bencher) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";

b.iter(|| {
for ch in s.chars() { black_box(ch) }
});
}

#[bench]
Expand All @@ -2260,17 +2295,24 @@ mod bench {
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb
Mary had a little lamb, Little lamb";
let len = s.char_len();

b.iter(|| assert_eq!(s.chars().count(), len));
b.iter(|| s.chars().count());
}

#[bench]
fn char_iterator_rev(b: &mut Bencher) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
let len = s.char_len();

b.iter(|| assert_eq!(s.chars().rev().count(), len));
b.iter(|| s.chars().rev().count());
}

#[bench]
fn char_iterator_rev_for(b: &mut Bencher) {
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";

b.iter(|| {
for ch in s.chars().rev() { black_box(ch) }
});
}

#[bench]
Expand Down
164 changes: 113 additions & 51 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,47 +97,110 @@ impl<'a> CharEq for &'a [char] {
Section: Iterators
*/

/// External iterator for a string's characters.
/// Use with the `std::iter` module.
/// Iterator for the char (representing *Unicode Scalar Values*) of a string
///
/// Created with the method `.chars()`.
#[deriving(Clone)]
pub struct Chars<'a> {
/// The slice remaining to be iterated
string: &'a str,
iter: slice::Items<'a, u8>
}

// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
)

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & CONT_MASK) as u32)
)

macro_rules! utf8_is_cont_byte(
($byte:expr) => (($byte & !CONT_MASK) == TAG_CONT_U8)
)

#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
}
}

impl<'a> Iterator<char> for Chars<'a> {
#[inline]
fn next(&mut self) -> Option<char> {
// Decode the next codepoint, then update
// the slice to be just the remaining part
if self.string.len() != 0 {
let CharRange {ch, next} = self.string.char_range_at(0);
unsafe {
self.string = raw::slice_unchecked(self.string, next, self.string.len());
// Decode UTF-8, using the valid UTF-8 invariant
let x = match self.iter.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
Some(&next_byte) => next_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte!(x, 2);
let y = unwrap_or_0(self.iter.next());
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.iter.next());
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.iter.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
}
Some(ch)
} else {
None
}

// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
Some(mem::transmute(ch))
}
}

#[inline]
fn size_hint(&self) -> (uint, Option<uint>) {
(self.string.len().saturating_add(3)/4, Some(self.string.len()))
let (len, _) = self.iter.size_hint();
(len.saturating_add(3) / 4, Some(len))
}
}

impl<'a> DoubleEndedIterator<char> for Chars<'a> {
#[inline]
fn next_back(&mut self) -> Option<char> {
if self.string.len() != 0 {
let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
unsafe {
self.string = raw::slice_unchecked(self.string, 0, next);
let w = match self.iter.next_back() {
None => return None,
Some(&back_byte) if back_byte < 128 => return Some(back_byte as char),
Some(&back_byte) => back_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
let z = unwrap_or_0(self.iter.next_back());
ch = utf8_first_byte!(z, 2);
if utf8_is_cont_byte!(z) {
let y = unwrap_or_0(self.iter.next_back());
ch = utf8_first_byte!(y, 3);
if utf8_is_cont_byte!(y) {
let x = unwrap_or_0(self.iter.next_back());
ch = utf8_first_byte!(x, 4);
ch = utf8_acc_cont_byte!(ch, y);
}
Some(ch)
} else {
None
ch = utf8_acc_cont_byte!(ch, z);
}
ch = utf8_acc_cont_byte!(ch, w);

// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
Some(mem::transmute(ch))
}
}
}
Expand All @@ -146,18 +209,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
/// Use with the `std::iter` module.
#[deriving(Clone)]
pub struct CharOffsets<'a> {
/// The original string to be iterated
string: &'a str,
front_offset: uint,
iter: Chars<'a>,
}

impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
#[inline]
fn next(&mut self) -> Option<(uint, char)> {
// Compute the byte offset by using the pointer offset between
// the original string slice and the iterator's remaining part
let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
self.iter.next().map(|ch| (offset, ch))
let (pre_len, _) = self.iter.iter.size_hint();
match self.iter.next() {
None => None,
Some(ch) => {
let index = self.front_offset;
let (len, _) = self.iter.iter.size_hint();
self.front_offset += pre_len - len;
Some((index, ch))
}
}
}

#[inline]
Expand All @@ -169,11 +237,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
#[inline]
fn next_back(&mut self) -> Option<(uint, char)> {
self.iter.next_back().map(|ch| {
let offset = self.iter.string.len() +
self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
(offset, ch)
})
match self.iter.next_back() {
None => None,
Some(ch) => {
let (len, _) = self.iter.iter.size_hint();
let index = self.front_offset + len;
Some((index, ch))
}
}
}
}

Expand Down Expand Up @@ -672,9 +743,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2 => if second & 192 != TAG_CONT_U8 {err!()},
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
3 => {
match (first, second, next!() & 192) {
match (first, second, next!() & !CONT_MASK) {
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
Expand All @@ -683,7 +754,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
}
}
4 => {
match (first, second, next!() & 192, next!() & 192) {
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
Expand Down Expand Up @@ -880,19 +951,10 @@ pub struct CharRange {
pub next: uint,
}

// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
)

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
)

static TAG_CONT_U8: u8 = 128u8;
/// Mask of the value bits of a continuation byte
static CONT_MASK: u8 = 0b0011_1111u8;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
static TAG_CONT_U8: u8 = 0b1000_0000u8;

/// Unsafe operations
pub mod raw {
Expand Down Expand Up @@ -1608,7 +1670,7 @@ impl<'a> StrSlice<'a> for &'a str {

#[inline]
fn chars(&self) -> Chars<'a> {
Chars{string: *self}
Chars{iter: self.as_bytes().iter()}
}

#[inline]
Expand All @@ -1618,7 +1680,7 @@ impl<'a> StrSlice<'a> for &'a str {

#[inline]
fn char_indices(&self) -> CharOffsets<'a> {
CharOffsets{string: *self, iter: self.chars()}
CharOffsets{front_offset: 0, iter: self.chars()}
}

#[inline]
Expand Down Expand Up @@ -1828,7 +1890,7 @@ impl<'a> StrSlice<'a> for &'a str {
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
// while there is a previous byte == 10......
while i > 0 && s.as_bytes()[i] & 192u8 == TAG_CONT_U8 {
while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 {
i -= 1u;
}

Expand Down

0 comments on commit ca38434

Please sign in to comment.