Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide a condition in string-based IO routines #5399

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/libcore/io.rs
Original file line number Diff line number Diff line change
@@ -207,6 +207,25 @@ pub trait ReaderUtil {
fn read_i8(&self) -> i8;
}

/// Reader utilities that repair invalid UTF-8 while reading strings.
///
/// Every method takes a `mode` that is forwarded to
/// `str::from_fixed_utf8_bytes`; the values are the `str` module constants
/// `strict` (1), `replacement` (2) and `ignore` (3).
pub trait ReaderUtilEx {

/// Reads bytes up to (but not including) the terminator `c`, or up to EOF,
/// and converts them to a string, fixing invalid UTF-8 according to `mode`:
///
/// * strict: raise an error on an invalid sequence,
/// * replacement: substitute the Unicode replacement character for an
///   invalid sequence,
/// * ignore: drop an invalid sequence.
///
/// NOTE(review): the implementation compares each raw byte with `c as int`,
/// so presumably only an ASCII terminator behaves as intended - confirm.
fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str;

/// Reads up to the first '\n' (which is not returned) or EOF, fixing
/// invalid UTF-8 according to `mode`.
fn read_fixed_utf8_line(&self, mode: uint) -> ~str;

/// Calls `it` with each line until `it` returns false or EOF is reached,
/// fixing invalid UTF-8 in every line according to `mode`.
fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool);
}

impl<T:Reader> ReaderUtil for T {

fn read_bytes(&self,len: uint) -> ~[u8] {
@@ -463,6 +482,31 @@ impl<T:Reader> ReaderUtil for T {
}
}

impl<T:Reader> ReaderUtilEx for T {

    /// Accumulates raw bytes until the terminator byte or EOF (-1) is read,
    /// then hands them to `str::from_fixed_utf8_bytes` together with `mode`.
    /// The terminator itself is consumed but never stored.
    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str {
        // The terminator is compared against each raw byte value.
        let terminator = c as int;
        let mut buf = ~[];
        loop {
            let byte = self.read_byte();
            if byte == -1 { break; }
            if byte == terminator { break; }
            buf.push(byte as u8);
        }
        str::from_fixed_utf8_bytes(buf, mode)
    }

    /// Line-oriented shorthand: same as `read_and_fix_utf8_until` with '\n'
    /// as the terminator.
    fn read_fixed_utf8_line(&self, mode: uint) -> ~str {
        self.read_and_fix_utf8_until('\n', mode)
    }

    /// Feeds fixed-up lines to `it` one at a time; stops at EOF or as soon
    /// as `it` returns false.
    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool) {
        loop {
            if self.eof() { break; }
            let keep_going = it(self.read_fixed_utf8_line(mode));
            if !keep_going { break; }
        }
    }
}

fn extend_sign(val: u64, nbytes: uint) -> i64 {
let shift = (8 - nbytes) * 8;
(val << shift) as i64 >> shift
63 changes: 63 additions & 0 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@ use at_vec;
use cast;
use char;
use cmp::{Equiv, TotalOrd, Ordering, Less, Equal, Greater};
use condition;
use libc;
use option::{None, Option, Some};
use ptr;
@@ -47,6 +48,35 @@ pub pure fn from_bytes(vv: &[const u8]) -> ~str {
return unsafe { raw::from_bytes(vv) };
}

// Condition raised when an invalid UTF-8 byte sequence is found.
// The raised value is the offending bytes (`~[u8]`); a handler returns `()`.
condition! {
is_not_utf8: ~[u8] -> ();
}

/**
* Modes selecting how invalid UTF-8 is handled
*
* strict       raise an error (default mode: any value other than
*              `replacement` or `ignore` is handled this way)
* replacement  replace the invalid sequence with the Unicode replacement
*              character
* ignore       drop the invalid sequence
*
* NOTE(review): a reviewer suggested an enum instead of `uint` constants;
* the constants are kept here because the public functions take `mode: uint`.
*/
const strict: uint = 1;
const replacement: uint = 2;
const ignore: uint = 3;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably better as an enum (`enum UTF8Mode { Strict, Replacement, Ignore }`), unless there was a particular reason for using the constants?


/**
* Build a string from a byte vector, repairing invalid UTF-8 on the way
*
* The bytes are passed through `fix_utf8`, which raises the `is_not_utf8`
* condition for every invalid sequence. In `replacement` and `ignore` mode
* the condition is trapped here with a no-op handler; in any other mode it
* is left untrapped.
*/
pub fn from_fixed_utf8_bytes(v: &[const u8], mode: uint) -> ~str {
    let lenient = mode == replacement || mode == ignore;
    let fixed = if lenient {
        // Swallow the condition so that fixing can run to completion.
        do is_not_utf8::cond.trap(|_| {()}).in { fix_utf8(v, mode) }
    } else {
        fix_utf8(v, mode)
    };
    unsafe { raw::from_bytes(fixed) }
}

/// Copy a slice into a new unique str
pub pure fn from_slice(s: &str) -> ~str {
unsafe { raw::slice_bytes(s, 0, len(s)) }
@@ -1548,6 +1578,39 @@ pub pure fn is_utf8(v: &[const u8]) -> bool {
return true;
}

/// Returns a copy of `v` in which invalid UTF-8 sequences have been fixed.
///
/// The input is scanned one character at a time. A lead byte followed by the
/// expected number of continuation bytes is copied to the result unchanged.
/// Anything else is treated as invalid:
///
/// * in `replacement` mode the bytes of U+FFFD (0xEF 0xBF 0xBD) are appended
///   to the result; in every other mode nothing is appended;
/// * the `is_not_utf8` condition is raised with the invalid bytes, in every
///   mode. Whether that is fatal depends on the handler installed by the
///   caller.
///
/// The bytes reported through the condition are exactly the ones consumed
/// for the invalid sequence (`i..j`). This range is always inside `v`, even
/// when the input ends in the middle of a multi-byte character, and it never
/// includes a byte that is subsequently processed as the start of the next
/// character.
pub fn fix_utf8(v: &[const u8], mode: uint) -> ~[u8] {
    let mut i = 0u;
    let total = vec::len::<u8>(v);
    let mut result = ~[];
    while i < total {
        // Position one past the character that starts at `i`, going by its
        // lead byte. `chend == i` (width 0) marks an invalid lead byte, and
        // `chend` may exceed `total` when the input is truncated.
        let chend = i + utf8_char_width(v[i]);
        // Consume the continuation bytes that are actually present.
        let mut j = i + 1u;
        while j < total && j < chend && v[j] & 192u8 == tag_cont_u8 {
            j += 1u;
        }
        if j == chend {
            // Complete sequence; `j >= i + 1` guarantees a non-zero width.
            fail_unless!(i != chend);
            result = vec::append(result, v.view(i, j));
        } else {
            match mode {
                replacement => {
                    // UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
                    let replacement_char: ~[u8] = ~[0xef, 0xbf, 0xbd];
                    result = vec::append(result, replacement_char);
                },
                _ => ()
            }
            // Report only the consumed bytes: `j <= total`, so the slice
            // cannot run past the end of `v`, and it is the single byte at
            // `i` when the lead byte itself is invalid.
            is_not_utf8::cond.raise(v.slice(i, j));
        }
        i = j;
    }
    result
}

/// Determines if a vector of `u16` contains valid UTF-16
pub pure fn is_utf16(v: &[u16]) -> bool {
let len = vec::len(v);