From 062a6b90db56326a6990fd25d30f88a2de891a23 Mon Sep 17 00:00:00 2001
From: Youngsoo Son
Date: Fri, 15 Mar 2013 21:37:03 +0900
Subject: [PATCH] Provide a condition in string-based IO routines

---
 src/libcore/io.rs  | 44 ++++++++++++++++++++++++++++++++
 src/libcore/str.rs | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/src/libcore/io.rs b/src/libcore/io.rs
index 4942b0587851b..a7f18c386b1c2 100644
--- a/src/libcore/io.rs
+++ b/src/libcore/io.rs
@@ -207,6 +207,25 @@ pub trait ReaderUtil {
     fn read_i8(&self) -> i8;
 }
 
+/// Reader helpers that repair invalid UTF-8 while reading text.
+pub trait ReaderUtilEx {
+
+    /// Read bytes up to the delimiter `c` (which is not returned) or EOF
+    /// and fix invalid UTF-8 in them according to `mode`: strict (1)
+    /// raises `is_not_utf8` untrapped, replacement (2) substitutes U+FFFD
+    /// for each invalid sequence, and ignore (3) drops it. `c` is compared
+    /// with single bytes, so only an ASCII delimiter is matched reliably.
+    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str;
+
+    /// Read up to the first '\n' byte (which is not returned) or EOF,
+    /// fixing invalid UTF-8 according to `mode`.
+    fn read_fixed_utf8_line(&self, mode: uint) -> ~str;
+
+    /// Call `it` with each line until it returns false or EOF is reached,
+    /// fixing invalid UTF-8 in every line according to `mode`.
+    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool);
+}
+
 impl<T:Reader> ReaderUtil for T {
 
     fn read_bytes(&self,len: uint) -> ~[u8] {
@@ -463,6 +482,31 @@ impl<T:Reader> ReaderUtil for T {
     }
 }
 
+impl<T:Reader> ReaderUtilEx for T {
+
+    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str {
+        let mut bytes = ~[];
+        loop {
+            let ch = self.read_byte();
+            // Stop at -1 (treated as EOF) or at the delimiter, which is
+            // matched as a single byte and is not stored.
+            if ch == -1 || ch == c as int { break; }
+            bytes.push(ch as u8);
+        }
+        str::from_fixed_utf8_bytes(bytes, mode)
+    }
+
+    fn read_fixed_utf8_line(&self, mode: uint) -> ~str {
+        self.read_and_fix_utf8_until('\n', mode)
+    }
+
+    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool) {
+        while !self.eof() {
+            if !it(self.read_fixed_utf8_line(mode)) { break; }
+        }
+    }
+}
+
 fn extend_sign(val: u64, nbytes: uint) -> i64 {
     let shift = (8 - nbytes) * 8;
     (val << shift) as i64 >> shift
diff --git a/src/libcore/str.rs b/src/libcore/str.rs
index 4163679a98d74..78e5f63a61e17 100644
--- a/src/libcore/str.rs
+++ b/src/libcore/str.rs
@@ -21,6 +21,7 @@ use at_vec;
 use cast;
 use char;
 use cmp::{Equiv, TotalOrd, Ordering, Less, Equal, Greater};
+use condition;
 use libc;
 use option::{None, Option, Some};
 use ptr;
@@ -47,6 +48,35 @@ pub pure fn from_bytes(vv: &[const u8]) -> ~str {
     return unsafe { raw::from_bytes(vv) };
 }
 
+// Condition raised with the offending bytes when invalid UTF-8 is found
+condition! {
+    is_not_utf8: ~[u8] -> ();
+}
+
+/**
+ * Modes for handling an invalid UTF-8 sequence
+ *
+ * strict       leave the condition untrapped (also used for unknown modes)
+ * replacement  replace the invalid sequence with U+FFFD
+ * ignore       drop the invalid sequence
+ */
+const strict: uint = 1;
+const replacement: uint = 2;
+const ignore: uint = 3;
+
+/**
+ * Convert a vector of bytes to a string, fixing invalid UTF-8 per `mode`
+ *
+ * Traps `is_not_utf8` in replacement/ignore mode; otherwise it propagates
+ */
+pub fn from_fixed_utf8_bytes(v: &[const u8], mode: uint) -> ~str {
+    let bytes = match mode {
+        replacement | ignore => do is_not_utf8::cond.trap(|_| {()}).in { fix_utf8(v, mode) },
+        _ => fix_utf8(v, mode)
+    };
+    return unsafe { raw::from_bytes(bytes) };
+}
+
 /// Copy a slice into a new unique str
 pub pure fn from_slice(s: &str) -> ~str {
     unsafe { raw::slice_bytes(s, 0, len(s)) }
@@ -1548,6 +1578,39 @@ pub pure fn is_utf8(v: &[const u8]) -> bool {
     return true;
 }
 
+/// Copies `v`, handling each invalid UTF-8 sequence according to `mode`
+pub fn fix_utf8(v: &[const u8], mode: uint) -> ~[u8] {
+    let mut i = 0u;
+    let total = vec::len::<u8>(v);
+    let mut result = ~[];
+    while i < total {
+        let chend = i + utf8_char_width(v[i]);
+        let mut j = i + 1u;
+        while j < total && j < chend && v[j] & 192u8 == tag_cont_u8 {
+            j += 1u;
+        }
+        if j == chend {
+            fail_unless!(i != chend);
+            result = vec::append(result, v.view(i, j));
+        } else {
+            match mode {
+                replacement => {
+                    let replacement_char: ~[u8] = ~[0xef, 0xbf, 0xbd];
+                    result = vec::append(result, replacement_char);
+                },
+                _ => ()
+            }
+            // Report only the bytes consumed so far (`i..j`). `chend` can
+            // lie past the end of `v` when a sequence is truncated, and it
+            // can cover valid bytes that follow a broken sequence. When the
+            // width is 0 (`i == chend`), `j` is already `i + 1`.
+            is_not_utf8::cond.raise(v.slice(i, j));
+        }
+        i = j;
+    }
+    result
+}
+
 /// Determines if a vector of `u16` contains valid UTF-16
 pub pure fn is_utf16(v: &[u16]) -> bool {
     let len = vec::len(v);