From cd85664787f5abaab019abebc0681e62db2c6366 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 15:48:16 -0400 Subject: [PATCH 1/6] Don't ignore the start offset when searching for an anchored literal. If we ignore the start offset, then we may report a match where none exists. This can in particular lead to a match loop that never terminates. Fixes #255. --- src/exec.rs | 6 +++++- tests/misc.rs | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/exec.rs b/src/exec.rs index 87c3e84dab..755342497b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -589,7 +589,11 @@ impl<'c> ExecNoSync<'c> { lits.find_start(&text[start..]) .map(|(s, e)| (start + s, start + e)) } - AnchoredEnd => self.ro.suffixes.find_end(&text), + AnchoredEnd => { + let lits = &self.ro.suffixes; + lits.find_end(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } } } diff --git a/tests/misc.rs b/tests/misc.rs index 4fba750359..293cddb322 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -13,6 +13,7 @@ use regex::Regex; mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); mat!(one_literal_edge, r"abc", r"xxxxxab", None); +matiter!(terminates, r"a$", r"a", (0, 1)); #[test] fn eq() { From 9062f38eff7b8030bc5dba7aa03bf7bb144c82b2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 16:52:58 -0400 Subject: [PATCH 2/6] Disallow empty character class ranges. The compiler in particular assumes that it never gets an empty character class. The current parser is pretty paranoid about rejecting empty classes, but a few tricky cases made it through. In particular, one can write `[^\d\D]` to correspond to "match nothing." This commit now looks for empty classes explicitly, and if one is found, returns an error. Interestingly, other regex engines allow this particular idiosyncrasy and interpret it as "never match." 
Even more interesting, expressions like `a{0}` are also allowed (including by this regex library) and are interpreted as "always match the empty string." Both seem semantically the same. In any case, we forbid empty character classes, primarily because that seems like the sensible thing to do but secondarily because it's the conservative choice. It seems plausible that such a construct could be occasionally useful if one were machine generating regexes, because it could be used to indicate "never match." If we do want to support that use case, we'll need to add a new opcode to the regex matching engines. One can still achieve that today using something like `(a|[^a])`. Fixes #257, where using such a form caused an assert to trip in the compiler. A new, more explicit assert has been added. --- regex-syntax/src/lib.rs | 6 ++++++ regex-syntax/src/parser.rs | 26 +++++++++++++++----------- src/compile.rs | 1 + 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 3aaab9a37c..ee96045f4a 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -1336,6 +1336,9 @@ pub enum ErrorKind { /// This never returned if the parser is permitted to allow expressions /// that match arbitrary bytes. InvalidUtf8, + /// A character class was constructed such that it is empty. + /// e.g., `[^\d\D]`. + EmptyClass, /// Hints that destructuring should not be exhaustive. 
/// /// This enum may grow additional variants, so this makes sure clients @@ -1398,6 +1401,7 @@ impl ErrorKind { FlagNotAllowed(_) => "flag not allowed", UnicodeNotAllowed => "Unicode features not allowed", InvalidUtf8 => "matching arbitrary bytes is not allowed", + EmptyClass => "empty character class", __Nonexhaustive => unreachable!(), } } @@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind { (u) flag is not set."), InvalidUtf8 => write!(f, "Matching arbitrary bytes is not allowed."), + EmptyClass => + write!(f, "Empty character classes are not allowed."), __Nonexhaustive => unreachable!(), } } diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 29bc483062..a4d04c0782 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -587,6 +587,9 @@ impl Parser { } } class = self.class_transform(negated, class).canonicalize(); + if class.is_empty() { + return Err(self.err(ErrorKind::EmptyClass)); + } Ok(Build::Expr(if self.flags.unicode { Expr::Class(class) } else { @@ -1277,7 +1280,7 @@ mod tests { ErrorKind, }; use unicode::regex::{PERLD, PERLS, PERLW}; - use super::{LOWER, UPPER, Flags, Parser, ascii_class}; + use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class}; static YI: &'static [(char, char)] = &[ ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'), @@ -2127,10 +2130,10 @@ mod tests { #[test] fn class_multiple_class_negate_negate() { - let nperld = class(PERLD).negate(); + let nperlw = class(PERLW).negate(); let nyi = class(YI).negate(); - let cls = CharClass::empty().merge(nperld).merge(nyi); - assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate())); + let cls = CharClass::empty().merge(nperlw).merge(nyi); + assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate())); } #[test] @@ -2149,10 +2152,10 @@ mod tests { #[test] fn class_multiple_class_negate_negate_casei() { - let nperld = class(PERLD).negate(); + let nperlw = class(PERLW).negate(); let nyi = class(YI).negate(); - let class = 
CharClass::empty().merge(nperld).merge(nyi); - assert_eq!(p(r"(?i)[^\D\P{Yi}]"), + let class = CharClass::empty().merge(nperlw).merge(nyi); + assert_eq!(p(r"(?i)[^\W\P{Yi}]"), Expr::Class(class.case_fold().negate())); } @@ -2236,10 +2239,10 @@ mod tests { #[test] fn ascii_classes_negate_multiple() { - let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate()); - let cls = CharClass::empty().merge(nlower).merge(nupper); - assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone())); - assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate())); + let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate()); + let cls = CharClass::empty().merge(nlower).merge(nword); + assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone())); + assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate())); } #[test] @@ -2725,6 +2728,7 @@ mod tests { fn error_class_empty_range() { test_err!("[]", 2, ErrorKind::UnexpectedClassEof); test_err!("[^]", 3, ErrorKind::UnexpectedClassEof); + test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass); } #[test] diff --git a/src/compile.rs b/src/compile.rs index 32df4b3547..9db743f489 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -372,6 +372,7 @@ impl Compiler { } fn c_class(&mut self, ranges: &[ClassRange]) -> Result { + assert!(!ranges.is_empty()); if self.compiled.uses_bytes() { CompileClass { c: self, From e55c7ed19a420fd73cc4feb46529ec6a75f08d76 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 16:59:59 -0400 Subject: [PATCH 3/6] Explicitly state that flags aren't incorporated into the pattern string. Fixes #246. --- src/re_builder.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/re_builder.rs b/src/re_builder.rs index 62f3d2daeb..ca030b3ef5 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -67,6 +67,10 @@ impl RegexBuilder { } /// Consume the builder and compile the regular expression. 
+ /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. pub fn compile(self) -> Result<Regex, Error> { ExecBuilder::new_options(self.0) .only_utf8($only_utf8) From f07b83d7c6cb9a940f90fed279f6d08f7eae6e83 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 17:04:58 -0400 Subject: [PATCH 4/6] Add a regression test. The bug shown in #251 has the same underlying cause as the bug in #255, which has been fixed in a previous commit. This commit just adds a more specific regression test for #251. Fixes #251. --- tests/api.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/api.rs b/tests/api.rs index e6c3a27a8d..0be032949a 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -239,3 +239,4 @@ split!(split1, r"\s+", "a b\nc\td\n\t e", &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]); split!(split2, r"\b", "a b c", &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")]); +split!(split3, r"a$", "a", &[t!("")]); From 81297f09cf2df45ee36b7594ac4af91ad22be49f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 19:07:24 -0400 Subject: [PATCH 5/6] Disallow Unicode literals in character classes when Unicode is disabled. When Unicode mode is disabled, we also disable the use of Unicode literals in the regular expression, since it can lead to unintuitive behavior. In this case, Unicode literals in character classes were not disallowed, and subsequent code filtered them out, which resulted in an empty character class. The compiler assumes that empty character classes are not allowed, and so this causes an assert to trigger. Fixes #250. 
--- regex-syntax/src/parser.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index a4d04c0782..0ba0558be4 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -610,6 +610,11 @@ impl Parser { fn parse_class_range(&mut self, class: &mut CharClass, start: char) -> Result<()> { if !self.bump_if('-') { + // Make sure we haven't parsed Unicode literals when we shouldn't have. + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(start)); + } + // Not a range, so just push a singleton range. class.ranges.push(ClassRange::one(start)); return Ok(()); @@ -651,6 +656,11 @@ impl Parser { end: end, })); } + // Make sure we haven't parsed Unicode literals when we shouldn't have. + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(start)); + let _ = try!(self.codepoint_to_one_byte(end)); + } class.ranges.push(ClassRange::new(start, end)); Ok(()) } @@ -2405,6 +2415,13 @@ mod tests { test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags); } + #[test] + fn unicode_class_literal_not_allowed() { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags); + } + #[test] fn unicode_hex_not_allowed() { let flags = Flags { allow_bytes: true, .. Flags::default() }; From 84a2bf5d73e52e961f41e312e449dd251f86e34d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 9 Jul 2016 22:20:59 -0400 Subject: [PATCH 6/6] Match (?-u:\B) correctly in the NFA engines when valid UTF-8 is required. This commit fixes a bug where matching (?-u:\B) (that is, "not an ASCII word boundary") in the NFA engines could produce match positions at invalid UTF-8 sequence boundaries. 
The specific problem is that determining whether (?-u:\B) matches or not relies on knowing whether we must report matches only at UTF-8 boundaries, and this wasn't actually being taken into account. (Instead, we prefer to enforce this invariant in the compiler, so that the matching engines mostly don't have to care about it.) But of course, the zero-width assertions are kind of a special case all around, so we need to handle ASCII word boundaries differently depending on whether we require valid UTF-8. This bug was noticed because the DFA actually handles this correctly (by encoding ASCII word boundaries into the state machine itself, which in turn guarantees the valid UTF-8 invariant) while the NFAs don't, leading to an inconsistency. Fix #241. --- regex-syntax/src/parser.rs | 27 ++++---- src/backtrack.rs | 4 +- src/dfa.rs | 2 +- src/exec.rs | 4 +- src/input.rs | 115 ++++++++++++++++++++++++++++++--- src/pikevm.rs | 4 +- src/prog.rs | 18 ------ tests/word_boundary_ascii.rs | 1 + tests/word_boundary_unicode.rs | 1 + 9 files changed, 128 insertions(+), 48 deletions(-) diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 0ba0558be4..1d10572f6d 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -581,6 +581,9 @@ impl Parser { _ => unreachable!(), }, start => { + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(start)); + } self.bump(); try!(self.parse_class_range(&mut class, start)); } @@ -610,11 +613,6 @@ impl Parser { fn parse_class_range(&mut self, class: &mut CharClass, start: char) -> Result<()> { if !self.bump_if('-') { - // Make sure we haven't parsed Unicode literals when we shouldn't have. - if !self.flags.unicode { - let _ = try!(self.codepoint_to_one_byte(start)); - } - // Not a range, so just push a singleton range. class.ranges.push(ClassRange::one(start)); return Ok(()); @@ -647,7 +645,13 @@ impl Parser { // Because `parse_escape` can never return `LeftParen`. 
_ => unreachable!(), }, - _ => self.bump(), + _ => { + let c = self.bump(); + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(c)); + } + c + } }; if end < start { // e.g., [z-a] @@ -656,11 +660,6 @@ impl Parser { end: end, })); } - // Make sure we haven't parsed Unicode literals when we shouldn't have. - if !self.flags.unicode { - let _ = try!(self.codepoint_to_one_byte(start)); - let _ = try!(self.codepoint_to_one_byte(end)); - } class.ranges.push(ClassRange::new(start, end)); Ok(()) } @@ -2015,6 +2014,8 @@ mod tests { assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')]))); assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)]))); + assert_eq!(pb(r"(?-u)[\xFF]"), + Expr::ClassBytes(bclass(&[(0xFF, 0xFF)]))); assert_eq!(pb("(?-u)[\n]"), Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); assert_eq!(pb(r"(?-u)[\n]"), @@ -2418,8 +2419,8 @@ mod tests { #[test] fn unicode_class_literal_not_allowed() { let flags = Flags { allow_bytes: true, .. Flags::default() }; - test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags); - test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags); } #[test] diff --git a/src/backtrack.rs b/src/backtrack.rs index b0e0e02035..3c06254c6b 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { ip = inst.goto1; } EmptyLook(ref inst) => { - let prev = self.input.previous_char(at); - let next = self.input.next_char(at); - if inst.matches(prev, next) { + if self.input.is_empty_match(at, inst) { ip = inst.goto; } else { return false; diff --git a/src/dfa.rs b/src/dfa.rs index d216f2cbae..0216f25620 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1847,7 +1847,7 @@ mod tests { expected == got && state.flags() == StateFlags(flags) } QuickCheck::new() - 
.gen(StdGen::new(self::rand::thread_rng(), 70_000)) + .gen(StdGen::new(self::rand::thread_rng(), 10_000)) .quickcheck(p as fn(Vec<u32>, u8) -> bool); } diff --git a/src/exec.rs b/src/exec.rs index 755342497b..65a3935a72 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -921,7 +921,7 @@ impl<'c> ExecNoSync<'c> { matches, slots, quit_after_match, - ByteInput::new(text), + ByteInput::new(text, self.ro.nfa.only_utf8), start) } else { pikevm::Fsm::exec( @@ -949,7 +949,7 @@ impl<'c> ExecNoSync<'c> { &self.cache, matches, slots, - ByteInput::new(text), + ByteInput::new(text, self.ro.nfa.only_utf8), start) } else { backtrack::Bounded::exec( diff --git a/src/input.rs b/src/input.rs index f96a6be075..a8547d5902 100644 --- a/src/input.rs +++ b/src/input.rs @@ -16,8 +16,9 @@ use std::u32; use syntax; -use utf8::{decode_utf8, decode_last_utf8}; use literals::LiteralSearcher; +use prog::InstEmptyLook; +use utf8::{decode_utf8, decode_last_utf8}; /// Represents a location in the input. #[derive(Clone, Copy, Debug)] @@ -83,6 +84,10 @@ pub trait Input { /// If no such character could be decoded, then `Char` is absent. fn previous_char(&self, at: InputAt) -> Char; + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + /// Scan the input for a matching prefix. 
fn prefix_at( &self, @@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T { fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + c.is_none() || c == '\n' + } + EndLine => { + let c = self.next_char(at); + c.is_none() || c == '\n' + } + StartText => self.previous_char(at).is_none(), + EndText => self.next_char(at).is_none(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -178,12 +219,18 @@ impl<'t> Input for CharInput<'t> { /// easy access to necessary Unicode decoding (used for word boundary look /// ahead/look behind). #[derive(Clone, Copy, Debug)] -pub struct ByteInput<'t>(&'t [u8]); +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} impl<'t> ByteInput<'t> { /// Return a new byte-based input reader for the given string. 
- pub fn new(s: &'t [u8]) -> ByteInput<'t> { - ByteInput(s) + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { + text: text, + only_utf8: only_utf8, + } } } @@ -191,7 +238,7 @@ impl<'t> ops::Deref for ByteInput<'t> { type Target = [u8]; fn deref(&self) -> &[u8] { - self.0 + self.text } } @@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + c.is_none() || c == '\n' + } + EndLine => { + let c = self.next_char(at); + c.is_none() || c == '\n' + } + StartText => self.previous_char(at).is_none(), + EndText => self.next_char(at).is_none(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. 
+ if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> { } fn len(&self) -> usize { - self.0.len() + self.text.len() } fn as_bytes(&self) -> &[u8] { - self.0 + &self.text } } @@ -276,7 +375,7 @@ impl Char { pub fn is_word_byte(self) -> bool { match char::from_u32(self.0) { None => false, - Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8), + Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), Some(_) => false, } } diff --git a/src/pikevm.rs b/src/pikevm.rs index a18011bab0..b96f0e7588 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> { nlist.set.insert(ip); match self.prog[ip] { EmptyLook(ref inst) => { - let prev = self.input.previous_char(at); - let next = self.input.next_char(at); - if inst.matches(prev, next) { + if self.input.is_empty_match(at, inst) { ip = inst.goto; } } diff --git a/src/prog.rs b/src/prog.rs index cad12fb133..36f2aff879 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -351,24 +351,6 @@ pub enum EmptyLook { NotWordBoundaryAscii, } -impl InstEmptyLook { - /// Tests whether the pair of characters matches this zero-width - /// instruction. - pub fn matches(&self, c1: Char, c2: Char) -> bool { - use self::EmptyLook::*; - match self.look { - StartLine => c1.is_none() || c1 == '\n', - EndLine => c2.is_none() || c2 == '\n', - StartText => c1.is_none(), - EndText => c2.is_none(), - WordBoundary => c1.is_word_char() != c2.is_word_char(), - NotWordBoundary => c1.is_word_char() == c2.is_word_char(), - WordBoundaryAscii => c1.is_word_byte() != c2.is_word_byte(), - NotWordBoundaryAscii => c1.is_word_byte() == c2.is_word_byte(), - } - } -} - /// Representation of the Char instruction. 
#[derive(Clone, Debug)] pub struct InstChar { diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs index c127e8aa28..9beb7c0cb1 100644 --- a/tests/word_boundary_ascii.rs +++ b/tests/word_boundary_ascii.rs @@ -2,6 +2,7 @@ // For Unicode word boundaries, the tests are precisely inverted. matiter!(ascii1, r"\bx\b", "áxβ", (2, 3)); matiter!(ascii2, r"\Bx\B", "áxβ"); +matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); // We can still get Unicode mode in byte regexes. matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ"); diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs index 42bcba51b4..43612a91ac 100644 --- a/tests/word_boundary_unicode.rs +++ b/tests/word_boundary_unicode.rs @@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));