From 9be9fda5f9918216d415e5047b2a510bf9631bdd Mon Sep 17 00:00:00 2001 From: Scott Steele Date: Wed, 7 Dec 2016 20:50:44 -0500 Subject: [PATCH] Verify character class still non-empty after converting to byte class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As long as `[^\x00-\xff]` is treated as a full Unicode character class, it is not empty: for instance, `≥` would still be matched. However, once `CharClass::to_byte_class` is called on it (as is done when using `regex::bytes::Regex::new` rather than `regex::Regex::new`), the resulting byte class _is_ empty, since it excludes all possible bytes. This commit adds a test asserting that `regex::bytes::Regex::new` returns `Err` for this case (in accordance with https://github.com/rust-lang-nursery/regex/issues/106) and adds an `is_empty` check to the result of calling `CharClass::to_byte_class`, which allows the test to pass. --- src/parser.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/parser.rs b/src/parser.rs index 1d10572..ed9f33b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -596,7 +596,17 @@ impl Parser { Ok(Build::Expr(if self.flags.unicode { Expr::Class(class) } else { - Expr::ClassBytes(class.to_byte_class()) + let byte_class = class.to_byte_class(); + + // If `class` was only non-empty due to multibyte characters, the + // corresponding byte class will now be empty. + // + // See https://github.com/rust-lang-nursery/regex/issues/303 + if byte_class.is_empty() { + return Err(self.err(ErrorKind::EmptyClass)); + } + + Expr::ClassBytes(byte_class) })) }