diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index d0804d42bf..eabd5ecc2f 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -59,11 +59,23 @@ pub use ffi::tcl::Regex;
 // Due to macro scoping rules, this definition only applies for the modules
 // defined below. Effectively, it allows us to use the same tests for both
 // native and dynamic regexes.
+#[cfg(not(feature = "re-rust-bytes"))]
 #[cfg(not(feature = "re-rust-plugin"))]
 macro_rules! regex {
     ($re:expr) => { ::Regex::new($re).unwrap() }
 }
 
+#[cfg(feature = "re-rust-bytes")]
+#[cfg(not(feature = "re-rust-plugin"))]
+macro_rules! regex {
+    ($re:expr) => {{
+        // Always enable the Unicode flag for byte based regexes.
+        // Really, this should have been enabled by default. *sigh*
+        use regex::bytes::RegexBuilder;
+        RegexBuilder::new($re).unicode(true).compile().unwrap()
+    }}
+}
+
 // Usage: text!(haystack)
 //
 // Builds a ::Text from an owned string.
diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs
index 8c704ae07d..a7b741fd63 100644
--- a/bench/src/sherlock.rs
+++ b/bench/src/sherlock.rs
@@ -101,32 +101,17 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);
 // How fast can we match every letter? This also defeats any clever prefix
 // tricks.
 #[cfg(not(feature = "re-tcl"))]
-#[cfg(not(feature = "re-rust-bytes"))]
 sherlock!(letters, r"\p{L}", 447160);
-#[cfg(not(feature = "re-tcl"))]
-#[cfg(feature = "re-rust-bytes")]
-sherlock!(letters, r"(?u)\p{L}", 447160);
 
 #[cfg(not(feature = "re-tcl"))]
-#[cfg(not(feature = "re-rust-bytes"))]
 sherlock!(letters_upper, r"\p{Lu}", 14180);
-#[cfg(not(feature = "re-tcl"))]
-#[cfg(feature = "re-rust-bytes")]
-sherlock!(letters_upper, r"(?u)\p{Lu}", 14180);
 
 #[cfg(not(feature = "re-tcl"))]
-#[cfg(not(feature = "re-rust-bytes"))]
 sherlock!(letters_lower, r"\p{Ll}", 432980);
-#[cfg(not(feature = "re-tcl"))]
-#[cfg(feature = "re-rust-bytes")]
-sherlock!(letters_lower, r"(?u)\p{Ll}", 432980);
 
 // Similarly, for words.
-#[cfg(not(feature = "re-rust-bytes"))]
 #[cfg(not(feature = "re-re2"))]
 sherlock!(words, r"\w+", 109214);
-#[cfg(feature = "re-rust-bytes")]
-sherlock!(words, r"(?u)\w+", 109214);
 #[cfg(feature = "re-re2")]
 sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
 
@@ -195,8 +180,4 @@ sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);
 //
 // Onig does surprisingly well on this benchmark and yet does quite poorly on
 // the ing_suffix benchmark. That one has me stumped.
-//
-// Interestingly, this is slower in the rust-bytes benchmark, presumably
-// because scanning for one of the bytes in the Unicode *unaware* `\s` ends
-// up being slower than avoiding the prefix scan at all.
 sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index 03a56e55dc..65604d764e 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -114,13 +114,7 @@ impl Parser {
                 '{' => try!(self.parse_counted_repeat()),
                 '[' => match self.maybe_parse_ascii() {
                     None => try!(self.parse_class()),
-                    Some(cls) => {
-                        Build::Expr(if self.flags.unicode {
-                            Expr::Class(cls)
-                        } else {
-                            Expr::ClassBytes(cls.to_byte_class())
-                        })
-                    }
+                    Some(cls) => Build::Expr(Expr::Class(cls)),
                 },
                 '^' => {
                     if self.flags.multi {
@@ -224,11 +218,7 @@ impl Parser {
             }
             'd'|'s'|'w'|'D'|'S'|'W' => {
                 self.bump();
-                Ok(Build::Expr(if self.flags.unicode {
-                    Expr::Class(self.parse_perl_class(c))
-                } else {
-                    Expr::ClassBytes(self.parse_perl_class(c).to_byte_class())
-                }))
+                Ok(Build::Expr(Expr::Class(self.parse_perl_class(c))))
             }
             c => Err(self.err(ErrorKind::UnrecognizedEscape(c))),
         }
@@ -1328,16 +1318,28 @@ mod tests {
         ByteClass::new(ranges)
     }
 
-    fn asciid() -> ByteClass {
-        ascii_class("digit").unwrap().to_byte_class()
+    fn asciid() -> CharClass {
+        ascii_class("digit").unwrap()
+    }
+
+    fn asciis() -> CharClass {
+        ascii_class("space").unwrap()
+    }
+
+    fn asciiw() -> CharClass {
+        ascii_class("word").unwrap()
     }
 
-    fn asciis() -> ByteClass {
-        ascii_class("space").unwrap().to_byte_class()
+    fn asciid_bytes() -> ByteClass {
+        asciid().to_byte_class()
     }
 
-    fn asciiw() -> ByteClass {
-        ascii_class("word").unwrap().to_byte_class()
+    fn asciis_bytes() -> ByteClass {
+        asciis().to_byte_class()
+    }
+
+    fn asciiw_bytes() -> ByteClass {
+        asciiw().to_byte_class()
     }
 
     #[test]
@@ -1905,55 +1907,55 @@ mod tests {
     #[test]
     fn escape_perl_d() {
         assert_eq!(p(r"\d"), Expr::Class(class(PERLD)));
-        assert_eq!(pb(r"(?-u)\d"), Expr::ClassBytes(asciid()));
+        assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid()));
     }
 
     #[test]
     fn escape_perl_s() {
         assert_eq!(p(r"\s"), Expr::Class(class(PERLS)));
-        assert_eq!(pb(r"(?-u)\s"), Expr::ClassBytes(asciis()));
+        assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis()));
     }
 
     #[test]
     fn escape_perl_w() {
         assert_eq!(p(r"\w"), Expr::Class(class(PERLW)));
-        assert_eq!(pb(r"(?-u)\w"), Expr::ClassBytes(asciiw()));
+        assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw()));
     }
 
     #[test]
     fn escape_perl_d_negate() {
         assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate()));
-        assert_eq!(pb(r"(?-u)\D"), Expr::ClassBytes(asciid().negate()));
+        assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate()));
     }
 
     #[test]
     fn escape_perl_s_negate() {
         assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate()));
-        assert_eq!(pb(r"(?-u)\S"), Expr::ClassBytes(asciis().negate()));
+        assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate()));
     }
 
     #[test]
     fn escape_perl_w_negate() {
         assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate()));
-        assert_eq!(pb(r"(?-u)\W"), Expr::ClassBytes(asciiw().negate()));
+        assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate()));
     }
 
     #[test]
     fn escape_perl_d_case_fold() {
         assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold()));
-        assert_eq!(pb(r"(?i-u)\d"), Expr::ClassBytes(asciid().case_fold()));
+        assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold()));
     }
 
     #[test]
     fn escape_perl_s_case_fold() {
         assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold()));
-        assert_eq!(pb(r"(?i-u)\s"), Expr::ClassBytes(asciis().case_fold()));
+        assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold()));
     }
 
     #[test]
     fn escape_perl_w_case_fold() {
         assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold()));
-        assert_eq!(pb(r"(?i-u)\w"), Expr::ClassBytes(asciiw().case_fold()));
+        assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold()));
     }
 
     #[test]
@@ -1961,7 +1963,7 @@ mod tests {
         assert_eq!(p(r"(?i)\D"),
                    Expr::Class(class(PERLD).case_fold().negate()));
         let bytes = asciid().case_fold().negate();
-        assert_eq!(pb(r"(?i-u)\D"), Expr::ClassBytes(bytes));
+        assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes));
     }
 
     #[test]
@@ -1969,7 +1971,7 @@ mod tests {
         assert_eq!(p(r"(?i)\S"),
                    Expr::Class(class(PERLS).case_fold().negate()));
         let bytes = asciis().case_fold().negate();
-        assert_eq!(pb(r"(?i-u)\S"), Expr::ClassBytes(bytes));
+        assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes));
     }
 
     #[test]
@@ -1977,7 +1979,7 @@ mod tests {
         assert_eq!(p(r"(?i)\W"),
                    Expr::Class(class(PERLW).case_fold().negate()));
         let bytes = asciiw().case_fold().negate();
-        assert_eq!(pb(r"(?i-u)\W"), Expr::ClassBytes(bytes));
+        assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes));
     }
 
     #[test]
@@ -2039,11 +2041,11 @@ mod tests {
         assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate()));
         assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate()));
 
-        let bytes = asciid().negate();
+        let bytes = asciid_bytes().negate();
         assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes));
-        let bytes = asciiw().negate();
+        let bytes = asciiw_bytes().negate();
         assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes));
-        let bytes = asciis().negate();
+        let bytes = asciis_bytes().negate();
         assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes));
     }
 
@@ -2053,9 +2055,9 @@ mod tests {
         assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW)));
         assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS)));
 
-        assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid()));
-        assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw()));
-        assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis()));
+        assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes()));
+        assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes()));
+        assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes()));
     }
 
     #[test]
@@ -2063,7 +2065,8 @@ mod tests {
         assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold()));
         assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold()));
 
-        assert_eq!(pb(r"(?i-u)[\d]"), Expr::ClassBytes(asciid().case_fold()));
+        assert_eq!(pb(r"(?i-u)[\d]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold()));
     }
 
     #[test]
@@ -2075,11 +2078,11 @@ mod tests {
         assert_eq!(p(r"(?i)[^\s]"),
                    Expr::Class(class(PERLS).case_fold().negate()));
 
-        let bytes = asciid().case_fold().negate();
+        let bytes = asciid_bytes().case_fold().negate();
         assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes));
-        let bytes = asciiw().case_fold().negate();
+        let bytes = asciiw_bytes().case_fold().negate();
         assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes));
-        let bytes = asciis().case_fold().negate();
+        let bytes = asciis_bytes().case_fold().negate();
         assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes));
     }
 
@@ -2089,9 +2092,12 @@ mod tests {
         assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold()));
         assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold()));
 
-        assert_eq!(pb(r"(?i-u)[^\D]"), Expr::ClassBytes(asciid().case_fold()));
-        assert_eq!(pb(r"(?i-u)[^\W]"), Expr::ClassBytes(asciiw().case_fold()));
-        assert_eq!(pb(r"(?i-u)[^\S]"), Expr::ClassBytes(asciis().case_fold()));
+        assert_eq!(pb(r"(?i-u)[^\D]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold()));
+        assert_eq!(pb(r"(?i-u)[^\W]"),
+                   Expr::ClassBytes(asciiw_bytes().case_fold()));
+        assert_eq!(pb(r"(?i-u)[^\S]"),
+                   Expr::ClassBytes(asciis_bytes().case_fold()));
     }
 
     #[test]
@@ -2184,8 +2190,7 @@ mod tests {
         assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER)));
         assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER)));
 
-        assert_eq!(pb("(?-u)[:upper:]"),
-                   Expr::ClassBytes(class(UPPER).to_byte_class()));
+        assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER)));
         assert_eq!(pb("(?-u)[[:upper:]]"),
                    Expr::ClassBytes(class(UPPER).to_byte_class()));
     }
@@ -2233,7 +2238,7 @@ mod tests {
                    Expr::Class(class(UPPER).case_fold()));
 
         assert_eq!(pb("(?i-u)[:upper:]"),
-                   Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
+                   Expr::Class(class(UPPER).case_fold()));
         assert_eq!(pb("(?i-u)[[:upper:]]"),
                    Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
     }
diff --git a/src/dfa.rs b/src/dfa.rs
index fbd7d81ac2..56b1bd880d 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -101,10 +101,10 @@ pub struct Cache {
     qnext: SparseSet,
 }
 
-#[derive(Clone, Debug)]
 /// CacheInner is logically just a part of Cache, but groups together fields
 /// that aren't passed as function parameters throughout search. (This split
 /// is mostly an artifact of the borrow checker. It is happily paid.)
+#[derive(Clone, Debug)]
 struct CacheInner {
     /// A cache of pre-compiled DFA states, keyed by the set of NFA states
     /// and the set of empty-width flags set at the byte in the input when the
@@ -114,7 +114,7 @@ struct CacheInner {
     /// things, we just pass indexes around manually. The performance impact of
     /// this is probably an instruction or two in the inner loop. However, on
     /// 64 bit, each StatePtr is half the size of a *State.
-    compiled: HashMap<StateKey, StatePtr>,
+    compiled: HashMap<State, StatePtr>,
     /// The transition table.
     ///
     /// The transition table is laid out in row-major order, where states are
@@ -230,7 +230,7 @@ impl<T> Result<T> {
 ///
 /// States don't carry their transitions. Instead, transitions are stored in
 /// a single row-major table.
-#[derive(Clone)]
+#[derive(Clone, Eq, Hash, PartialEq)]
 struct State {
     /// The set of NFA states in this DFA state, which are computed by
     /// following epsilon transitions from `insts[0]`. Note that not all
@@ -244,20 +244,6 @@ struct State {
     flags: StateFlags,
 }
 
-/// A state's key for identifying it in the cache. In particular, if two
-/// state's cache keys are equivalent, then they cannot be discriminatory in
-/// a match.
-///
-/// We capture two bits of information: an ordered set of NFA states for the
-/// DFA state and its flags.
-#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-struct StateKey {
-    /// An ordered set of NFA states.
-    insts: Box<[InstPtr]>,
-    /// The state's flags. (Match, word, empty assertions.)
-    flags: StateFlags,
-}
-
 /// InstPtr is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
 /// an NFA state).
 ///
@@ -1090,8 +1076,7 @@ impl<'a> Fsm<'a> {
             }
         }
         // Allocate room for our state and add it.
-        let state = State { insts: key.insts.clone(), flags: state_flags };
-        self.add_state(key, state)
+        self.add_state(key)
     }
 
     /// Produces a key suitable for describing a state in the DFA cache.
@@ -1108,7 +1093,7 @@ impl<'a> Fsm<'a> {
         &mut self,
         q: &SparseSet,
         state_flags: &mut StateFlags,
-    ) -> Option<StateKey> {
+    ) -> Option<State> {
         use prog::Inst::*;
         use prog::EmptyLook::*;
 
@@ -1171,7 +1156,7 @@ impl<'a> Fsm<'a> {
         if insts.len() == 0 && !state_flags.is_match() {
             None
         } else {
-            Some(StateKey {
+            Some(State {
                 insts: insts.into_boxed_slice(),
                 flags: *state_flags,
             })
@@ -1256,16 +1241,12 @@ impl<'a> Fsm<'a> {
     /// Restores the given state back into the cache, and returns a pointer
     /// to it.
     fn restore_state(&mut self, state: State) -> Option<StatePtr> {
-        let key = StateKey {
-            insts: state.insts.clone(),
-            flags: state.flags,
-        };
         // If we've already stored this state, just return a pointer to it.
         // None will be the wiser.
-        if let Some(&si) = self.cache.compiled.get(&key) {
+        if let Some(&si) = self.cache.compiled.get(&state) {
             return Some(si);
         }
-        self.add_state(key, state)
+        self.add_state(state)
     }
 
     /// Returns the next state given the current state si and current byte
@@ -1395,7 +1376,7 @@ impl<'a> Fsm<'a> {
     ///
     /// If None is returned, then the state limit was reached and the DFA
     /// should quit.
-    fn add_state(&mut self, key: StateKey, state: State) -> Option<StatePtr> {
+    fn add_state(&mut self, state: State) -> Option<StatePtr> {
         // This will fail if the next state pointer exceeds STATE_PTR. In
         // practice, the cache limit will prevent us from ever getting here,
         // but maybe callers will set the cache size to something ridiculous...
@@ -1415,8 +1396,8 @@ impl<'a> Fsm<'a> {
         }
         // Finally, put our actual state on to our heap of states and index it
         // so we can find it later.
-        self.cache.states.push(state);
-        self.cache.compiled.insert(key, si);
+        self.cache.states.push(state.clone());
+        self.cache.compiled.insert(state, si);
         // Transition table and set of states and map should all be in sync.
         debug_assert!(self.cache.states.len()
                       == self.cache.trans.num_states());
@@ -1505,7 +1486,7 @@ impl<'a> Fsm<'a> {
         // Estimate that there are about 16 instructions per state consuming
         // 64 = 16 * 4 bytes of space.
         let compiled =
-            (self.cache.compiled.len() * (size::<StateKey>() + 64))
+            (self.cache.compiled.len() * (size::<State>() + 64))
             + (self.cache.compiled.len() * size::<StatePtr>());
         let states =
             self.cache.states.len()
@@ -1522,7 +1503,7 @@ impl<'a> Fsm<'a> {
     fn actual_size(&self) -> usize {
         let mut compiled = 0;
         for k in self.cache.compiled.keys() {
-            compiled += mem::size_of::<StateKey>();
+            compiled += mem::size_of::<State>();
             compiled += mem::size_of::<StatePtr>();
             compiled += k.insts.len() * mem::size_of::<InstPtr>();
         }
diff --git a/src/exec.rs b/src/exec.rs
index 11f4fde621..37228e005b 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -105,6 +105,7 @@ struct Parsed {
     exprs: Vec<Expr>,
     prefixes: Literals,
     suffixes: Literals,
+    bytes: bool,
 }
 
 impl ExecBuilder {
@@ -208,6 +209,7 @@ impl ExecBuilder {
         let mut exprs = Vec::with_capacity(self.options.pats.len());
         let mut prefixes = Some(Literals::empty());
         let mut suffixes = Some(Literals::empty());
+        let mut bytes = false;
         for pat in &self.options.pats {
             let parser =
                 ExprBuilder::new()
@@ -219,6 +221,7 @@ impl ExecBuilder {
                     .unicode(self.options.unicode)
                     .allow_bytes(!self.only_utf8);
             let expr = try!(parser.parse(pat));
+            bytes = bytes || expr.has_bytes();
             prefixes = prefixes.and_then(|mut prefixes| {
                 if !prefixes.union_prefixes(&expr) {
                     None
@@ -239,6 +242,7 @@ impl ExecBuilder {
             exprs: exprs,
             prefixes: prefixes.unwrap_or(Literals::empty()),
             suffixes: suffixes.unwrap_or(Literals::empty()),
+            bytes: bytes,
         })
     }
 
@@ -261,7 +265,7 @@ impl ExecBuilder {
         let mut nfa = try!(
             Compiler::new()
                      .size_limit(self.options.size_limit)
-                     .bytes(self.bytes)
+                     .bytes(self.bytes || parsed.bytes)
                      .only_utf8(self.only_utf8)
                      .compile(&parsed.exprs));
         let mut dfa = try!(
diff --git a/src/literals.rs b/src/literals.rs
index 934aefa29d..2de5a7e500 100644
--- a/src/literals.rs
+++ b/src/literals.rs
@@ -361,8 +361,8 @@ pub struct SingleSearch {
     /// The second rarest byte is used as a type of guard for quickly detecting
     /// a mismatch after memchr locates an instance of the rarest byte. This
     /// is a hedge against pathological cases where the pre-computed frequency
-    /// analysis may be off. (But of course, does not prevent pathological
-    /// cases.)
+    /// analysis may be off. (But of course, does not prevent *all*
+    /// pathological cases.)
     rare2: u8,
     /// The offset of the second rarest byte in `pat`.
     rare2i: usize,
diff --git a/src/prog.rs b/src/prog.rs
index 165c20a355..44e3228262 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -132,7 +132,7 @@ impl Program {
     /// Returns true if this program uses Byte instructions instead of
     /// Char/Range instructions.
     pub fn uses_bytes(&self) -> bool {
-        self.is_bytes || self.is_dfa || !self.only_utf8
+        self.is_bytes || self.is_dfa
     }
 
     /// Returns true if this program exclusively matches valid UTF-8 bytes.