fix(codegen/regex): allow vec growth on parse

fixes maciejhirsz#389
LeoDog896 · Jul 24, 2024 · eae28a6 · eae28a6
1 parent a4d2846
commit eae28a6
Showing 1 changed file with 26 additions and 20 deletions.
diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs
@@ -5,6 +5,8 @@ use regex_syntax::utf8::Utf8Sequences;
 use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
 use crate::mir::{Class, ClassUnicode, Literal, Mir};
 
+use super::rope;
+
 impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
     pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
         self.parse_mir(&mir, then, None, None, false)
@@ -70,38 +72,30 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
                 self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
             }
             Mir::Concat(concat) => {
-                // We'll be writing from the back, so need to allocate enough
-                // space here. Worst case scenario is all unicode codepoints
-                // producing 4 byte utf8 sequences
-                let mut ropebuf = vec![Range::from(0); concat.len() * 4];
-                let mut cur = ropebuf.len();
-                let mut end = ropebuf.len();
+                // Initial capacity is only a guess: it assumes each concat element contributes one
+                // code point encoded as a 4-byte UTF-8 sequence. Longer input just grows the vec.
+                let mut ropebuf: Vec<Range> = Vec::with_capacity(concat.len() * 4);
                 let mut then = then;
 
                 let mut handle_bytes = |graph: &mut Self, mir: &Mir, then: &mut NodeId| match mir {
                     Mir::Literal(Literal(bytes)) => {
-                        cur -= bytes.len();
-                        for (i, byte) in bytes.iter().enumerate() {
-                            ropebuf[cur + i] = byte.into();
-                        }
+                        ropebuf.extend(bytes.iter().rev().map(|byte| Into::<Range>::into(byte)));
                         true
                     }
                     Mir::Class(Class::Unicode(class)) if is_one_ascii(class, repeated) => {
-                        cur -= 1;
-                        ropebuf[cur] = class.ranges()[0].into();
+                        ropebuf.push(class.ranges()[0].into());
                         true
                     }
                     Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => {
-                        cur -= 1;
-                        ropebuf[cur] = class.ranges()[0].into();
+                        ropebuf.push(class.ranges()[0].into());
                         true
                     }
                     _ => {
-                        if end > cur {
-                            let rope = Rope::new(&ropebuf[cur..end], *then);
+                        if !ropebuf.is_empty() {
+                            let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), *then);
 
                             *then = graph.push(rope);
-                            end = cur;
+                            ropebuf = Vec::with_capacity(concat.len() * 4);
                         }
                         false
                     }
@@ -115,7 +109,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
 
                 let first_mir = &concat[0];
                 if handle_bytes(self, first_mir, &mut then) {
-                    let rope = Rope::new(&ropebuf[cur..end], then).miss(miss);
+                    let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), then).miss(miss);
                     self.insert_or_push(reserved, rope)
                 } else {
                     self.parse_mir(first_mir, then, miss, reserved, false)
@@ -171,7 +165,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
     }
 }
 
-/// Return wether current class unicode is ascii.
+/// Return whether current class unicode is ascii.
 ///
 /// Because unicode ranges are iterated in increasing order,
 /// it is only necessary to check the last range.
@@ -186,7 +180,7 @@ fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool {
     })
 }
 
-/// Return wether current class unicode is ascii and only contains
+/// Return whether current class unicode is ascii and only contains
 /// one range.
 ///
 /// See [`is_ascii`] function for more details.
@@ -238,6 +232,18 @@ mod tests {
         );
     }
 
+    #[test]
+    fn long_concat_389() {
+        let mut graph = Graph::new();
+        // Regression input for maciejhirsz#389: a long literal followed by a repetition.
+        let mir = Mir::utf8("abcdefghijklmn*").unwrap();
+        // Pin the pattern's priority so a change in how it is parsed is noticed.
+        assert_eq!(mir.priority(), 26);
+        // The id returned by `regex` is not inspected; the test passes if the call does not panic.
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+    }
+
     #[test]
     fn repeat() {
         let mut graph = Graph::new();