Skip to content

Commit

Permalink
fix(codegen/regex): allow vec growth on parse
Browse files Browse the repository at this point in the history
  • Loading branch information
LeoDog896 committed Jul 24, 2024
1 parent a4d2846 commit eae28a6
Showing 1 changed file with 26 additions and 20 deletions.
46 changes: 26 additions & 20 deletions logos-codegen/src/graph/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use regex_syntax::utf8::Utf8Sequences;
use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
use crate::mir::{Class, ClassUnicode, Literal, Mir};

use super::rope;

impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
self.parse_mir(&mir, then, None, None, false)
Expand Down Expand Up @@ -70,38 +72,30 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
}
Mir::Concat(concat) => {
// We'll be writing from the back, so need to allocate enough
// space here. Worst case scenario is all unicode codepoints
// producing 4 byte utf8 sequences
let mut ropebuf = vec![Range::from(0); concat.len() * 4];
let mut cur = ropebuf.len();
let mut end = ropebuf.len();
// Take an initial guess at the capacity - estimates a little worse than an average case
// scenario by assuming every concat element is singular but has a full code-point unicode literal.
let mut ropebuf: Vec<Range> = Vec::with_capacity(concat.len() * 4);
let mut then = then;

let mut handle_bytes = |graph: &mut Self, mir: &Mir, then: &mut NodeId| match mir {
Mir::Literal(Literal(bytes)) => {
cur -= bytes.len();
for (i, byte) in bytes.iter().enumerate() {
ropebuf[cur + i] = byte.into();
}
ropebuf.extend(bytes.iter().rev().map(|byte| Into::<Range>::into(byte)));
true
}
Mir::Class(Class::Unicode(class)) if is_one_ascii(class, repeated) => {
cur -= 1;
ropebuf[cur] = class.ranges()[0].into();
ropebuf.push(class.ranges()[0].into());
true
}
Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => {
cur -= 1;
ropebuf[cur] = class.ranges()[0].into();
ropebuf.push(class.ranges()[0].into());
true
}
_ => {
if end > cur {
let rope = Rope::new(&ropebuf[cur..end], *then);
if !ropebuf.is_empty() {
let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), *then);

*then = graph.push(rope);
end = cur;
ropebuf = Vec::with_capacity(concat.len() * 4);
}
false
}
Expand All @@ -115,7 +109,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {

let first_mir = &concat[0];
if handle_bytes(self, first_mir, &mut then) {
let rope = Rope::new(&ropebuf[cur..end], then).miss(miss);
let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), then).miss(miss);
self.insert_or_push(reserved, rope)
} else {
self.parse_mir(first_mir, then, miss, reserved, false)
Expand Down Expand Up @@ -171,7 +165,7 @@ impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
}
}

/// Return wether current class unicode is ascii.
/// Return whether current class unicode is ascii.
///
/// Because unicode ranges are iterated in increasing order,
/// it is only necessary to check the last range.
Expand All @@ -186,7 +180,7 @@ fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool {
})
}

/// Return wether current class unicode is ascii and only contains
/// Return whether current class unicode is ascii and only contains
/// one range.
///
/// See [`is_ascii`] function for more details.
Expand Down Expand Up @@ -238,6 +232,18 @@ mod tests {
);
}

/// Regression test for a long literal concatenation followed by a
/// repetition (the name suggests issue #389 — TODO confirm).
///
/// Nothing is asserted about the resulting graph: the test passes as long
/// as the priority matches and `Graph::regex` returns without panicking.
#[test]
fn long_concat_389() {
    let mut graph = Graph::new();

    let mir = Mir::utf8("abcdefghijklmn*").unwrap();

    assert_eq!(mir.priority(), 26);

    let leaf = graph.push(Node::Leaf("LEAF"));

    // The returned node id is irrelevant here; discard it explicitly so the
    // compiler does not report an unused variable.
    let _ = graph.regex(mir, leaf);
}

#[test]
fn repeat() {
let mut graph = Graph::new();
Expand Down

0 comments on commit eae28a6

Please sign in to comment.