Replace regex dependency with a hand-written parser

This removes the dependencies on both regex and once_cell, and gives significant performance improvements in the current benchmarks.
jcreekmore · Jan 1, 2022 · 9d900cb · 9d900cb
1 parent e974337
commit 9d900cb
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 35 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,8 +13,6 @@ edition = "2018"
 
 [dependencies]
 base64 = "0.13.0"
-once_cell = "1.4"
-regex = {version = "1", default-features = false, features = ["std"]}
 
 [dev-dependencies]
 criterion = "0.3.0"

diff --git a/src/lib.rs b/src/lib.rs
@@ -104,27 +104,15 @@
 )]
 
 mod errors;
+mod parser;
+use parser::{parse_captures, parse_captures_iter, Captures};
 
 pub use crate::errors::{PemError, Result};
-use once_cell::sync::Lazy;
-use regex::bytes::{Captures, Regex};
 use std::str;
 
-const REGEX_STR: &str =
-    r"(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*";
-
 /// The line length for PEM encoding
 const LINE_WRAP: usize = 64;
 
-
-fn ascii_armor() -> &'static Regex {
-    static ASCII_ARMOR: Lazy<Regex> = Lazy::new(|| {
-        Regex::new(REGEX_STR).unwrap()
-    });
-
-    &ASCII_ARMOR
-}
-
 /// Enum describing line endings
 #[derive(Debug, Clone, Copy)]
 pub enum LineEnding {
@@ -157,21 +145,13 @@ impl Pem {
         }
 
         // Verify that the begin section exists
-        let tag = as_utf8(
-            caps.name("begin")
-                .ok_or_else(|| PemError::MissingBeginTag)?
-                .as_bytes(),
-        )?;
+        let tag = as_utf8(caps.begin)?;
         if tag.is_empty() {
             return Err(PemError::MissingBeginTag);
         }
 
         // as well as the end section
-        let tag_end = as_utf8(
-            caps.name("end")
-                .ok_or_else(|| PemError::MissingEndTag)?
-                .as_bytes(),
-        )?;
+        let tag_end = as_utf8(caps.end)?;
         if tag_end.is_empty() {
             return Err(PemError::MissingEndTag);
         }
@@ -182,11 +162,7 @@ impl Pem {
         }
 
         // If they did, then we can grab the data section
-        let raw_data = as_utf8(
-            caps.name("data")
-                .ok_or_else(|| PemError::MissingData)?
-                .as_bytes(),
-        )?;
+        let raw_data = as_utf8(caps.data)?;
 
         // We need to get rid of newlines for base64::decode
         // As base64 requires an AsRef<[u8]>, this must involve a copy
@@ -247,8 +223,7 @@ impl Pem {
 ///  assert_eq!(pem.tag, "RSA PRIVATE KEY");
 /// ```
 pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> {
-    ascii_armor()
-        .captures(&input.as_ref())
+    parse_captures(&input.as_ref())
         .ok_or_else(|| PemError::MalformedFraming)
         .and_then(Pem::new_from_captures)
 }
@@ -324,8 +299,7 @@ pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> {
 /// ```
 pub fn parse_many<B: AsRef<[u8]>>(input: B) -> Result<Vec<Pem>> {
     // Each time our regex matches a PEM section, we need to decode it.
-    ascii_armor()
-        .captures_iter(&input.as_ref())
+    parse_captures_iter(&input.as_ref())
         .map(|caps| Pem::new_from_captures(caps))
         .collect()
 }

diff --git a/src/parser.rs b/src/parser.rs
@@ -0,0 +1,86 @@
+/// The three byte ranges extracted from one PEM-style section.
+///
+/// Every field borrows from the input buffer that was parsed; nothing is copied.
+/// The derives make the type printable, comparable and trivially copyable, which
+/// is cheap because it only holds three slices.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Captures<'a> {
+    /// Bytes between `-----BEGIN ` and the next `-----`.
+    pub begin: &'a [u8],
+    /// Bytes between the begin line (after whitespace is skipped) and `-----END `.
+    pub data: &'a [u8],
+    /// Bytes between `-----END ` and the next `-----`.
+    pub end: &'a [u8],
+}
+
+/// Parses the first PEM-style section found in `input`.
+///
+/// Returns `None` when `parser_inner` cannot match a complete section. The
+/// unparsed remainder that `parser_inner` reports is discarded.
+pub fn parse_captures<'a>(input: &'a [u8]) -> Option<Captures<'a>> {
+    let (_unparsed, captures) = parser_inner(input)?;
+    Some(captures)
+}
+/// Creates an iterator over the PEM-style sections contained in `input`.
+///
+/// No parsing happens here: the returned `CaptureMatches` only stores the slice
+/// and parses one section per call to `next()`.
+pub fn parse_captures_iter<'a>(input: &'a [u8]) -> CaptureMatches<'a> {
+    CaptureMatches { input }
+}
+
+/// Iterator that yields one `Captures` per PEM-style section of a byte slice.
+pub struct CaptureMatches<'a> {
+    /// Bytes that have not been consumed yet; empty once iteration is finished.
+    input: &'a [u8],
+}
+impl<'a> Iterator for CaptureMatches<'a> {
+    type Item = Captures<'a>;
+
+    /// Parses the next section from the unconsumed input.
+    ///
+    /// Returns `None` when the input is exhausted or when no further section
+    /// can be parsed; in the latter case the remaining bytes are dropped so
+    /// that later calls return `None` immediately.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Take the pending bytes out, leaving an empty slice behind. If parsing
+        // fails below, the iterator is therefore already in its finished state.
+        let pending = std::mem::take(&mut self.input);
+        if pending.is_empty() {
+            return None;
+        }
+        let (rest, captures) = parser_inner(pending)?;
+        self.input = rest;
+        Some(captures)
+    }
+}
+
+/// Matches one PEM-style section and returns the unparsed tail together with
+/// the captured begin tag, data and end tag.
+///
+/// Returns `None` as soon as any of the four markers cannot be found.
+fn parser_inner<'a>(input: &'a [u8]) -> Option<(&'a [u8], Captures<'a>)> {
+    // Should be equivalent to the regex
+    // "(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*"
+    //
+    // (?s)                                      # Enable dotall (. matches all characters incl \n)
+    // -----BEGIN (?P<begin>.*?)-----[ \t\n\r]*  # Parse begin
+    // (?P<data>.*?)                             # Parse data
+    // -----END (?P<end>.*?)-----[ \t\n\r]*      # Parse end
+
+    // Anything in front of the first begin marker is ignored.
+    let (after_begin_marker, _preamble) = read_until(input, b"-----BEGIN ")?;
+    let (after_begin_line, begin) = read_until(after_begin_marker, b"-----")?;
+
+    // Whitespace after the begin line is not part of the data capture.
+    let (after_end_marker, data) = read_until(skip_whitespace(after_begin_line), b"-----END ")?;
+    let (after_end_line, end) = read_until(after_end_marker, b"-----")?;
+
+    // Trailing whitespace is consumed so the next section starts cleanly.
+    let remaining = skip_whitespace(after_end_line);
+    Some((remaining, Captures { begin, data, end }))
+}
+
+// Equivalent to the regex [ \t\n\r]*
+//
+// Returns the sub-slice of `input` that starts at the first byte which is not a
+// space, tab, line feed or carriage return (empty if every byte is one of those).
+fn skip_whitespace(input: &[u8]) -> &[u8] {
+    let leading = input
+        .iter()
+        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
+        .count();
+    &input[leading..]
+}
+// Equivalent to (.*?) followed by a string
+//
+// Finds the first occurrence of `marker` in `input` and returns
+// `(remaining, matched)`, where `matched` is everything before the marker and
+// `remaining` is everything after it. Returns `None` when the marker does not
+// occur. An empty marker matches the whole input and leaves nothing remaining.
+//
+// The search tests every start position. A hand-rolled state machine that
+// resets its progress on a mismatch without re-testing the current byte misses
+// markers that overlap a failed partial match (e.g. six dashes in front of
+// "BEGIN "), which the regex this replaces would still find.
+fn read_until<'a, 'b>(input: &'a [u8], marker: &'b [u8]) -> Option<(&'a [u8], &'a [u8])> {
+    // If there is no end condition, short circuit
+    // (this also avoids `windows(0)`, which panics).
+    if marker.is_empty() {
+        return Some((&[], input));
+    }
+    // `windows` yields nothing when `input` is shorter than `marker`.
+    let start = input
+        .windows(marker.len())
+        .position(|window| window == marker)?;
+    let matched = &input[..start];
+    let remaining = &input[start + marker.len()..];
+    Some((remaining, matched))
+}