From 9d900cbb6a2d64090069a44bcd8cf5cea7730b29 Mon Sep 17 00:00:00 2001 From: Aeledfyr Date: Sat, 1 Jan 2022 15:49:54 -0600 Subject: [PATCH] Replace regex dependency with a hand-written parser This removes the dependencies on both regex and once_cell, and gives significant performance increases in the current benchmarks. --- Cargo.toml | 2 -- src/lib.rs | 40 +++++------------------- src/parser.rs | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 35 deletions(-) create mode 100644 src/parser.rs diff --git a/Cargo.toml b/Cargo.toml index 1aff665..6dd3638 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,6 @@ edition = "2018" [dependencies] base64 = "0.13.0" -once_cell = "1.4" -regex = {version = "1", default-features = false, features = ["std"]} [dev-dependencies] criterion = "0.3.0" diff --git a/src/lib.rs b/src/lib.rs index 694db7f..8443c41 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -104,27 +104,15 @@ )] mod errors; +mod parser; +use parser::{parse_captures, parse_captures_iter, Captures}; pub use crate::errors::{PemError, Result}; -use once_cell::sync::Lazy; -use regex::bytes::{Captures, Regex}; use std::str; -const REGEX_STR: &str = - r"(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*"; - /// The line length for PEM encoding const LINE_WRAP: usize = 64; - -fn ascii_armor() -> &'static Regex { - static ASCII_ARMOR: Lazy<Regex> = Lazy::new(|| { - Regex::new(REGEX_STR).unwrap() - }); - - &ASCII_ARMOR -} - /// Enum describing line endings #[derive(Debug, Clone, Copy)] pub enum LineEnding { @@ -157,21 +145,13 @@ impl Pem { } // Verify that the begin section exists - let tag = as_utf8( - caps.name("begin") - .ok_or_else(|| PemError::MissingBeginTag)? - .as_bytes(), - )?; + let tag = as_utf8(caps.begin)?; if tag.is_empty() { return Err(PemError::MissingBeginTag); } // as well as the end section - let tag_end = as_utf8( - caps.name("end") - .ok_or_else(|| PemError::MissingEndTag)? 
- .as_bytes(), - )?; + let tag_end = as_utf8(caps.end)?; if tag_end.is_empty() { return Err(PemError::MissingEndTag); } @@ -182,11 +162,7 @@ impl Pem { } // If they did, then we can grab the data section - let raw_data = as_utf8( - caps.name("data") - .ok_or_else(|| PemError::MissingData)? - .as_bytes(), - )?; + let raw_data = as_utf8(caps.data)?; // We need to get rid of newlines for base64::decode // As base64 requires an AsRef<[u8]>, this must involve a copy @@ -247,8 +223,7 @@ impl Pem { /// assert_eq!(pem.tag, "RSA PRIVATE KEY"); /// ``` pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> { - ascii_armor() - .captures(&input.as_ref()) + parse_captures(&input.as_ref()) .ok_or_else(|| PemError::MalformedFraming) .and_then(Pem::new_from_captures) } @@ -324,8 +299,7 @@ pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> { /// ``` pub fn parse_many<B: AsRef<[u8]>>(input: B) -> Result<Vec<Pem>> { // Each time our regex matches a PEM section, we need to decode it. - ascii_armor() - .captures_iter(&input.as_ref()) + parse_captures_iter(&input.as_ref()) .map(|caps| Pem::new_from_captures(caps)) .collect() } diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..5fe2a10 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,86 @@ +pub struct Captures<'a> { + pub begin: &'a [u8], + pub data: &'a [u8], + pub end: &'a [u8], +} + +pub fn parse_captures<'a>(input: &'a [u8]) -> Option<Captures<'a>> { + parser_inner(input).map(|(_, cap)| cap) +} +pub fn parse_captures_iter<'a>(input: &'a [u8]) -> CaptureMatches<'a> { + CaptureMatches { input } +} + +pub struct CaptureMatches<'a> { + input: &'a [u8], +} +impl<'a> Iterator for CaptureMatches<'a> { + type Item = Captures<'a>; + fn next(&mut self) -> Option<Self::Item> { + if self.input.is_empty() { + return None; + } + match parser_inner(self.input) { + Some((remaining, captures)) => { + self.input = remaining; + Some(captures) + } + None => { + self.input = &[]; + None + } + } + } +} + +fn parser_inner<'a>(input: &'a [u8]) -> Option<(&'a [u8], Captures<'a>)> { + // Should be equivalent to the regex 
+ // "(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*" + + // (?s) # Enable dotall (. matches all characters incl \n) + // -----BEGIN (?P<begin>.*?)-----[ \t\n\r]* # Parse begin + // (?P<data>.*?) # Parse data + // -----END (?P<end>.*?)-----[ \t\n\r]* # Parse end + + let (input, _) = read_until(input, b"-----BEGIN ")?; + let (input, begin) = read_until(input, b"-----")?; + let input = skip_whitespace(input); + let (input, data) = read_until(input, b"-----END ")?; + let (remaining, end) = read_until(input, b"-----")?; + let remaining = skip_whitespace(remaining); + + let captures = Captures { begin, data, end }; + Some((remaining, captures)) +} + +// Equivalent to the regex [ \t\n\r]* +fn skip_whitespace(mut input: &[u8]) -> &[u8] { + while let Some(b' ' | b'\t' | b'\n' | b'\r') = input.first() { + input = &input[1..]; + } + input +} +// Approximately (.*?) followed by a string: after a failed partial match the scan restarts without re-testing the current byte, so a marker overlapping that partial match (e.g. "------BEGIN " for "-----BEGIN ") is missed +// Returns the remaining input (after the secondary matched string) and the matched data +fn read_until<'a, 'b>(input: &'a [u8], marker: &'b [u8]) -> Option<(&'a [u8], &'a [u8])> { + // If there is no end condition, short circuit + if marker.is_empty() { + return Some((&[], input)); + } + let mut index = 0; + let mut found = 0; + while input.len() - index >= marker.len() - found { + if input[index] == marker[found] { + found += 1; + } else { + found = 0; + } + index += 1; + if found == marker.len() { + let remaining = &input[index..]; + let matched = &input[..index - found]; + return Some((remaining, matched)); + } + } + None +}