Skip to content

Commit

Permalink
Replace regex dependency with a hand-written parser
Browse files Browse the repository at this point in the history
This removes the dependencies on both regex and once_cell, and
gives significant performance increases in the current benchmarks.
  • Loading branch information
Aeledfyr committed Jan 1, 2022
1 parent e974337 commit 9d900cb
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 35 deletions.
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ edition = "2018"

[dependencies]
base64 = "0.13.0"
once_cell = "1.4"
regex = {version = "1", default-features = false, features = ["std"]}

[dev-dependencies]
criterion = "0.3.0"
Expand Down
40 changes: 7 additions & 33 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,27 +104,15 @@
)]

mod errors;
mod parser;
use parser::{parse_captures, parse_captures_iter, Captures};

pub use crate::errors::{PemError, Result};
use once_cell::sync::Lazy;
use regex::bytes::{Captures, Regex};
use std::str;

const REGEX_STR: &str =
r"(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*";

/// The line length for PEM encoding
const LINE_WRAP: usize = 64;


fn ascii_armor() -> &'static Regex {
static ASCII_ARMOR: Lazy<Regex> = Lazy::new(|| {
Regex::new(REGEX_STR).unwrap()
});

&ASCII_ARMOR
}

/// Enum describing line endings
#[derive(Debug, Clone, Copy)]
pub enum LineEnding {
Expand Down Expand Up @@ -157,21 +145,13 @@ impl Pem {
}

// Verify that the begin section exists
let tag = as_utf8(
caps.name("begin")
.ok_or_else(|| PemError::MissingBeginTag)?
.as_bytes(),
)?;
let tag = as_utf8(caps.begin)?;
if tag.is_empty() {
return Err(PemError::MissingBeginTag);
}

// as well as the end section
let tag_end = as_utf8(
caps.name("end")
.ok_or_else(|| PemError::MissingEndTag)?
.as_bytes(),
)?;
let tag_end = as_utf8(caps.end)?;
if tag_end.is_empty() {
return Err(PemError::MissingEndTag);
}
Expand All @@ -182,11 +162,7 @@ impl Pem {
}

// If they did, then we can grab the data section
let raw_data = as_utf8(
caps.name("data")
.ok_or_else(|| PemError::MissingData)?
.as_bytes(),
)?;
let raw_data = as_utf8(caps.data)?;

// We need to get rid of newlines for base64::decode
// As base64 requires an AsRef<[u8]>, this must involve a copy
Expand Down Expand Up @@ -247,8 +223,7 @@ impl Pem {
/// assert_eq!(pem.tag, "RSA PRIVATE KEY");
/// ```
pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> {
ascii_armor()
.captures(&input.as_ref())
parse_captures(&input.as_ref())
.ok_or_else(|| PemError::MalformedFraming)
.and_then(Pem::new_from_captures)
}
Expand Down Expand Up @@ -324,8 +299,7 @@ pub fn parse<B: AsRef<[u8]>>(input: B) -> Result<Pem> {
/// ```
pub fn parse_many<B: AsRef<[u8]>>(input: B) -> Result<Vec<Pem>> {
// Each time the parser matches a PEM section, we need to decode it.
ascii_armor()
.captures_iter(&input.as_ref())
parse_captures_iter(&input.as_ref())
.map(|caps| Pem::new_from_captures(caps))
.collect()
}
Expand Down
86 changes: 86 additions & 0 deletions src/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/// The three pieces of one PEM block, borrowed from the buffer that was parsed.
///
/// The type only holds shared slices, so it is cheap to copy and can be
/// compared or debug-printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Captures<'a> {
    /// Bytes found between `-----BEGIN ` and the next `-----`.
    pub begin: &'a [u8],
    /// Bytes found between the begin line (after skipping whitespace) and
    /// `-----END `.
    pub data: &'a [u8],
    /// Bytes found between `-----END ` and the next `-----`.
    pub end: &'a [u8],
}

/// Parses the first PEM block found in `input`.
///
/// Returns `None` when `parser_inner` cannot find a complete block; whatever
/// follows the block is discarded.
pub fn parse_captures<'a>(input: &'a [u8]) -> Option<Captures<'a>> {
    let (_rest, captures) = parser_inner(input)?;
    Some(captures)
}
/// Creates an iterator that parses PEM blocks from `input` one after another.
///
/// Nothing is parsed here; the returned `CaptureMatches` does the work each
/// time `next` is called on it.
pub fn parse_captures_iter<'a>(input: &'a [u8]) -> CaptureMatches<'a> {
    CaptureMatches { input }
}

/// Iterator state for walking through successive PEM blocks in a byte buffer.
///
/// `Debug` and `Clone` are derived, as is conventional for iterator types, so
/// the state can be inspected or forked.
#[derive(Debug, Clone)]
pub struct CaptureMatches<'a> {
    /// The part of the buffer that has not been consumed yet.
    input: &'a [u8],
}
impl<'a> Iterator for CaptureMatches<'a> {
    type Item = Captures<'a>;

    /// Yields the next PEM block, or `None` once the buffer is exhausted or no
    /// further block can be parsed.
    fn next(&mut self) -> Option<Self::Item> {
        // Taking the slice leaves `self.input` empty, which is exactly the
        // state wanted on every path that returns `None` below.
        let pending = std::mem::take(&mut self.input);
        if pending.is_empty() {
            return None;
        }
        let (rest, captures) = parser_inner(pending)?;
        // Only a successful parse restores the unconsumed tail.
        self.input = rest;
        Some(captures)
    }
}

/// Parses one PEM block from the front of `input`.
///
/// Intended to mirror the regex
/// `(?s)-----BEGIN (?P<begin>.*?)-----[ \t\n\r]*(?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*`
/// (`(?s)` makes `.` match every byte, including `\n`).
///
/// Returns the unparsed tail of `input` together with the captured pieces, or
/// `None` as soon as any of the four markers cannot be found.
fn parser_inner<'a>(input: &'a [u8]) -> Option<(&'a [u8], Captures<'a>)> {
    // -----BEGIN (?P<begin>.*?)-----[ \t\n\r]*
    // Anything in front of the begin marker is ignored.
    let (after_begin_marker, _preamble) = read_until(input, b"-----BEGIN ")?;
    let (after_begin_line, begin) = read_until(after_begin_marker, b"-----")?;
    let body = skip_whitespace(after_begin_line);

    // (?P<data>.*?)-----END (?P<end>.*?)-----[ \t\n\r]*
    let (after_end_marker, data) = read_until(body, b"-----END ")?;
    let (after_end_line, end) = read_until(after_end_marker, b"-----")?;
    let remaining = skip_whitespace(after_end_line);

    Some((remaining, Captures { begin, data, end }))
}

/// Drops leading spaces, tabs, line feeds and carriage returns from `input`.
///
/// This is the hand-written counterpart of the regex `[ \t\n\r]*`; no other
/// bytes are treated as whitespace.
fn skip_whitespace(input: &[u8]) -> &[u8] {
    let first_kept = input
        .iter()
        .position(|&byte| !matches!(byte, b' ' | b'\t' | b'\n' | b'\r'))
        // Every byte was whitespace (or the slice was empty): keep nothing.
        .unwrap_or(input.len());
    &input[first_kept..]
}
/// Finds the first occurrence of `marker` in `input`; the counterpart of the
/// regex `(.*?)` followed by a literal string.
///
/// Returns `Some((remaining, matched))`, where `matched` is everything in front
/// of the marker and `remaining` is everything after it, or `None` when
/// `marker` does not occur in `input`.
///
/// An empty `marker` short-circuits: all of `input` is reported as matched and
/// nothing remains.
fn read_until<'a, 'b>(input: &'a [u8], marker: &'b [u8]) -> Option<(&'a [u8], &'a [u8])> {
    // If there is no end condition, short circuit
    if marker.is_empty() {
        return Some((&[], input));
    }
    // Test the marker at every offset. A single forward scan that merely resets
    // its progress on a mismatch skips occurrences that overlap a failed
    // partial match (e.g. "------BEGIN " contains "-----BEGIN " at offset 1).
    let start = input
        .windows(marker.len())
        .position(|window| window == marker)?;
    let matched = &input[..start];
    let remaining = &input[start + marker.len()..];
    Some((remaining, matched))
}

0 comments on commit 9d900cb

Please sign in to comment.