Skip to content

Commit

Permalink
Add partial lexer documentation (#217)
Browse files Browse the repository at this point in the history
* Add comments to longest-match lexer

* Add documentation to alphabet implementations

* Add partial documentation to lexing module main
  • Loading branch information
srhickma authored Dec 17, 2019
1 parent e0f37dd commit b5b062d
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 13 deletions.
8 changes: 4 additions & 4 deletions src/core/fmt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ impl<Symbol: GrammarSymbol> FormatterBuilder<Symbol> {
///
/// # Types
///
/// * `PatternBuildErr` - Indicates that an error that occurred while building a pattern.
/// * `DuplicateInjectionErr` - Indicates that a particular symbol was specified for injection
/// * `PatternBuildErr` - indicates that an error occurred while building a pattern.
/// * `DuplicateInjectionErr` - indicates that a particular symbol was specified for injection
/// more than once.
#[derive(Debug)]
pub enum BuildError {
Expand Down Expand Up @@ -542,9 +542,9 @@ pub struct PatternPair<Symbol: GrammarSymbol> {
///
/// # Types
///
/// * `Left` - Indicates that a symbol prefers to be injected immediately after the previous
/// * `Left` - indicates that a symbol prefers to be injected immediately after the previous
/// non-terminal symbol.
/// * `Right` - Indicates that a symbol prefers to be injected immediately before the next
/// * `Right` - indicates that a symbol prefers to be injected immediately before the next
/// non-terminal symbol.
#[derive(Clone, PartialEq)]
pub enum InjectionAffinity {
Expand Down
12 changes: 6 additions & 6 deletions src/core/fmt/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,11 @@ pub struct Pattern {
///
/// # Types
///
/// * `Filler` - Stores a literal string of "filler" text, which will be inserted as-is during
/// * `Filler` - stores a literal string of "filler" text, which will be inserted as-is during
/// formatting.
/// * `Substitution` - Stores the variable name for a run-time value substitution into the pattern
/// * `Substitution` - stores the variable name for a run-time value substitution into the pattern
/// during formatting.
/// * `Capture` - Stores a `Capture`, indicating that a child (in the parse tree) should be
/// * `Capture` - stores a `Capture`, indicating that a child (in the parse tree) should be
/// formatted and inserted at this position during formatting.
#[derive(Clone)]
pub enum Segment {
Expand Down Expand Up @@ -498,9 +498,9 @@ fn parse_pattern(input: &str) -> Result<Tree<PatternSymbol>, BuildError> {
///
/// # Types
///
/// * `LexErr` - Indicates that an error occurred while lexing a pattern.
/// * `ParseErr` - Indicates that an error occurred while parsing a pattern.
/// * `CaptureErr` - Indicates that an invalid pattern capture is present (e.g. out-of-bounds).
/// * `LexErr` - indicates that an error occurred while lexing a pattern.
/// * `ParseErr` - indicates that an error occurred while parsing a pattern.
/// * `CaptureErr` - indicates that an invalid pattern capture is present (e.g. out-of-bounds).
#[derive(Debug)]
pub enum BuildError {
LexErr(lex::Error),
Expand Down
5 changes: 5 additions & 0 deletions src/core/lex/alphabet.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
use std::collections::HashSet;

/// Alphabet: Trait representing a lexing alphabet.
pub trait Alphabet {
    /// Returns `true` if the alphabet contains character `c`, `false` otherwise.
    fn contains(&self, c: char) -> bool;
}

/// Hashed Alphabet: `Alphabet` implementation using a hash-set.
pub struct HashedAlphabet {
    // The set of characters currently in the alphabet.
    alphabet: HashSet<char>,
}

impl HashedAlphabet {
/// Returns a new empty alphabet.
pub fn new() -> HashedAlphabet {
HashedAlphabet {
alphabet: HashSet::new(),
}
}

/// Inserts character `c` into the alphabet.
pub fn insert(&mut self, c: char) {
self.alphabet.insert(c);
}
Expand Down
45 changes: 42 additions & 3 deletions src/core/lex/longest_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use core::{
parse::grammar::GrammarSymbol,
};

/// Longest Match Lexer: Lexer which greedily consumes input, producing the longest possible tokens.
pub struct LongestMatchLexer;

impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLexer {
Expand All @@ -12,6 +13,15 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
input: &[char],
cdfa: &'cdfa dyn CDFA<State, Symbol>,
) -> Result<Vec<Token<Symbol>>, lex::Error> {
/// Scan-One Result: The result of scanning a single token.
///
/// # Fields
///
/// * `consumed` - the number of input characters consumed by the lex.
/// * `end_state` - the accepted CDFA state after completing the lex.
/// * `next_start` - the CDFA state to start the next lex from.
/// * `line` - the current line number after the lex.
/// * `character` - the current character number after the lex.
struct ScanOneResult<State> {
consumed: usize,
end_state: Option<State>,
Expand All @@ -20,6 +30,26 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
character: usize,
}

/// Scan a single token from the head of `input`. Lexing is performed by iteratively reading
/// input and traversing the passed CDFA. Once the input is exhausted or no transition
/// exists, the input up to the most recently accepting state in the CDFA is consumed.
///
/// Returns an error if the scan fails, or a `ScanOneResult` containing the details of the
/// scanned token.
///
/// # Type Parameters
///
/// * `State` - the type of CDFA state being used.
/// * `Symbol` - the type of grammar symbol being tokenized into.
///
/// # Parameters
///
/// * `input` - a slice of the input array being scanned, where the start of the slice is
/// the current lex cursor.
/// * `start` - the starting CDFA state in which to begin the lex.
/// * `line` - the current line number.
/// * `character` - the current character number (on the current line).
/// * `cdfa` - the CDFA to use when lexing the input.
fn scan_one<State: Data, Symbol: GrammarSymbol>(
input: &[char],
start: State,
Expand All @@ -34,6 +64,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe

let next_start = cdfa.default_acceptor_destination(&state);

// If the start state is already accepting, remember it.
let end_state = if let Some(ref accd) = next_start {
if cdfa.accepts(&state) && state != *accd {
Some(state.clone())
Expand All @@ -44,17 +75,18 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
None
};

let mut consumed: usize = 0;

let mut last_accepting = ScanOneResult {
consumed: 0,
consumed,
end_state,
next_start,
line,
character,
};

let mut consumed: usize = 0;

loop {
// Take a transition on the remaining input.
let res = cdfa.transition(&state, remaining);

match res {
Expand All @@ -63,17 +95,21 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
consumed += dest.consumed;

for c in remaining.iter().take(dest.consumed) {
// Update calculation of current character and line.
character += 1;
if *c == '\n' {
line += 1;
character = 1;
}

// Error out if we see an unexpected character.
if !cdfa.alphabet_contains(*c) {
return Err(lex::Error::AlphabetErr(*c));
}
}

// If the current state is accepting, remember it.
// This avoids backtracking when we reach the end of the lex.
if cdfa.accepts(&dest.state) {
last_accepting = ScanOneResult {
consumed,
Expand Down Expand Up @@ -103,6 +139,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
let mut character: usize = 1;

loop {
// Scan a single token.
let res: ScanOneResult<State> =
scan_one(remaining, next_start.clone(), line, character, cdfa)?;

Expand All @@ -116,6 +153,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe

match res.end_state {
None => {
// If more input remains after a failed token scan, return a lexing error.
if !remaining.is_empty() {
let sequence: String = (0..FAIL_SEQUENCE_LENGTH)
.map(|i| remaining.get(i))
Expand All @@ -133,6 +171,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
break;
}
Some(state) => {
// Scanning succeeded, tokenize the consumed input and continue.
if let Some(kind) = cdfa.tokenize(&state) {
tokens.push(Token::leaf(
kind,
Expand Down
Loading

0 comments on commit b5b062d

Please sign in to comment.