diff --git a/src/core/fmt/mod.rs b/src/core/fmt/mod.rs index e25ebff..e5034bb 100644 --- a/src/core/fmt/mod.rs +++ b/src/core/fmt/mod.rs @@ -141,8 +141,8 @@ impl FormatterBuilder { /// /// # Types /// -/// * `PatternBuildErr` - Indicates that an error that occurred while building a pattern. -/// * `DuplicateInjectionErr` - Indicates that a particular symbol was specified for injection +/// * `PatternBuildErr` - indicates that an error that occurred while building a pattern. +/// * `DuplicateInjectionErr` - indicates that a particular symbol was specified for injection /// more than once. #[derive(Debug)] pub enum BuildError { @@ -542,9 +542,9 @@ pub struct PatternPair { /// /// # Types /// -/// * `Left` - Indicates that a symbol prefers to be injected immediately after the previous +/// * `Left` - indicates that a symbol prefers to be injected immediately after the previous /// non-terminal symbol. -/// * `Right` - Indicates that a symbol prefers to be injected immediately before the next +/// * `Right` - indicates that a symbol prefers to be injected immediately before the next /// non-terminal symbol. #[derive(Clone, PartialEq)] pub enum InjectionAffinity { diff --git a/src/core/fmt/pattern.rs b/src/core/fmt/pattern.rs index 5322076..5ce8d97 100644 --- a/src/core/fmt/pattern.rs +++ b/src/core/fmt/pattern.rs @@ -249,11 +249,11 @@ pub struct Pattern { /// /// # Types /// -/// * `Filler` - Stores a literal string of "filler" text, which will be inserted as-is during +/// * `Filler` - stores a literal string of "filler" text, which will be inserted as-is during /// formatting. -/// * `Substitution` - Stores the variable name for a run-time value substitution into the pattern +/// * `Substitution` - stores the variable name for a run-time value substitution into the pattern /// during formatting. 
-/// * `Capture` - Stores a `Capture`, indicating that a child (in the parse tree) should be +/// * `Capture` - stores a `Capture`, indicating that a child (in the parse tree) should be /// formatted and inserted at this position during formatting. #[derive(Clone)] pub enum Segment { @@ -498,9 +498,9 @@ fn parse_pattern(input: &str) -> Result, BuildError> { /// /// # Types /// -/// * `LexErr` - Indicates that an error occurred while lexing a pattern. -/// * `ParseErr` - Indicates that an error occurred while parsing a pattern. -/// * `CaptureErr` - Indicates that an invalid pattern capture is present (e.g. out-of-bounds). +/// * `LexErr` - indicates that an error occurred while lexing a pattern. +/// * `ParseErr` - indicates that an error occurred while parsing a pattern. +/// * `CaptureErr` - indicates that an invalid pattern capture is present (e.g. out-of-bounds). #[derive(Debug)] pub enum BuildError { LexErr(lex::Error), diff --git a/src/core/lex/alphabet.rs b/src/core/lex/alphabet.rs index 26c2647..c4651ee 100644 --- a/src/core/lex/alphabet.rs +++ b/src/core/lex/alphabet.rs @@ -1,20 +1,25 @@ use std::collections::HashSet; +// Alphabet: Trait representing a lexing alphabet. pub trait Alphabet { + // Returns true if the alphabet contains `char`, false otherwise. fn contains(&self, c: char) -> bool; } +// Hashed Alphabet: Alphabet implementation using a hash-set. pub struct HashedAlphabet { alphabet: HashSet, } impl HashedAlphabet { + // Returns a new empty alphabet. pub fn new() -> HashedAlphabet { HashedAlphabet { alphabet: HashSet::new(), } } + // Inserts character `char` into the alphabet. 
pub fn insert(&mut self, c: char) { self.alphabet.insert(c); } diff --git a/src/core/lex/longest_match.rs b/src/core/lex/longest_match.rs index 8197a81..2c023f2 100644 --- a/src/core/lex/longest_match.rs +++ b/src/core/lex/longest_match.rs @@ -4,6 +4,7 @@ use core::{ parse::grammar::GrammarSymbol, }; +/// Longest Match Lexer: Lexer which greedily consumes input, producing the longest possible tokens. pub struct LongestMatchLexer; impl Lexer for LongestMatchLexer { @@ -12,6 +13,15 @@ impl Lexer for LongestMatchLe input: &[char], cdfa: &'cdfa dyn CDFA, ) -> Result>, lex::Error> { + /// Scan-One Result: The result of scanning a single token. + /// + /// # Fields + /// + /// * `consumed` - the number of input characters consumed by the lex. + /// * `end_state` - the accepted CDFA state after completing the lex. + /// * `next_start` - the CDFA state to start the next lex from. + /// * `line` - the current line number after the lex. + /// * `character` - the current character number after the lex. struct ScanOneResult { consumed: usize, end_state: Option, @@ -20,6 +30,26 @@ impl Lexer for LongestMatchLe character: usize, } + /// Scan a single token from the head of `input`. Lexing is performed by iteratively reading + /// input and traversing the passed CDFA. Once the input is exhausted or no transition + /// exists, the input up to the most recently accepting state in the CDFA is consumed. + /// + /// Returns an error if the scan fails, or a `ScanOneResult` containing the details of the + /// scanned token. + /// + /// # Type Parameters + /// + /// * `State` - the type of CDFA state being used. + /// * `Symbol` - the type of grammar symbol being tokenized into. + /// + /// # Parameters + /// + /// * `input` - a slice of the input array being scanned, where the start of the slice is + /// the current lex cursor. + /// * `start` - the starting CDFA state in which to begin the lex. + /// * `line` - the current line number. 
+ /// * `character` - the current character number (on the current line). + /// * `cdfa` - the CDFA to use when lexing the input. fn scan_one( input: &[char], start: State, @@ -34,6 +64,7 @@ impl Lexer for LongestMatchLe let next_start = cdfa.default_acceptor_destination(&state); + // If the start state is already accepting, remember it. let end_state = if let Some(ref accd) = next_start { if cdfa.accepts(&state) && state != *accd { Some(state.clone()) @@ -44,17 +75,18 @@ impl Lexer for LongestMatchLe None }; + let mut consumed: usize = 0; + let mut last_accepting = ScanOneResult { - consumed: 0, + consumed, end_state, next_start, line, character, }; - let mut consumed: usize = 0; - loop { + // Take a transition on the remaining input. let res = cdfa.transition(&state, remaining); match res { @@ -63,17 +95,21 @@ impl Lexer for LongestMatchLe consumed += dest.consumed; for c in remaining.iter().take(dest.consumed) { + // Update calculation of current character and line. character += 1; if *c == '\n' { line += 1; character = 1; } + // Error out if we see an unexpected character. if !cdfa.alphabet_contains(*c) { return Err(lex::Error::AlphabetErr(*c)); } } + // If the current state is accepting, remember it. + // This avoids backtracking when we reach the end of the lex. if cdfa.accepts(&dest.state) { last_accepting = ScanOneResult { consumed, @@ -103,6 +139,7 @@ impl Lexer for LongestMatchLe let mut character: usize = 1; loop { + // Scan a single token. let res: ScanOneResult = scan_one(remaining, next_start.clone(), line, character, cdfa)?; @@ -116,6 +153,7 @@ impl Lexer for LongestMatchLe match res.end_state { None => { + // If more input remains after a failed token scan, return a lexing error. if !remaining.is_empty() { let sequence: String = (0..FAIL_SEQUENCE_LENGTH) .map(|i| remaining.get(i)) @@ -133,6 +171,7 @@ impl Lexer for LongestMatchLe break; } Some(state) => { + // Scanning succeeded, tokenize the consumed input and continue. 
if let Some(kind) = cdfa.tokenize(&state) { tokens.push(Token::leaf( kind, diff --git a/src/core/lex/mod.rs b/src/core/lex/mod.rs index 85e9c31..682d7bd 100644 --- a/src/core/lex/mod.rs +++ b/src/core/lex/mod.rs @@ -10,9 +10,19 @@ pub mod alphabet; pub mod ecdfa; pub mod longest_match; +/// The character sequence length to generate when lexing fails. static FAIL_SEQUENCE_LENGTH: usize = 10; +/// Lexer: Trait which represents a generic lexer. +/// +/// # Type Parameters: +/// +/// * `State` - the state-type of the CDFA used to specify lexing behaviour. +/// * `Symbol` - the type of tokens produced by the lexer. pub trait Lexer: 'static + Send + Sync { + /// Lexes `input` using `cdfa` to specify the language. + /// + /// Returns a vector of scanned tokens if the lex is successful, otherwise an error is returned. fn lex( &self, input: &[char], @@ -20,38 +30,89 @@ pub trait Lexer: 'static + Send + Sync { ) -> Result>, Error>; } +/// Returns the current default lexer. +/// This lexer should be used for all non-testing purposes. pub fn def_lexer() -> Box> { Box::new(longest_match::LongestMatchLexer) } +/// Compressed Deterministic Finite Automata (CDFA): Trait representing the operations a CDFA +/// implementation must provide to support use by a `Lexer`. +/// +/// # Type Parameters +/// +/// * `State` - the type used to represent states in the CDFA graph. +/// * `Symbol` - the type of tokens produced by the CDFA. pub trait CDFA: Send + Sync { + /// Attempts to perform a transition from `state` on `input`, and returns the result. fn transition(&self, state: &State, input: &[char]) -> TransitionResult; + + /// Returns true if the alphabet of the CDFA contains `char`. fn alphabet_contains(&self, c: char) -> bool; + + /// Returns true if `state` is accepting. fn accepts(&self, state: &State) -> bool; + + /// Returns the default acceptor destination of `state`, or `None` if `state` has no acceptor + /// destination. 
fn default_acceptor_destination(&self, state: &State) -> Option; + + /// Returns the token associated with `state`, if one exists, otherwise `None` is returned. fn tokenize(&self, state: &State) -> Option; + + /// Returns the starting state of the CDFA, where lexing should begin. fn start(&self) -> State; } +/// CDFA Builder: Trait representing a builder for a CDFA. +/// +/// # Type Parameters +/// +/// * `State` - the type used to represent states in the CDFA graph. +/// * `Symbol` - the type of tokens produced by the CDFA. +/// * `CDFAType` - the type of CDFA this builder should produce. pub trait CDFABuilder { + /// Returns a new builder. fn new() -> Self; + + /// Consumes the builder and returns either a CDFA or an error, if a build failure occurred. fn build(self) -> Result; + + /// Sets `chars` as the alphabet of the CDFA. fn set_alphabet(&mut self, chars: impl Iterator) -> &mut Self; + + /// Marks `state` as an accepting state. fn accept(&mut self, state: &State) -> &mut Self; + + /// Marks `state` as an accepting state, with acceptor destination `to`. fn accept_to(&mut self, state: &State, to: &State) -> &mut Self; + + /// Sets `state` as the default start state of the CDFA. fn mark_start(&mut self, state: &State) -> &mut Self; + + /// Adds simple transition `transit` on `on` from `from`. + /// + /// Returns an error if the transition could not be added. fn mark_trans( &mut self, from: &State, transit: Transit, on: char, ) -> Result<&mut Self, CDFAError>; + + /// Adds chain transition `transit` on `on` from `from`. + /// + /// Returns an error if the transition could not be added. fn mark_chain( &mut self, from: &State, transit: Transit, on: impl Iterator, ) -> Result<&mut Self, CDFAError>; + + /// Adds character range transition `transit` on range [`start`, `end`] from `from`. + /// + /// Returns an error if the transition could not be added. 
fn mark_range( &mut self, from: &State, @@ -59,6 +120,11 @@ pub trait CDFABuilder { start: char, end: char, ) -> Result<&mut Self, CDFAError>; + + /// Adds character range transition `transit` on range [`start`, `end`] from all states in + /// `sources`. + /// + /// Returns an error if the transition could not be added. fn mark_range_for_all<'state_o: 'state_i, 'state_i>( &mut self, sources: impl Iterator, @@ -68,11 +134,29 @@ pub trait CDFABuilder { ) -> Result<&mut Self, CDFAError> where State: 'state_o; + + /// Adds default transition `transit` from `from`. + /// + /// Returns an error if the transition could not be added. fn default_to(&mut self, from: &State, transit: Transit) -> Result<&mut Self, CDFAError>; + + /// Mark that `state` should be tokenized to `token`. fn tokenize(&mut self, state: &State, token: &Symbol) -> &mut Self; } +/// Transit: Represents the action-phase of a transition. +/// +/// # Type Parameters +/// +/// * `State` - the state type of the associated CDFA. +/// +/// # Fields +/// +/// * `dest` - the destination state of the transition. +/// * `consumer` - the input consumption strategy to follow when taking the transition. +/// * `acceptor_destination` - the acceptor destination associated with this _transition_, not to +/// be confused with the possibly different acceptor destination of the destination state. #[derive(Clone)] pub struct Transit { dest: State, @@ -81,6 +165,9 @@ pub struct Transit { } impl Transit { + /// Creates a new transit to `dest` which consumes all input and has no acceptor destination. + /// This is the default behaviour, and is utilized by most transitions, hence it is included + /// here as a shorthand for using the builder. pub fn to(dest: State) -> Self { Transit { dest, @@ -90,6 +177,8 @@ impl Transit { } } +/// Transit Builder: Simple builder for `Transit` structs. +/// Fields and type parameters correspond exactly with those of the target type. 
#[derive(Clone)] pub struct TransitBuilder { dest: State, @@ -98,6 +187,8 @@ impl TransitBuilder { + /// Creates a new transit builder, with destination `dest`, consuming all input, and no + /// acceptor destination. pub fn to(dest: State) -> Self { TransitBuilder { dest, @@ -106,16 +197,19 @@ } } + /// Sets the consumer strategy of the transit to `consumer`. pub fn consumer(&mut self, consumer: ConsumerStrategy) -> &mut Self { self.consumer = consumer; self } + /// Sets the acceptor destination of the transit to `acceptor_destination`. pub fn accept_to(&mut self, acceptor_destination: State) -> &mut Self { self.acceptor_destination = Some(acceptor_destination); self } + /// Copies the builder configuration into a new `Transit` struct, without consuming the builder. pub fn build(&self) -> Transit { Transit { dest: self.dest.clone(), @@ -125,22 +219,42 @@ } } +/// Consumer Strategy: Represents a strategy of input consumption to be taken by a CDFA transition. +/// +/// # Types +/// +/// * `All` - when a transition is taken, consume all input matched by the transition. +/// * `None` - when a transition is taken, do not consume any input. #[derive(Clone)] pub enum ConsumerStrategy { All, None, } +/// Transition Result: Represents the result of a transition attempt. +/// +/// # Types +/// +/// * `Fail` - indicates that the transition was unsuccessful. +/// * `Ok` - indicates that the transition was a success, and stores the destination of the +/// transition. +/// +/// # Type Parameters +/// +/// * `State` - the state type of the associated CDFA. pub enum TransitionResult { Fail, Ok(TransitionDestination), } impl TransitionResult { + /// Returns a new successful transition result through `transit` traversing one input character. 
pub fn direct(transit: &Transit) -> Self { TransitionResult::ok(transit, 1) } + /// Returns a new successful transition result through `transit` traversing `traversed` input + /// characters. pub fn ok(transit: &Transit, traversed: usize) -> Self { let consumed = match transit.consumer { ConsumerStrategy::All => traversed, @@ -155,12 +269,29 @@ } } +/// Transition Destination: Represents the destination of a successful state transition. +/// +/// # Type Parameters +/// +/// * `State` - the state type of the associated CDFA. +/// +/// # Fields +/// +/// * `state` - the destination state. +/// * `consumed` - the amount of input consumed by the transition. +/// * `acceptor_destination` - the optional acceptor destination of the transition, not to be +/// confused with the possibly different acceptor destination of the destination state. pub struct TransitionDestination { state: State, consumed: usize, acceptor_destination: Option, } +/// CDFA Error: Represents an error encountered while using or constructing a CDFA. +/// +/// # Types +/// +/// * `BuildErr` - indicates that an error occurred while building a CDFA. #[derive(Debug)] pub enum CDFAError { BuildErr(String), @@ -194,12 +325,23 @@ } } +/// Token: A successfully lexed token of input. +/// +/// # Type Parameters +/// +/// * `Symbol` - the symbol-type of the token, as referenced by the language grammar. +/// +/// # Fields +/// +/// * `kind` - the kind of token, or `None` if the token represents an epsilon value (null). +/// * `lexeme` - the scanned characters which produced this token. #[derive(PartialEq, Eq, Hash, Clone, Debug)] pub struct Token { kind: Option, lexeme: String, } +// TODO(shane) try to achieve better separation between parsing and lexing logic here. impl Token { pub fn leaf(kind: Symbol, lexeme: String) -> Self { Token {