Skip to content

Commit

Permalink
Add partial lexer documentation (#217)
Browse files Browse the repository at this point in the history
* Add comments to longest-match lexer

* Add documentation to alphabet implementations

* Add partial documentation to lexing module main
  • Loading branch information
srhickma authored Dec 17, 2019
1 parent e0f37dd commit b5b062d
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 13 deletions.
8 changes: 4 additions & 4 deletions src/core/fmt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ impl<Symbol: GrammarSymbol> FormatterBuilder<Symbol> {
///
/// # Types
///
/// * `PatternBuildErr` - Indicates that an error that occurred while building a pattern.
/// * `DuplicateInjectionErr` - Indicates that a particular symbol was specified for injection
/// * `PatternBuildErr` - indicates that an error occurred while building a pattern.
/// * `DuplicateInjectionErr` - indicates that a particular symbol was specified for injection
/// more than once.
#[derive(Debug)]
pub enum BuildError {
Expand Down Expand Up @@ -542,9 +542,9 @@ pub struct PatternPair<Symbol: GrammarSymbol> {
///
/// # Types
///
/// * `Left` - Indicates that a symbol prefers to be injected immediately after the previous
/// * `Left` - indicates that a symbol prefers to be injected immediately after the previous
/// non-terminal symbol.
/// * `Right` - Indicates that a symbol prefers to be injected immediately before the next
/// * `Right` - indicates that a symbol prefers to be injected immediately before the next
/// non-terminal symbol.
#[derive(Clone, PartialEq)]
pub enum InjectionAffinity {
Expand Down
12 changes: 6 additions & 6 deletions src/core/fmt/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,11 @@ pub struct Pattern {
///
/// # Types
///
/// * `Filler` - Stores a literal string of "filler" text, which will be inserted as-is during
/// * `Filler` - stores a literal string of "filler" text, which will be inserted as-is during
/// formatting.
/// * `Substitution` - Stores the variable name for a run-time value substitution into the pattern
/// * `Substitution` - stores the variable name for a run-time value substitution into the pattern
/// during formatting.
/// * `Capture` - Stores a `Capture`, indicating that a child (in the parse tree) should be
/// * `Capture` - stores a `Capture`, indicating that a child (in the parse tree) should be
/// formatted and inserted at this position during formatting.
#[derive(Clone)]
pub enum Segment {
Expand Down Expand Up @@ -498,9 +498,9 @@ fn parse_pattern(input: &str) -> Result<Tree<PatternSymbol>, BuildError> {
///
/// # Types
///
/// * `LexErr` - Indicates that an error occurred while lexing a pattern.
/// * `ParseErr` - Indicates that an error occurred while parsing a pattern.
/// * `CaptureErr` - Indicates that an invalid pattern capture is present (e.g. out-of-bounds).
/// * `LexErr` - indicates that an error occurred while lexing a pattern.
/// * `ParseErr` - indicates that an error occurred while parsing a pattern.
/// * `CaptureErr` - indicates that an invalid pattern capture is present (e.g. out-of-bounds).
#[derive(Debug)]
pub enum BuildError {
LexErr(lex::Error),
Expand Down
5 changes: 5 additions & 0 deletions src/core/lex/alphabet.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
use std::collections::HashSet;

/// Alphabet: Trait representing a lexing alphabet.
pub trait Alphabet {
    /// Returns `true` if the alphabet contains character `c`, `false` otherwise.
    fn contains(&self, c: char) -> bool;
}

/// Hashed Alphabet: `Alphabet` implementation using a hash-set.
pub struct HashedAlphabet {
    // The set of characters currently in the alphabet.
    alphabet: HashSet<char>,
}

impl HashedAlphabet {
/// Returns a new empty alphabet.
pub fn new() -> HashedAlphabet {
HashedAlphabet {
alphabet: HashSet::new(),
}
}

/// Inserts character `c` into the alphabet.
pub fn insert(&mut self, c: char) {
self.alphabet.insert(c);
}
Expand Down
45 changes: 42 additions & 3 deletions src/core/lex/longest_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use core::{
parse::grammar::GrammarSymbol,
};

/// Longest Match Lexer: Lexer which greedily consumes input, producing the longest possible tokens.
pub struct LongestMatchLexer;

impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLexer {
Expand All @@ -12,6 +13,15 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
input: &[char],
cdfa: &'cdfa dyn CDFA<State, Symbol>,
) -> Result<Vec<Token<Symbol>>, lex::Error> {
/// Scan-One Result: The result of scanning a single token.
///
/// # Fields
///
/// * `consumed` - the number of input characters consumed by the lex.
/// * `end_state` - the accepted CDFA state after completing the lex.
/// * `next_start` - the CDFA state to start the next lex from.
/// * `line` - the current line number after the lex.
/// * `character` - the current character number after the lex.
struct ScanOneResult<State> {
consumed: usize,
end_state: Option<State>,
Expand All @@ -20,6 +30,26 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
character: usize,
}

/// Scan a single token from the head of `input`. Lexing is performed by iteratively reading
/// input and traversing the passed CDFA. Once the input is exhausted or no transition
/// exists, the input up to the most recently accepting state in the CDFA is consumed.
///
/// Returns an error if the scan fails, or a `ScanOneResult` containing the details of the
/// scanned token.
///
/// # Type Parameters
///
/// * `State` - the type of CDFA state being used.
/// * `Symbol` - the type of grammar symbol being tokenized into.
///
/// # Parameters
///
/// * `input` - a slice of the input array being scanned, where the start of the slice is
/// the current lex cursor.
/// * `start` - the starting CDFA state in which to begin the lex.
/// * `line` - the current line number.
/// * `character` - the current character number (on the current line).
/// * `cdfa` - the CDFA to use when lexing the input.
fn scan_one<State: Data, Symbol: GrammarSymbol>(
input: &[char],
start: State,
Expand All @@ -34,6 +64,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe

let next_start = cdfa.default_acceptor_destination(&state);

// If the start state is already accepting, remember it.
let end_state = if let Some(ref accd) = next_start {
if cdfa.accepts(&state) && state != *accd {
Some(state.clone())
Expand All @@ -44,17 +75,18 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
None
};

let mut consumed: usize = 0;

let mut last_accepting = ScanOneResult {
consumed: 0,
consumed,
end_state,
next_start,
line,
character,
};

let mut consumed: usize = 0;

loop {
// Take a transition on the remaining input.
let res = cdfa.transition(&state, remaining);

match res {
Expand All @@ -63,17 +95,21 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
consumed += dest.consumed;

for c in remaining.iter().take(dest.consumed) {
// Update calculation of current character and line.
character += 1;
if *c == '\n' {
line += 1;
character = 1;
}

// Error out if we see an unexpected character.
if !cdfa.alphabet_contains(*c) {
return Err(lex::Error::AlphabetErr(*c));
}
}

// If the current state is accepting, remember it.
// This avoids backtracking when we reach the end of the lex.
if cdfa.accepts(&dest.state) {
last_accepting = ScanOneResult {
consumed,
Expand Down Expand Up @@ -103,6 +139,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
let mut character: usize = 1;

loop {
// Scan a single token.
let res: ScanOneResult<State> =
scan_one(remaining, next_start.clone(), line, character, cdfa)?;

Expand All @@ -116,6 +153,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe

match res.end_state {
None => {
// If more input remains after a failed token scan, return a lexing error.
if !remaining.is_empty() {
let sequence: String = (0..FAIL_SEQUENCE_LENGTH)
.map(|i| remaining.get(i))
Expand All @@ -133,6 +171,7 @@ impl<State: Data, Symbol: GrammarSymbol> Lexer<State, Symbol> for LongestMatchLe
break;
}
Some(state) => {
// Scanning succeeded, tokenize the consumed input and continue.
if let Some(kind) = cdfa.tokenize(&state) {
tokens.push(Token::leaf(
kind,
Expand Down
Loading

0 comments on commit b5b062d

Please sign in to comment.