Skip to content

Commit

Permalink
Merge pull request #23 from alllex/scannerless-parsing
Browse files Browse the repository at this point in the history
Scannerless parsing
  • Loading branch information
alllex committed Oct 5, 2023
2 parents 725491b + 4f6501d commit fde0d8b
Show file tree
Hide file tree
Showing 17 changed files with 436 additions and 96 deletions.
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ kotlin {
dependencies {
implementation(kotlin("test"))
implementation("com.willowtreeapps.assertk:assertk:0.26.1")
runtimeOnly(kotlin("reflect"))
}
}
}
Expand Down
37 changes: 29 additions & 8 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token

internal class ChoiceParser<out T>(
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }

internal abstract class AbstractChoiceParser<T>(
val parsers: List<Parser<T>>,
) : ParserImpl<T>(
null,
firstTokens = if (parsers.hasUnknownFirstTokens()) emptySet() else parsers.flatMap { it.firstTokens }.toSet()
) {
)

internal class EagerChoiceParser<T>(
parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

private val parsersByFirstToken: Map<Token, List<Parser<T>>> =
mutableMapOf<Token, MutableList<Parser<T>>>()
Expand All @@ -28,6 +35,7 @@ internal class ChoiceParser<out T>(

private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }

@Suppress("DEPRECATION")
override suspend fun ParsingScope.parse(): T {
val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
Expand All @@ -37,13 +45,26 @@ internal class ChoiceParser<out T>(
}
fail(NoViableAlternative(currentOffset))
}
}

companion object {
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }
internal class ScannerlessChoiceParser<T>(
    parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

    /**
     * Tries each alternative in declaration order and commits to the first one
     * that parses successfully; fails with [NoViableAlternative] when none do.
     */
    override suspend fun ParsingScope.parse(): T {
        parsers.forEach { alternative ->
            val attempt = tryParse(alternative)
            if (attempt is ParsedValue) return attempt.value
        }
        fail(NoViableAlternative(currentOffset))
    }
}

@Suppress("FunctionName")
private fun <T> ChoiceParser(parsers: List<Parser<T>>): ScannerlessChoiceParser<T> {
    // Note: EagerChoiceParser is only compatible with EagerTokenizer,
    // so the scannerless variant is constructed here.
    return ScannerlessChoiceParser(parsers)
}

/**
* Creates a combined parser that will try the receiver parser first,
* and fall back to the other parser in case of a parse error.
Expand All @@ -55,8 +76,8 @@ internal class ChoiceParser<out T>(
* ```
*/
infix fun <R> Parser<R>.or(p: Parser<R>): Parser<R> = when {
this is ChoiceParser && p is ChoiceParser -> ChoiceParser(parsers + p.parsers)
this is ChoiceParser -> ChoiceParser(parsers + p)
p is ChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
this is AbstractChoiceParser && p is AbstractChoiceParser -> ChoiceParser(parsers + p.parsers)
this is AbstractChoiceParser -> ChoiceParser(parsers + p)
p is AbstractChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
else -> ChoiceParser(listOf(this, p))
}
13 changes: 8 additions & 5 deletions src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package me.alllex.parsus.parser
import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.EofToken
import me.alllex.parsus.token.Token
import me.alllex.parsus.tokenizer.ScannerlessTokenizer
import me.alllex.parsus.trace.TokenMatchingTrace
import me.alllex.parsus.trace.TracedParseResult
import kotlin.reflect.KProperty
Expand Down Expand Up @@ -159,18 +160,20 @@ abstract class Grammar<out V>(

private fun <T> parseEntire(parser: Parser<T>, input: String): ParseResult<T> {
beforeParsing()
val lexer = Lexer(input, _tokens)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens)
val parsingContext = ParsingContext(tokenizer, debugMode)
return parsingContext.runParser(createUntilEofParser(parser))
}

@ExperimentalParsusApi
private fun <T> parseTracingEntire(parser: Parser<T>, input: String): TracedParseResult<T, TokenMatchingTrace> {
beforeParsing()
val lexer = Lexer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(tokenizer, debugMode)
val result = parsingContext.runParser(createUntilEofParser(parser))
val trace = lexer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
val trace = tokenizer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
return TracedParseResult(result, trace)
}

Expand Down
2 changes: 2 additions & 0 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ abstract class ParseError : ParseResult<Nothing>() {
override fun toString(): String = "ParseError"
}

data class UnmatchedToken(val expected: Token, override val offset: Int) : ParseError()

data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() {
override val offset: Int get() = found.offset
}
Expand Down
16 changes: 10 additions & 6 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.tokenizer.Tokenizer
import kotlin.coroutines.Continuation
import kotlin.coroutines.CoroutineContext
import kotlin.coroutines.EmptyCoroutineContext
import kotlin.coroutines.intrinsics.*
import kotlin.coroutines.intrinsics.COROUTINE_SUSPENDED
import kotlin.coroutines.intrinsics.createCoroutineUnintercepted
import kotlin.coroutines.intrinsics.suspendCoroutineUninterceptedOrReturn

/**
* Executes parsers, keeping track of current position in the input and error-continuations.
*
* For each [run][runParser] a new context must be created.
*/
internal class ParsingContext(
private val lexer: Lexer,
private val tokenizer: Tokenizer,
private val debugMode: Boolean = false
) : ParsingScope {

Expand All @@ -35,12 +38,12 @@ internal class ParsingContext(
return result.getOrThrow() as ParseResult<T>
}

override val TokenMatch.text: String get() = lexer.input.substring(offset, offset + length)
override val TokenMatch.text: String get() = tokenizer.input.substring(offset, offset + length)

override val currentOffset: Int get() = position

override val currentToken: TokenMatch?
get() = lexer.findMatch(position)
get() = tokenizer.findContextFreeMatch(position)

override suspend fun <R> Parser<R>.invoke(): R = parse()

Expand All @@ -55,8 +58,9 @@ internal class ParsingContext(

override fun tryParse(token: Token): ParseResult<TokenMatch> {
val fromIndex = this.position
val match = lexer.findMatch(fromIndex)
?: return NoMatchingToken(fromIndex)
val match = tokenizer.findMatchOf(fromIndex, token)
?: return UnmatchedToken(token, fromIndex)
// TODO: clean up, as this should not happen anymore
if (match.token != token) return MismatchedToken(token, match)
this.position = match.offset + match.length
return ParsedValue(match)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ interface ParsingScope {
/**
* The token at the current offset in the input.
*/
@Deprecated("The new \"scannerless\" parsing approach does not eagerly tokenize the input. The `currentToken` is always null.")
val currentToken: TokenMatch?

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

@OptIn(ExperimentalParsusApi::class)
internal abstract class AbstractTokenizer(
    override val input: String,
    protected val tokens: List<Token>,
    traceTokenMatching: Boolean = false,
) : Tokenizer {

    // Allocated only when tracing was requested; null means tracing is disabled.
    private val traceEvents: MutableList<TokenMatchingEvent>? =
        if (traceTokenMatching) mutableListOf() else null

    override fun getTokenMatchingTrace(): TokenMatchingTrace? =
        traceEvents?.let { TokenMatchingTrace(input, it) }

    /**
     * Attempts to match [token] against [input] starting at [fromIndex].
     *
     * A zero-length result from [Token.match] is treated as a mismatch and
     * yields `null`. Every attempt (hit or miss) is recorded in the trace
     * when tracing is enabled.
     */
    protected fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
        val matchedLength = token.match(input, fromIndex)
        if (matchedLength == 0) {
            traceEvents?.add(TokenMatchingEvent(token, fromIndex, null))
            return null
        }
        return TokenMatch(token, fromIndex, matchedLength).also { m ->
            traceEvents?.add(TokenMatchingEvent(token, m.offset, m))
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
package me.alllex.parsus.parser
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

/**
* Lexer is responsible for [finding][findMatch] token-matches in the given position
* in the input string.
* This tokenizer eagerly tries to match tokens from the input,
* based on the full token set ordered by priority.
* It deterministically matches tokens from the input,
* not taking into account tokens expected by parsers.
*/
@OptIn(ExperimentalParsusApi::class)
internal class Lexer(
val input: String,
private val tokens: List<Token>,
internal class EagerTokenizer(
input: String,
tokens: List<Token>,
traceTokenMatching: Boolean = false,
) {
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

private val tokensByFirstChar: Map<Char, List<Token>>
private var cachedFromIndex: Int = -1
private var cachedTokenMatch: TokenMatch? = null

private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null

init {
tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
val unknownFirstCharTokens = mutableListOf<Token>()
Expand All @@ -43,11 +39,15 @@ internal class Lexer(
}
}

internal fun getTokenMatchingTrace(): TokenMatchingTrace? {
return traceEvents?.let { TokenMatchingTrace(input, it) }
override fun findContextFreeMatch(fromIndex: Int): TokenMatch? {
return findMatchCaching(fromIndex)
}

override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
return findMatchCaching(fromIndex)
}

fun findMatch(fromIndex: Int): TokenMatch? {
private fun findMatchCaching(fromIndex: Int): TokenMatch? {
if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
return cachedTokenMatch
}
Expand Down Expand Up @@ -85,24 +85,4 @@ internal class Lexer(
}
return null
}

private fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
val length = token.match(input, fromIndex)
if (length == 0) {
traceMismatch(token, fromIndex)
return null
}

val match = TokenMatch(token, fromIndex, length)
traceMatch(token, match)
return match
}

private fun traceMismatch(token: Token, offset: Int) {
traceEvents?.add(TokenMatchingEvent(token, offset, null))
}

private fun traceMatch(token: Token, match: TokenMatch) {
traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch

/**
 * Scannerless tokenizer tries to parse the target token at the given position.
 *
 * It treats the target token as having higher priority than all other tokens.
 */
internal class ScannerlessTokenizer(
    input: String,
    tokens: List<Token>,
    traceTokenMatching: Boolean = false,
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

    // Subset of the token set that is skipped between meaningful tokens (e.g. whitespace).
    private val ignoredTokens = tokens.filter { it.ignored }

    // We cache one mismatch and one match of ignored tokens.
    // This is for the frequent case, when there is exactly one ignored token before the target token.
    // Example:
    //   parser = t1 or t2 or t3, ws = ignored whitespace
    //   input = " t3"
    // In this example, t1 will fail to match at 0, but ws will match at 0, so we cache the match.
    // Then t1 will try to match at 1, but it will fail again, so we try ignored tokens again,
    // but this time we get a mismatch, which we cache separately. This fails the t1 branch of the parser.
    // Now, we backtrack and try t2 at 0, which fails.
    // But we can avoid rematching ws at 0, because we cached this match.
    // Then we try t2 at position 1, which fails. But we don't retry ws, because we cached the mismatch.
    // In the last t3 branch, we try t3 at 0, which fails, but then we skip rematching ws at 0,
    // because it is still cached. Then t3 succeeds at 0, and parsing succeeds.
    private var cacheIgnoredMismatchFromIndex = -1
    private var cachedIgnoredFromIndex: Int = -1
    private var cachedIgnoredTokenMatch: TokenMatch? = null

    // Scannerless mode never matches tokens without a target token in mind,
    // so there is no context-free answer (callers receive null).
    override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null

    /**
     * Tries to match [targetToken] at [fromIndex], skipping any ignored tokens
     * that precede it. Returns `null` when the target cannot be matched even
     * after consuming all leading ignored tokens.
     */
    override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
        var pos = fromIndex
        while (true) {
            // The target token takes priority over ignored tokens at each position.
            matchTarget(pos, targetToken)?.let { return it }

            val ignoredMatch = matchIgnored(pos)
            @Suppress("LiftReturnOrAssignment")
            if (ignoredMatch != null) {
                val posAfterIgnored = ignoredMatch.offset + ignoredMatch.length
                if (posAfterIgnored > pos) {
                    pos = posAfterIgnored
                    continue
                } else {
                    // An ignored token matched, but it did not advance the position.
                    // This should not happen normally, but this is a safeguard.
                    return null
                }
            } else {
                // No ignored tokens matched at the current position either,
                // so it is a mismatch overall
                return null
            }
        }
        // The loop will exit via a mismatch, because no tokens can match "after the end of input"
    }

    /**
     * Matches the first ignored token (in priority order) at [fromIndex],
     * consulting and updating the single-slot match/mismatch caches described above.
     */
    private fun matchIgnored(fromIndex: Int): TokenMatch? {
        require(fromIndex >= 0) { "fromIndex must be non-negative, but was $fromIndex" }

        // Cached-mismatch check must come first: both caches may hold the same index
        // at different times, but a recorded mismatch short-circuits immediately.
        if (fromIndex == cacheIgnoredMismatchFromIndex) {
            return null
        }
        if (fromIndex == cachedIgnoredFromIndex) {
            return cachedIgnoredTokenMatch
        }

        // First ignored token that matches wins (list order defines priority).
        var match: TokenMatch? = null
        for (ignoredToken in ignoredTokens) {
            match = matchImpl(fromIndex, ignoredToken)
            if (match != null) {
                break
            }
        }

        // Record the outcome in the appropriate single-slot cache.
        if (match == null) {
            cacheIgnoredMismatchFromIndex = fromIndex
        } else {
            cachedIgnoredFromIndex = fromIndex
            cachedIgnoredTokenMatch = match
        }
        return match
    }

    // Target-token matches are intentionally not cached: the target differs per call site.
    private fun matchTarget(pos: Int, targetToken: Token) = matchImpl(pos, targetToken)
}
Loading

0 comments on commit fde0d8b

Please sign in to comment.