Skip to content

Commit

Permalink
Merge pull request #23 from alllex/scannerless-parsing
Browse files Browse the repository at this point in the history
Scannerless parsing
  • Loading branch information
alllex committed Oct 5, 2023
2 parents 725491b + 4f6501d commit fde0d8b
Show file tree
Hide file tree
Showing 17 changed files with 436 additions and 96 deletions.
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ kotlin {
dependencies {
implementation(kotlin("test"))
implementation("com.willowtreeapps.assertk:assertk:0.26.1")
runtimeOnly(kotlin("reflect"))
}
}
}
Expand Down
37 changes: 29 additions & 8 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token

internal class ChoiceParser<out T>(
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }

internal abstract class AbstractChoiceParser<T>(
val parsers: List<Parser<T>>,
) : ParserImpl<T>(
null,
firstTokens = if (parsers.hasUnknownFirstTokens()) emptySet() else parsers.flatMap { it.firstTokens }.toSet()
) {
)

internal class EagerChoiceParser<T>(
parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

private val parsersByFirstToken: Map<Token, List<Parser<T>>> =
mutableMapOf<Token, MutableList<Parser<T>>>()
Expand All @@ -28,6 +35,7 @@ internal class ChoiceParser<out T>(

private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }

@Suppress("DEPRECATION")
override suspend fun ParsingScope.parse(): T {
val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
Expand All @@ -37,13 +45,26 @@ internal class ChoiceParser<out T>(
}
fail(NoViableAlternative(currentOffset))
}
}

companion object {
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }
internal class ScannerlessChoiceParser<T>(
    parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

    /**
     * Tries each alternative in declaration order and commits to the first one
     * that parses successfully; fails with [NoViableAlternative] when none do.
     */
    override suspend fun ParsingScope.parse(): T {
        parsers.forEach { alternative ->
            val attempt = tryParse(alternative)
            if (attempt is ParsedValue) return attempt.value
        }
        fail(NoViableAlternative(currentOffset))
    }
}

@Suppress("FunctionName")
private fun <T> ChoiceParser(parsers: List<Parser<T>>): ScannerlessChoiceParser<T> {
    // Note: EagerChoiceParser is only compatible with EagerTokenizer,
    // so the scannerless variant is constructed here.
    return ScannerlessChoiceParser(parsers)
}

/**
* Creates a combined parser that will try the receiver parser first,
* and fall back to the other parser in case of a parse error.
Expand All @@ -55,8 +76,8 @@ internal class ChoiceParser<out T>(
* ```
*/
infix fun <R> Parser<R>.or(p: Parser<R>): Parser<R> = when {
this is ChoiceParser && p is ChoiceParser -> ChoiceParser(parsers + p.parsers)
this is ChoiceParser -> ChoiceParser(parsers + p)
p is ChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
this is AbstractChoiceParser && p is AbstractChoiceParser -> ChoiceParser(parsers + p.parsers)
this is AbstractChoiceParser -> ChoiceParser(parsers + p)
p is AbstractChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
else -> ChoiceParser(listOf(this, p))
}
13 changes: 8 additions & 5 deletions src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package me.alllex.parsus.parser
import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.EofToken
import me.alllex.parsus.token.Token
import me.alllex.parsus.tokenizer.ScannerlessTokenizer
import me.alllex.parsus.trace.TokenMatchingTrace
import me.alllex.parsus.trace.TracedParseResult
import kotlin.reflect.KProperty
Expand Down Expand Up @@ -159,18 +160,20 @@ abstract class Grammar<out V>(

private fun <T> parseEntire(parser: Parser<T>, input: String): ParseResult<T> {
beforeParsing()
val lexer = Lexer(input, _tokens)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens)
val parsingContext = ParsingContext(tokenizer, debugMode)
return parsingContext.runParser(createUntilEofParser(parser))
}

@ExperimentalParsusApi
private fun <T> parseTracingEntire(parser: Parser<T>, input: String): TracedParseResult<T, TokenMatchingTrace> {
beforeParsing()
val lexer = Lexer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(tokenizer, debugMode)
val result = parsingContext.runParser(createUntilEofParser(parser))
val trace = lexer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
val trace = tokenizer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
return TracedParseResult(result, trace)
}

Expand Down
2 changes: 2 additions & 0 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ abstract class ParseError : ParseResult<Nothing>() {
override fun toString(): String = "ParseError"
}

data class UnmatchedToken(val expected: Token, override val offset: Int) : ParseError()

data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() {
override val offset: Int get() = found.offset
}
Expand Down
16 changes: 10 additions & 6 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.tokenizer.Tokenizer
import kotlin.coroutines.Continuation
import kotlin.coroutines.CoroutineContext
import kotlin.coroutines.EmptyCoroutineContext
import kotlin.coroutines.intrinsics.*
import kotlin.coroutines.intrinsics.COROUTINE_SUSPENDED
import kotlin.coroutines.intrinsics.createCoroutineUnintercepted
import kotlin.coroutines.intrinsics.suspendCoroutineUninterceptedOrReturn

/**
* Executes parsers, keeping track of current position in the input and error-continuations.
*
* For each [run][runParser] a new context must be created.
*/
internal class ParsingContext(
private val lexer: Lexer,
private val tokenizer: Tokenizer,
private val debugMode: Boolean = false
) : ParsingScope {

Expand All @@ -35,12 +38,12 @@ internal class ParsingContext(
return result.getOrThrow() as ParseResult<T>
}

override val TokenMatch.text: String get() = lexer.input.substring(offset, offset + length)
override val TokenMatch.text: String get() = tokenizer.input.substring(offset, offset + length)

override val currentOffset: Int get() = position

override val currentToken: TokenMatch?
get() = lexer.findMatch(position)
get() = tokenizer.findContextFreeMatch(position)

override suspend fun <R> Parser<R>.invoke(): R = parse()

Expand All @@ -55,8 +58,9 @@ internal class ParsingContext(

override fun tryParse(token: Token): ParseResult<TokenMatch> {
val fromIndex = this.position
val match = lexer.findMatch(fromIndex)
?: return NoMatchingToken(fromIndex)
val match = tokenizer.findMatchOf(fromIndex, token)
?: return UnmatchedToken(token, fromIndex)
// TODO: clean up, as this should not happen anymore
if (match.token != token) return MismatchedToken(token, match)
this.position = match.offset + match.length
return ParsedValue(match)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ interface ParsingScope {
/**
* The token at the current offset in the input.
*/
@Deprecated("The new \"scannerless\" parsing approach does not eagerly tokenize the input. The `currentToken` is always null.")
val currentToken: TokenMatch?

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

@OptIn(ExperimentalParsusApi::class)
internal abstract class AbstractTokenizer(
    override val input: String,
    protected val tokens: List<Token>,
    traceTokenMatching: Boolean = false,
) : Tokenizer {

    // Allocated only when tracing was requested; null means tracing is disabled.
    private val traceEvents: MutableList<TokenMatchingEvent>? =
        if (traceTokenMatching) mutableListOf() else null

    override fun getTokenMatchingTrace(): TokenMatchingTrace? =
        traceEvents?.let { TokenMatchingTrace(input, it) }

    /**
     * Attempts to match [token] against [input] starting at [fromIndex].
     *
     * A zero-length result from [Token.match] is treated as a mismatch and
     * yields `null`. Every attempt (hit or miss) is recorded in the trace
     * when tracing is enabled.
     */
    protected fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
        val matchedLength = token.match(input, fromIndex)
        if (matchedLength == 0) {
            traceEvents?.add(TokenMatchingEvent(token, fromIndex, null))
            return null
        }
        return TokenMatch(token, fromIndex, matchedLength).also { m ->
            traceEvents?.add(TokenMatchingEvent(token, m.offset, m))
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
package me.alllex.parsus.parser
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

/**
* Lexer is responsible for [finding][findMatch] token-matches in the given position
* in the input string.
* This tokenizer eagerly tries to match tokens from the input,
* based on the full token set ordered by priority.
* It deterministically matches tokens from the input,
* not taking into account tokens expected by parsers.
*/
@OptIn(ExperimentalParsusApi::class)
internal class Lexer(
val input: String,
private val tokens: List<Token>,
internal class EagerTokenizer(
input: String,
tokens: List<Token>,
traceTokenMatching: Boolean = false,
) {
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

private val tokensByFirstChar: Map<Char, List<Token>>
private var cachedFromIndex: Int = -1
private var cachedTokenMatch: TokenMatch? = null

private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null

init {
tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
val unknownFirstCharTokens = mutableListOf<Token>()
Expand All @@ -43,11 +39,15 @@ internal class Lexer(
}
}

internal fun getTokenMatchingTrace(): TokenMatchingTrace? {
return traceEvents?.let { TokenMatchingTrace(input, it) }
override fun findContextFreeMatch(fromIndex: Int): TokenMatch? {
return findMatchCaching(fromIndex)
}

override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
return findMatchCaching(fromIndex)
}

fun findMatch(fromIndex: Int): TokenMatch? {
private fun findMatchCaching(fromIndex: Int): TokenMatch? {
if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
return cachedTokenMatch
}
Expand Down Expand Up @@ -85,24 +85,4 @@ internal class Lexer(
}
return null
}

private fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
val length = token.match(input, fromIndex)
if (length == 0) {
traceMismatch(token, fromIndex)
return null
}

val match = TokenMatch(token, fromIndex, length)
traceMatch(token, match)
return match
}

private fun traceMismatch(token: Token, offset: Int) {
traceEvents?.add(TokenMatchingEvent(token, offset, null))
}

private fun traceMatch(token: Token, match: TokenMatch) {
traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch

/**
 * Scannerless tokenizer tries to parse the target token at the given position.
 *
 * It treats the target token as having higher priority than all other tokens.
 */
internal class ScannerlessTokenizer(
    input: String,
    tokens: List<Token>,
    traceTokenMatching: Boolean = false,
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

    // Subset of the token set that is skipped between meaningful tokens (e.g. whitespace).
    private val ignoredTokens = tokens.filter { it.ignored }

    // We cache one mismatch and one match of ignored tokens.
    // This is for the frequent case, when there is exactly one ignored token before the target token.
    // Example:
    //   parser = t1 or t2 or t3, ws = ignored whitespace
    //   input = " t3"
    // In this example, t1 will fail to match at 0, but ws will match at 0, so we cache the match.
    // Then t1 will try to match at 1, but it will fail again, so we try ignored tokens again,
    // but this time we get a mismatch, which we cache separately. This fails the t1 branch of the parser.
    // Now, we backtrack and try t2 at 0, which fails.
    // But we can avoid rematching ws at 0, because we cached this match.
    // Then we try t2 at position 1, which fails. But we don't retry ws, because we cached the mismatch.
    // In the last t3 branch, we try t3 at 0, which fails, but then we skip rematching ws at 0,
    // because it is still cached. Then t3 succeeds at 0, and parsing succeeds.
    private var cacheIgnoredMismatchFromIndex = -1
    private var cachedIgnoredFromIndex: Int = -1
    private var cachedIgnoredTokenMatch: TokenMatch? = null

    // Scannerless mode never matches tokens without a target token in mind,
    // so there is no context-free answer (callers receive null).
    override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null

    /**
     * Tries to match [targetToken] at [fromIndex], skipping any ignored tokens
     * that precede it. Returns `null` when the target cannot be matched even
     * after consuming all leading ignored tokens.
     */
    override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
        var pos = fromIndex
        while (true) {
            // The target token takes priority over ignored tokens at each position.
            matchTarget(pos, targetToken)?.let { return it }

            val ignoredMatch = matchIgnored(pos)
            @Suppress("LiftReturnOrAssignment")
            if (ignoredMatch != null) {
                val posAfterIgnored = ignoredMatch.offset + ignoredMatch.length
                if (posAfterIgnored > pos) {
                    pos = posAfterIgnored
                    continue
                } else {
                    // An ignored token matched, but it did not advance the position.
                    // This should not happen normally, but this is a safeguard.
                    return null
                }
            } else {
                // No ignored tokens matched at the current position either,
                // so it is a mismatch overall
                return null
            }
        }
        // The loop will exit via a mismatch, because no tokens can match "after the end of input"
    }

    /**
     * Matches the first ignored token (in priority order) at [fromIndex],
     * consulting and updating the single-slot match/mismatch caches described above.
     */
    private fun matchIgnored(fromIndex: Int): TokenMatch? {
        require(fromIndex >= 0) { "fromIndex must be non-negative, but was $fromIndex" }

        // Cached-mismatch check must come first: both caches may hold the same index
        // at different times, but a recorded mismatch short-circuits immediately.
        if (fromIndex == cacheIgnoredMismatchFromIndex) {
            return null
        }
        if (fromIndex == cachedIgnoredFromIndex) {
            return cachedIgnoredTokenMatch
        }

        // First ignored token that matches wins (list order defines priority).
        var match: TokenMatch? = null
        for (ignoredToken in ignoredTokens) {
            match = matchImpl(fromIndex, ignoredToken)
            if (match != null) {
                break
            }
        }

        // Record the outcome in the appropriate single-slot cache.
        if (match == null) {
            cacheIgnoredMismatchFromIndex = fromIndex
        } else {
            cachedIgnoredFromIndex = fromIndex
            cachedIgnoredTokenMatch = match
        }
        return match
    }

    // Target-token matches are intentionally not cached: the target differs per call site.
    private fun matchTarget(pos: Int, targetToken: Token) = matchImpl(pos, targetToken)
}
Loading

0 comments on commit fde0d8b

Please sign in to comment.