Scannerless parsing #23

Merged Oct 5, 2023 · 12 commits
1 change: 1 addition & 0 deletions build.gradle.kts
@@ -16,6 +16,7 @@ kotlin {
dependencies {
implementation(kotlin("test"))
implementation("com.willowtreeapps.assertk:assertk:0.26.1")
runtimeOnly(kotlin("reflect"))
}
}
}
37 changes: 29 additions & 8 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
@@ -2,12 +2,19 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token

internal class ChoiceParser<out T>(
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }

internal abstract class AbstractChoiceParser<T>(
val parsers: List<Parser<T>>,
) : ParserImpl<T>(
null,
firstTokens = if (parsers.hasUnknownFirstTokens()) emptySet() else parsers.flatMap { it.firstTokens }.toSet()
) {
)

internal class EagerChoiceParser<T>(
parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

private val parsersByFirstToken: Map<Token, List<Parser<T>>> =
mutableMapOf<Token, MutableList<Parser<T>>>()
@@ -28,6 +35,7 @@ internal class ChoiceParser<out T>(

private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }

@Suppress("DEPRECATION")
override suspend fun ParsingScope.parse(): T {
val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
@@ -37,13 +45,26 @@
}
fail(NoViableAlternative(currentOffset))
}
}

companion object {
private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }
internal class ScannerlessChoiceParser<T>(
parsers: List<Parser<T>>,
) : AbstractChoiceParser<T>(parsers) {

override suspend fun ParsingScope.parse(): T {
for (parser in parsers) {
val r = tryParse(parser)
if (r is ParsedValue) return r.value
}
fail(NoViableAlternative(currentOffset))
}
}

@Suppress("FunctionName")
private fun <T> ChoiceParser(parsers: List<Parser<T>>) =
// EagerChoiceParser can only be used with EagerTokenizer
ScannerlessChoiceParser(parsers)

/**
* Creates a combined parser that will try the receiver parser first,
* and fall back to the other parser in case of a parse error.
@@ -55,8 +76,8 @@ internal class ChoiceParser<out T>(
* ```
*/
infix fun <R> Parser<R>.or(p: Parser<R>): Parser<R> = when {
this is ChoiceParser && p is ChoiceParser -> ChoiceParser(parsers + p.parsers)
this is ChoiceParser -> ChoiceParser(parsers + p)
p is ChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
this is AbstractChoiceParser && p is AbstractChoiceParser -> ChoiceParser(parsers + p.parsers)
this is AbstractChoiceParser -> ChoiceParser(parsers + p)
p is AbstractChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
else -> ChoiceParser(listOf(this, p))
}
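
For orientation, here is how the `or` combinator reads in practice. A minimal sketch, assuming the library's public `Grammar`, `literalToken`, and `parseOrThrow` API; the grammar and token names are illustrative, not from this PR:

```kotlin
import me.alllex.parsus.parser.*
import me.alllex.parsus.token.literalToken

object AnswerGrammar : Grammar<String>() {
    val yes by literalToken("yes")
    val no by literalToken("no")

    // `or` builds a ScannerlessChoiceParser: `yes` is tried first, and on a
    // parse error the input position is rewound before `no` is attempted.
    override val root by parser { (yes or no)().text }
}

fun main() {
    println(AnswerGrammar.parseOrThrow("no")) // prints "no"
}
```

Note that alternatives are flattened: chaining `a or b or c` yields a single choice parser over three alternatives rather than a nested pair, which is exactly what the `AbstractChoiceParser` branches in the `when` above achieve.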
13 changes: 8 additions & 5 deletions src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
@@ -3,6 +3,7 @@ package me.alllex.parsus.parser
import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.EofToken
import me.alllex.parsus.token.Token
import me.alllex.parsus.tokenizer.ScannerlessTokenizer
import me.alllex.parsus.trace.TokenMatchingTrace
import me.alllex.parsus.trace.TracedParseResult
import kotlin.reflect.KProperty
@@ -159,18 +160,20 @@ abstract class Grammar<out V>(

private fun <T> parseEntire(parser: Parser<T>, input: String): ParseResult<T> {
beforeParsing()
val lexer = Lexer(input, _tokens)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens)
val parsingContext = ParsingContext(tokenizer, debugMode)
return parsingContext.runParser(createUntilEofParser(parser))
}

@ExperimentalParsusApi
private fun <T> parseTracingEntire(parser: Parser<T>, input: String): TracedParseResult<T, TokenMatchingTrace> {
beforeParsing()
val lexer = Lexer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(lexer, debugMode)
// If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
val tokenizer = ScannerlessTokenizer(input, _tokens, traceTokenMatching = true)
val parsingContext = ParsingContext(tokenizer, debugMode)
val result = parsingContext.runParser(createUntilEofParser(parser))
val trace = lexer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
val trace = tokenizer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
return TracedParseResult(result, trace)
}
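
The practical effect of the swap: tokens are no longer produced eagerly by global priority; `ScannerlessTokenizer` matches whatever token the active parser asks for, skipping ignored tokens along the way. A sketch of a grammar exercising this, assuming the public API (`regexToken` with its `ignored` flag and `parseOrThrow` are from the library; the grammar itself is illustrative):

```kotlin
import me.alllex.parsus.parser.*
import me.alllex.parsus.token.literalToken
import me.alllex.parsus.token.regexToken

object SumGrammar : Grammar<Int>() {
    val ws by regexToken("\\s+", ignored = true)
    val num by regexToken("\\d+")
    val plus by literalToken("+")

    override val root by parser {
        var total = num().text.toInt()
        // tryParse backtracks on failure, so the loop stops cleanly at EOF.
        while (tryParse(plus) is ParsedValue) {
            total += num().text.toInt()
        }
        total
    }
}

fun main() {
    println(SumGrammar.parseOrThrow("1 + 2 + 39")) // prints 42
}
```

Here each `num()` call makes the tokenizer try `num` at the current position; the whitespace in between is consumed by the ignored-token loop in `ScannerlessTokenizer.findMatchOf`.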

2 changes: 2 additions & 0 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
@@ -26,6 +26,8 @@ abstract class ParseError : ParseResult<Nothing>() {
override fun toString(): String = "ParseError"
}

data class UnmatchedToken(val expected: Token, override val offset: Int) : ParseError()

data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() {
override val offset: Int get() = found.offset
}
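
A sketch of how calling code can discriminate the new error type, assuming `Grammar.parse` returns a `ParseResult` as in the library's public API:

```kotlin
import me.alllex.parsus.parser.*

fun describe(result: ParseResult<String>): String = when (result) {
    is ParsedValue -> "parsed: ${result.value}"
    // New in this PR: the tokenizer could not match the one token the
    // parser expected at this offset.
    is UnmatchedToken -> "expected ${result.expected} at offset ${result.offset}"
    else -> "parse error: $result"
}
```

`MismatchedToken` should no longer occur with the scannerless tokenizer, since the tokenizer only ever matches the requested token; see the TODO in `ParsingContext.tryParse` below.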
16 changes: 10 additions & 6 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
@@ -2,18 +2,21 @@ package me.alllex.parsus.parser

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.tokenizer.Tokenizer
import kotlin.coroutines.Continuation
import kotlin.coroutines.CoroutineContext
import kotlin.coroutines.EmptyCoroutineContext
import kotlin.coroutines.intrinsics.*
import kotlin.coroutines.intrinsics.COROUTINE_SUSPENDED
import kotlin.coroutines.intrinsics.createCoroutineUnintercepted
import kotlin.coroutines.intrinsics.suspendCoroutineUninterceptedOrReturn

/**
* Executes parsers, keeping track of current position in the input and error-continuations.
*
* For each [run][runParser] a new context must be created.
*/
internal class ParsingContext(
private val lexer: Lexer,
private val tokenizer: Tokenizer,
private val debugMode: Boolean = false
) : ParsingScope {

@@ -35,12 +38,12 @@
return result.getOrThrow() as ParseResult<T>
}

override val TokenMatch.text: String get() = lexer.input.substring(offset, offset + length)
override val TokenMatch.text: String get() = tokenizer.input.substring(offset, offset + length)

override val currentOffset: Int get() = position

override val currentToken: TokenMatch?
get() = lexer.findMatch(position)
get() = tokenizer.findContextFreeMatch(position)

override suspend fun <R> Parser<R>.invoke(): R = parse()

@@ -55,8 +58,9 @@

override fun tryParse(token: Token): ParseResult<TokenMatch> {
val fromIndex = this.position
val match = lexer.findMatch(fromIndex)
?: return NoMatchingToken(fromIndex)
val match = tokenizer.findMatchOf(fromIndex, token)
?: return UnmatchedToken(token, fromIndex)
// TODO: clean up, as this should not happen anymore
if (match.token != token) return MismatchedToken(token, match)
this.position = match.offset + match.length
return ParsedValue(match)
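
Because `tryParse` restores the input position when the wrapped parser fails, it is the building block for backtracking combinators like the choice parsers above. A sketch of an optional-value helper in the same style (Parsus may already provide an equivalent; this one is illustrative):

```kotlin
import me.alllex.parsus.parser.*

// Returns the parsed value, or null if `p` fails; the input position is
// left untouched on failure, so parsing can continue with an alternative.
suspend fun <T : Any> ParsingScope.maybe(p: Parser<T>): T? =
    when (val r = tryParse(p)) {
        is ParsedValue -> r.value
        else -> null
    }
```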
1 change: 1 addition & 0 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt
@@ -45,6 +45,7 @@ interface ParsingScope {
/**
* The token at the current offset in the input.
*/
@Deprecated("The new \"scannerless\" parsing approach does not eagerly tokenize the input. The `currentToken` is always null.")
val currentToken: TokenMatch?

/**
41 changes: 41 additions & 0 deletions src/commonMain/kotlin/me/alllex/parsus/tokenizer/AbstractTokenizer.kt
@@ -0,0 +1,41 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

@OptIn(ExperimentalParsusApi::class)
internal abstract class AbstractTokenizer(
override val input: String,
protected val tokens: List<Token>,
traceTokenMatching: Boolean = false,
) : Tokenizer {

private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null

override fun getTokenMatchingTrace(): TokenMatchingTrace? {
return traceEvents?.let { TokenMatchingTrace(input, it) }
}

protected fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
val length = token.match(input, fromIndex)
if (length == 0) {
traceMismatch(token, fromIndex)
return null
}

val match = TokenMatch(token, fromIndex, length)
traceMatch(token, match)
return match
}

private fun traceMismatch(token: Token, offset: Int) {
traceEvents?.add(TokenMatchingEvent(token, offset, null))
}

private fun traceMatch(token: Token, match: TokenMatch) {
traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
}
}
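
`matchImpl` leans on the `Token.match` contract visible in this diff: return the length of the match at `fromIndex`, with `0` meaning mismatch (so zero-length matches are indistinguishable from failures). A hedged sketch of a hand-written token against that contract; the `Token` base-class constructor and the exact `match` signature are assumptions:

```kotlin
import me.alllex.parsus.token.Token

// Matches a run of ASCII digits. Returning 0 when there is no digit at
// fromIndex is what matchImpl above records as a mismatch.
class DigitsToken : Token() {
    override fun match(input: CharSequence, fromIndex: Int): Int {
        var i = fromIndex
        while (i < input.length && input[i].isDigit()) i++
        return i - fromIndex
    }
}
```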
src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt → src/commonMain/kotlin/me/alllex/parsus/tokenizer/EagerTokenizer.kt
@@ -1,28 +1,24 @@
package me.alllex.parsus.parser
package me.alllex.parsus.tokenizer

import me.alllex.parsus.annotations.ExperimentalParsusApi
import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch
import me.alllex.parsus.trace.TokenMatchingEvent
import me.alllex.parsus.trace.TokenMatchingTrace

/**
* Lexer is responsible for [finding][findMatch] token-matches in the given position
* in the input string.
* This tokenizer eagerly tries to match tokens from the input,
* based on the full token set ordered by priority.
* It deterministically matches tokens from the input,
* not taking into account tokens expected by parsers.
*/
@OptIn(ExperimentalParsusApi::class)
internal class Lexer(
val input: String,
private val tokens: List<Token>,
internal class EagerTokenizer(
input: String,
tokens: List<Token>,
traceTokenMatching: Boolean = false,
) {
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

private val tokensByFirstChar: Map<Char, List<Token>>
private var cachedFromIndex: Int = -1
private var cachedTokenMatch: TokenMatch? = null

private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null

init {
tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
val unknownFirstCharTokens = mutableListOf<Token>()
@@ -43,11 +39,15 @@ internal class Lexer(
}
}

internal fun getTokenMatchingTrace(): TokenMatchingTrace? {
return traceEvents?.let { TokenMatchingTrace(input, it) }
override fun findContextFreeMatch(fromIndex: Int): TokenMatch? {
return findMatchCaching(fromIndex)
}

override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
return findMatchCaching(fromIndex)
}

fun findMatch(fromIndex: Int): TokenMatch? {
private fun findMatchCaching(fromIndex: Int): TokenMatch? {
if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
return cachedTokenMatch
}
@@ -85,24 +85,4 @@
}
return null
}

private fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
val length = token.match(input, fromIndex)
if (length == 0) {
traceMismatch(token, fromIndex)
return null
}

val match = TokenMatch(token, fromIndex, length)
traceMatch(token, match)
return match
}

private fun traceMismatch(token: Token, offset: Int) {
traceEvents?.add(TokenMatchingEvent(token, offset, null))
}

private fun traceMatch(token: Token, match: TokenMatch) {
traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
}
}
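
The KDoc above is the crux of the PR: the eager tokenizer picks tokens by global priority without knowing what the parser expects, which breaks down when token patterns overlap. A sketch of the classic keyword-versus-identifier case (illustrative names; the eager behavior is described per the KDoc, not tested here):

```kotlin
import me.alllex.parsus.parser.*
import me.alllex.parsus.token.literalToken
import me.alllex.parsus.token.regexToken

object DeclGrammar : Grammar<String>() {
    val kwVal by literalToken("val")  // registered first, so higher priority
    val id by regexToken("[a-z]+")

    // Parsing `id` against the input "value":
    // - EagerTokenizer can emit the keyword "val" (global priority), after
    //   which the `id` parser fails with a MismatchedToken.
    // - ScannerlessTokenizer is asked for `id` directly and matches "value".
    override val root by parser { id().text }
}
```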
92 changes: 92 additions & 0 deletions src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
@@ -0,0 +1,92 @@
package me.alllex.parsus.tokenizer

import me.alllex.parsus.token.Token
import me.alllex.parsus.token.TokenMatch

/**
* Scannerless tokenizer tries to parse the target token at the given position.
*
* It treats the target token as having higher priority than all other tokens.
*/
internal class ScannerlessTokenizer(
input: String,
tokens: List<Token>,
traceTokenMatching: Boolean = false,
) : AbstractTokenizer(input, tokens, traceTokenMatching) {

private val ignoredTokens = tokens.filter { it.ignored }

// We cache one mismatch and one match of ignored tokens.
// This covers the frequent case when there is exactly one ignored token before the target token.
// Example:
// parser = t1 or t2 or t3, ws = ignored whitespace
// input = " t3"
// In this example, t1 will fail to match at 0, but ws will match at 0, so we cache the match.
// Then t1 will try to match at 1, but it will fail again, so we try ignored tokens again,
// but this time we get a mismatch, which we cache separately. This fails the t1 branch of the parser.
// Now, we backtrack and try t2 at 0, which fails.
// But we can avoid rematching ws at 0, because we cached this match.
// Then we try t2 at position 1, which fails. But we don't retry ws, because we cached the mismatch.
// In the last t3 branch, we try t3 at 0, which fails, but then we skip rematching ws at 0,
// because it is still cached. Then t3 succeeds at 1, and parsing succeeds.
private var cacheIgnoredMismatchFromIndex = -1
private var cachedIgnoredFromIndex: Int = -1
private var cachedIgnoredTokenMatch: TokenMatch? = null

override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null

override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
var pos = fromIndex
while (true) {
matchTarget(pos, targetToken)?.let { return it }

val ignoredMatch = matchIgnored(pos)
@Suppress("LiftReturnOrAssignment")
if (ignoredMatch != null) {
val posAfterIgnored = ignoredMatch.offset + ignoredMatch.length
if (posAfterIgnored > pos) {
pos = posAfterIgnored
continue
} else {
// An ignored token matched, but it did not advance the position.
// This should not happen normally, but this is a safeguard.
return null
}
} else {
// No ignored tokens matched at the current position either,
// so it is a mismatch overall
return null
}
}
// The loop will exit via a mismatch, because no tokens can match "after the end of input"
}

private fun matchIgnored(fromIndex: Int): TokenMatch? {
require(fromIndex >= 0) { "fromIndex must be non-negative, but was $fromIndex" }

if (fromIndex == cacheIgnoredMismatchFromIndex) {
return null
}
if (fromIndex == cachedIgnoredFromIndex) {
return cachedIgnoredTokenMatch
}

var match: TokenMatch? = null
for (ignoredToken in ignoredTokens) {
match = matchImpl(fromIndex, ignoredToken)
if (match != null) {
break
}
}

if (match == null) {
cacheIgnoredMismatchFromIndex = fromIndex
} else {
cachedIgnoredFromIndex = fromIndex
cachedIgnoredTokenMatch = match
}
return match
}

private fun matchTarget(pos: Int, targetToken: Token) = matchImpl(pos, targetToken)
}
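
To make the caching scenario concrete, here is the comment's example expressed as a grammar (the `t1`/`t2`/`t3` names come from the comment; the rest assumes the library's public API):

```kotlin
import me.alllex.parsus.parser.*
import me.alllex.parsus.token.literalToken
import me.alllex.parsus.token.regexToken

object ThreeWay : Grammar<String>() {
    val ws by regexToken("\\s+", ignored = true)
    val t1 by literalToken("t1")
    val t2 by literalToken("t2")
    val t3 by literalToken("t3")

    // For input " t3": the failing t1 and t2 branches probe ws at offsets
    // 0 and 1; the cached match at 0 and cached mismatch at 1 make those
    // probes constant-time instead of re-running every ignored token.
    override val root by parser { (t1 or t2 or t3)().text }
}

fun main() {
    println(ThreeWay.parseOrThrow(" t3")) // prints "t3"
}
```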