From d9604e025d100bf9ecdcf5c2794afbb65086126b Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Wed, 4 Oct 2023 08:58:20 +0200
Subject: [PATCH 01/11] Implement scannerless parsing

---
 .../me/alllex/parsus/parser/ChoiceParser.kt   |  5 ++-
 .../kotlin/me/alllex/parsus/parser/Lexer.kt   | 37 +++++++++++++++----
 .../me/alllex/parsus/parser/ParseResult.kt    |  2 +
 .../me/alllex/parsus/parser/ParsingContext.kt | 12 ++++--
 4 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
index 0cf8c46..4dbbdbc 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
@@ -29,8 +29,9 @@ internal class ChoiceParser<T>(
     private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }
 
     override suspend fun ParsingScope.parse(): T {
-        val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
-        val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
+        // TODO: clean up
+//        val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
+//        val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
         for (parser in parsers) {
             val r = tryParse(parser)
             if (r is ParsedValue) return r.value
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
index 87ed8b4..74da9bc 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
@@ -12,9 +12,10 @@ internal class Lexer(
     private val tokens: List<Token>,
 ) {
 
+    private val ignoredTokens = tokens.filter { it.ignored }
     private val tokensByFirstChar: Map<Char, List<Token>>
-    private var cachedFromIndex: Int = -1
-    private var cachedTokenMatch: TokenMatch? = null
+//    private var cachedFromIndex: Int = -1
+//    private var cachedTokenMatch: TokenMatch? = null
 
     init {
         tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
@@ -36,14 +37,36 @@ internal class Lexer(
         }
     }
 
-    fun findMatch(fromIndex: Int): TokenMatch? {
-        if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
-            return cachedTokenMatch
+    fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
+        var pos = fromIndex
+        while (true) {
+            matchImpl(pos, targetToken)?.let { return it }
+
+            val preIgnorePos = pos
+            for (ignoredToken in ignoredTokens) {
+                val ignoredMatch = matchImpl(pos, ignoredToken)
+                if (ignoredMatch != null) {
+                    pos = ignoredMatch.offset + ignoredMatch.length
+                    break
+                }
+            }
+
+            if (preIgnorePos == pos) {
+                // No ignored tokens matched, so we can't find the target token
+                return null
+            }
         }
+        // The loop will exit via a mismatch, because no tokens can match "after the end of input"
+    }
+
+    fun findMatch(fromIndex: Int): TokenMatch? {
+//        if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
+//            return cachedTokenMatch
+//        }
 
         val foundTokenMatch = findMatchIgnoring(fromIndex)
-        cachedFromIndex = fromIndex
-        cachedTokenMatch = foundTokenMatch
+//        cachedFromIndex = fromIndex
+//        cachedTokenMatch = foundTokenMatch
         return foundTokenMatch
     }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
index e1f7da0..a0bc858 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
@@ -26,6 +26,8 @@ abstract class ParseError : ParseResult<Nothing>() {
     override fun toString(): String = "ParseError"
 }
 
+data class UnmatchedToken(val expected: Token, override val offset: Int) : ParseError()
+
 data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() {
     override val offset: Int get() = found.offset
 }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
index 2bb99dd..c002631 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
@@ -5,7 +5,9 @@ import me.alllex.parsus.token.TokenMatch
 import kotlin.coroutines.Continuation
 import kotlin.coroutines.CoroutineContext
 import kotlin.coroutines.EmptyCoroutineContext
-import kotlin.coroutines.intrinsics.*
+import kotlin.coroutines.intrinsics.COROUTINE_SUSPENDED
+import kotlin.coroutines.intrinsics.createCoroutineUnintercepted
+import kotlin.coroutines.intrinsics.suspendCoroutineUninterceptedOrReturn
 
 /**
  * Executes parsers, keeping track of current position in the input and error-continuations.
@@ -39,8 +41,9 @@ internal class ParsingContext(
 
     override val currentOffset: Int get() = position
 
+    // TODO: clean up
     override val currentToken: TokenMatch?
-        get() = lexer.findMatch(position)
+        get() = null // lexer.findMatch(position)
 
     override suspend fun <R> Parser<R>.invoke(): R = parse()
 
@@ -55,8 +58,9 @@ internal class ParsingContext(
 
     override fun tryParse(token: Token): ParseResult<TokenMatch> {
         val fromIndex = this.position
-        val match = lexer.findMatch(fromIndex)
-            ?: return NoMatchingToken(fromIndex)
+        val match = lexer.findMatchOf(fromIndex, token)
+            ?: return UnmatchedToken(token, fromIndex)
+        // TODO: clean up, as this should not happen anymore
         if (match.token != token) return MismatchedToken(token, match)
         this.position = match.offset + match.length
         return ParsedValue(match)
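The core idea of the patch above, restated as a compact standalone sketch (names and signatures here are illustrative, not part of the library): instead of eagerly scanning for whatever token happens to match next, the lexer is asked for the specific token the parser currently expects, and ignored tokens are skipped one at a time between attempts. Assuming a matcher is modeled as a function from a position to a match length, with 0 meaning "no match" (mirroring `Token.match`):

```
// Illustrative model of Lexer.findMatchOf from the patch above.
fun findMatchOf(fromIndex: Int, target: (Int) -> Int, ignored: List<(Int) -> Int>): IntRange? {
    var pos = fromIndex
    while (true) {
        val len = target(pos)
        if (len > 0) return pos until pos + len // the expected token always gets the first try
        // No target match here: skip a single ignored token (e.g. whitespace) and retry.
        val skipped = ignored.firstNotNullOfOrNull { m -> m(pos).takeIf { it > 0 } } ?: return null
        pos += skipped
    }
}
```

This is what gives the expected token the highest effective priority: it is re-tried at every position before anything else, and the loop terminates because past the end of input neither the target token nor any ignored token can match.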
From c72512106a8d06b40013fb3d93165d70854506b5 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Wed, 4 Oct 2023 08:58:46 +0200
Subject: [PATCH 02/11] Fix tests

---
 .../kotlin/me/alllex/parsus/GrammarTests.kt     |  2 +-
 .../kotlin/me/alllex/parsus/IgnoreCaseTests.kt  |  4 ++--
 src/commonTest/kotlin/me/alllex/parsus/Tests.kt | 14 +++++++-------
 .../kotlin/me/alllex/parsus/TokenTests.kt       | 14 ++++++++++++++
 src/commonTest/kotlin/me/alllex/parsus/util.kt  |  8 ++++++++
 5 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt b/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
index 09db2f7..34ecc98 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
@@ -75,7 +75,7 @@ class GrammarTests {
         override val root = parser { lexeme(a) }
     }.run {
         assertParsed("a").isEqualTo(a.lex())
-        assertThatParsing("b").failedWithTokenMismatch(a, b, 0)
+        assertThatParsing("b").failedWithUnmatchedToken(a, 0)
         assertThat(parse(nonRootParser, "b").getOrThrow()).isEqualTo(b.lex())
     }
 }
diff --git a/src/commonTest/kotlin/me/alllex/parsus/IgnoreCaseTests.kt b/src/commonTest/kotlin/me/alllex/parsus/IgnoreCaseTests.kt
index dd47672..c0e944d 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/IgnoreCaseTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/IgnoreCaseTests.kt
@@ -2,7 +2,7 @@ package me.alllex.parsus
 
 import assertk.assertions.isEqualTo
 import me.alllex.parsus.parser.Grammar
-import me.alllex.parsus.parser.NoMatchingToken
+import me.alllex.parsus.parser.NoViableAlternative
 import me.alllex.parsus.parser.or
 import me.alllex.parsus.parser.parser
 import me.alllex.parsus.token.literalToken
@@ -72,7 +72,7 @@ class IgnoreCaseTests {
         assertParsed("f").isEqualTo(lam.lex("f"))
         assertParsed("F").isEqualTo(lam.lex("F"))
         assertParsed("g").isEqualTo(lamStrict.lex("g"))
-        assertNotParsed("G").failedWith(NoMatchingToken(0))
+        assertNotParsed("G").failedWith(NoViableAlternative(0))
     }
 }
diff --git a/src/commonTest/kotlin/me/alllex/parsus/Tests.kt b/src/commonTest/kotlin/me/alllex/parsus/Tests.kt
index 8718a5c..fdfd52e 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/Tests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/Tests.kt
@@ -21,9 +21,9 @@ class Tests {
             override val root = parser { node(lexeme(a) + lexeme(b)) }
         }
 
-        assertThat(g.parse("bb")).failedWithTokenMismatch(expected = g.a, actual = g.b, offset = 0)
-        assertThat(g.parse("aa")).failedWithTokenMismatch(expected = g.b, actual = EofToken, offset = 2)
-        assertThat(g.parse("aabbaa")).failedWithTokenMismatch(expected = EofToken, actual = g.a, offset = 4)
+        assertThat(g.parse("bb")).failedWithUnmatchedToken(expected = g.a, offset = 0)
+        assertThat(g.parse("aa")).failedWithUnmatchedToken(expected = g.b, offset = 2)
+        assertThat(g.parse("aabbaa")).failedWithUnmatchedToken(expected = EofToken, offset = 4)
     }
 
     @Test
@@ -256,7 +256,7 @@ class Tests {
         }
     }.run {
         assertParsed("ab").isEqualTo(b.lex(1))
-        assertThatParsing("b").failedWithTokenMismatch(a, b, offset = 0)
+        assertThatParsing("b").failedWithUnmatchedToken(a, offset = 0)
     }
 }
 
@@ -275,7 +275,7 @@ class Tests {
         assertParsed("a").isEqualTo(true to false)
         assertParsed("b").isEqualTo(false to true)
         assertParsed("").isEqualTo(false to false)
-        assertThatParsing("aa").failedWithTokenMismatch(EofToken, a, offset = 1)
+        assertThatParsing("aa").failedWithUnmatchedToken(EofToken, offset = 1)
     }
 }
 
@@ -291,7 +291,7 @@ class Tests {
         assertThat(g.parseOrThrow("b")).isEqualTo(node(g.b))
         assertThat(g.parseOrThrow("ab")).isEqualTo(node(g.a, g.b))
         assertThat(g.parseOrThrow("aab")).isEqualTo(node(g.a, g.a, g.b))
-        assertThat(g.parse("")).failedWithTokenMismatch(g.b, EofToken, offset = 0)
+        assertThat(g.parse("")).failedWithUnmatchedToken(g.b, offset = 0)
     }
 }
 
@@ -322,7 +322,7 @@ class Tests {
         assertThat(g.parseOrThrow("baab")).isEqualTo(node(g.b, g.a, g.a, g.b))
         assertThat(g.parseOrThrow("baaab")).isEqualTo(node(g.b, g.a, g.a, g.a, g.b))
         assertThat(g.parse("bab")).failedWithNotEnoughRepetition(1, 2, 1)
-        assertThat(g.parse("baaaab")).failedWithTokenMismatch(g.b, g.a, offset = 4)
+        assertThat(g.parse("baaaab")).failedWithUnmatchedToken(g.b, offset = 4)
     }
 
     object : Grammar<SyntaxTree>() {
diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
index 776a8d2..d89ee04 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
@@ -4,6 +4,7 @@ import assertk.assertions.isEqualTo
 import me.alllex.parsus.parser.Grammar
 import me.alllex.parsus.parser.map
 import me.alllex.parsus.parser.or
+import me.alllex.parsus.token.TokenMatch
 import me.alllex.parsus.token.literalToken
 import me.alllex.parsus.token.regexToken
 import kotlin.test.Test
@@ -21,4 +22,17 @@ class TokenTests {
         }
     }
 
+    @Test
+    fun tokenPriorityIsDrivenByParser() {
+        object : Grammar<TokenMatch>() {
+            val single by literalToken("<")
+            val double by literalToken("<<")
+
+            // even though single token is declared first, it is not matched first
+            override val root by double or single
+        }.run {
+            assertParsed("<<").isEqualTo(TokenMatch(double, 0, 2))
+        }
+    }
+
 }
diff --git a/src/commonTest/kotlin/me/alllex/parsus/util.kt b/src/commonTest/kotlin/me/alllex/parsus/util.kt
index 8557668..e7436a8 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/util.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/util.kt
@@ -82,3 +82,11 @@ fun Assert<ParseResult<*>>.failedWithTokenMismatch(expected: Token, actual:
         }
     }
 }
+
+fun Assert<ParseResult<*>>.failedWithUnmatchedToken(expected: Token, offset: Int) {
+    isInstanceOf(UnmatchedToken::class)
+        .all {
+            prop("expected token", UnmatchedToken::expected).isEqualTo(expected)
+            prop("offset", UnmatchedToken::offset).isEqualTo(offset)
+        }
+}

From 3857b9eefdab937d261002aa40fe58c9cd1d13bd Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Wed, 4 Oct 2023 09:24:38 +0200
Subject: [PATCH 03/11] Add reflect to tests for better error reporting

---
 build.gradle.kts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/build.gradle.kts b/build.gradle.kts
index 388d773..220281c 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -16,6 +16,7 @@ kotlin {
             dependencies {
                 implementation(kotlin("test"))
                 implementation("com.willowtreeapps.assertk:assertk:0.26.1")
+                runtimeOnly(kotlin("reflect"))
             }
         }
     }
From 04d32685e342b01b161eb2673e8611047d4df16c Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 21:36:37 +0200
Subject: [PATCH 04/11] Refactor tokenizers and switch to Scannerless

---
 .../me/alllex/parsus/parser/ChoiceParser.kt   |  41 ++++--
 .../kotlin/me/alllex/parsus/parser/Grammar.kt |  13 +-
 .../kotlin/me/alllex/parsus/parser/Lexer.kt   | 131 ------------------
 .../me/alllex/parsus/parser/ParsingContext.kt |  10 +-
 .../parsus/tokenizer/AbstractTokenizer.kt     |  41 ++++++
 .../alllex/parsus/tokenizer/EagerTokenizer.kt |  88 ++++++++++++
 .../parsus/tokenizer/ScannerlessTokenizer.kt  |  42 ++++++
 .../me/alllex/parsus/tokenizer/Tokenizer.kt   |  14 ++
 .../alllex/parsus/trace/TokenMatchingTrace.kt |   2 +-
 .../alllex/parsus/TokenMatchingTraceTest.kt   |  69 +++++----
 10 files changed, 271 insertions(+), 180 deletions(-)
 delete mode 100644 src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
 create mode 100644 src/commonMain/kotlin/me/alllex/parsus/tokenizer/AbstractTokenizer.kt
 create mode 100644 src/commonMain/kotlin/me/alllex/parsus/tokenizer/EagerTokenizer.kt
 create mode 100644 src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
 create mode 100644 src/commonMain/kotlin/me/alllex/parsus/tokenizer/Tokenizer.kt

diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
index 4dbbdbc..7183f63 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
@@ -2,12 +2,19 @@ package me.alllex.parsus.parser
 
 import me.alllex.parsus.token.Token
 
-internal class ChoiceParser<T>(
+private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
+private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }
+
+internal abstract class AbstractChoiceParser<T>(
     val parsers: List<Parser<T>>,
 ) : ParserImpl<T>(
     null,
     firstTokens = if (parsers.hasUnknownFirstTokens()) emptySet() else parsers.flatMap { it.firstTokens }.toSet()
-) {
+)
+
+internal class EagerChoiceParser<T>(
+    parsers: List<Parser<T>>,
+) : AbstractChoiceParser<T>(parsers) {
 
     private val parsersByFirstToken: Map<Token, List<Parser<T>>> =
         mutableMapOf<Token, MutableList<Parser<T>>>()
@@ -29,8 +36,34 @@ internal class ChoiceParser<T>(
     private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }
 
     override suspend fun ParsingScope.parse(): T {
-        // TODO: clean up
-//        val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
-//        val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
+        val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
+        val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
         for (parser in parsers) {
             val r = tryParse(parser)
             if (r is ParsedValue) return r.value
         }
         fail(NoViableAlternative(currentOffset))
     }
+}
 
-    companion object {
-        private fun Parser<*>.hasUnknownFirstTokens() = firstTokens.isEmpty()
-        private fun List<Parser<*>>.hasUnknownFirstTokens() = any { it.hasUnknownFirstTokens() }
+internal class ScannerlessChoiceParser<T>(
+    parsers: List<Parser<T>>,
+) : AbstractChoiceParser<T>(parsers) {
+
+    override suspend fun ParsingScope.parse(): T {
+        for (parser in parsers) {
+            val r = tryParse(parser)
+            if (r is ParsedValue) return r.value
+        }
+        fail(NoViableAlternative(currentOffset))
+    }
 }
 
+@Suppress("FunctionName")
+private fun <T> ChoiceParser(parsers: List<Parser<T>>) =
+    // EagerChoiceParser can only be used with EagerTokenizer
+    ScannerlessChoiceParser(parsers)
+
 /**
  * Creates a combined parser that will try the receiver parser first,
  * and fall back to the other parser in case of a parse error.
@@ -56,8 +75,8 @@ internal class ChoiceParser<T>(
  * ```
  */
 infix fun <R> Parser<R>.or(p: Parser<R>): Parser<R> = when {
-    this is ChoiceParser && p is ChoiceParser -> ChoiceParser(parsers + p.parsers)
-    this is ChoiceParser -> ChoiceParser(parsers + p)
-    p is ChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
+    this is AbstractChoiceParser && p is AbstractChoiceParser -> ChoiceParser(parsers + p.parsers)
+    this is AbstractChoiceParser -> ChoiceParser(parsers + p)
+    p is AbstractChoiceParser -> ChoiceParser(listOf(this) + p.parsers)
     else -> ChoiceParser(listOf(this, p))
 }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
index 1b607ee..69f3595 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/Grammar.kt
@@ -3,6 +3,7 @@ package me.alllex.parsus.parser
 import me.alllex.parsus.annotations.ExperimentalParsusApi
 import me.alllex.parsus.token.EofToken
 import me.alllex.parsus.token.Token
+import me.alllex.parsus.tokenizer.ScannerlessTokenizer
 import me.alllex.parsus.trace.TokenMatchingTrace
 import me.alllex.parsus.trace.TracedParseResult
 import kotlin.reflect.KProperty
@@ -159,18 +160,20 @@ abstract class Grammar<out V>(
 
     private fun <T> parseEntire(parser: Parser<T>, input: String): ParseResult<T> {
         beforeParsing()
-        val lexer = Lexer(input, _tokens)
-        val parsingContext = ParsingContext(lexer, debugMode)
+        // If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
+        val tokenizer = ScannerlessTokenizer(input, _tokens)
+        val parsingContext = ParsingContext(tokenizer, debugMode)
         return parsingContext.runParser(createUntilEofParser(parser))
     }
 
     @ExperimentalParsusApi
     private fun <T> parseTracingEntire(parser: Parser<T>, input: String): TracedParseResult<T, TokenMatchingTrace> {
         beforeParsing()
-        val lexer = Lexer(input, _tokens, traceTokenMatching = true)
-        val parsingContext = ParsingContext(lexer, debugMode)
+        // If tokenizer impl is changed to EagerTokenizer, then ChoiceParser impl has to be changed to EagerChoiceParser
+        val tokenizer = ScannerlessTokenizer(input, _tokens, traceTokenMatching = true)
+        val parsingContext = ParsingContext(tokenizer, debugMode)
         val result = parsingContext.runParser(createUntilEofParser(parser))
-        val trace = lexer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
+        val trace = tokenizer.getTokenMatchingTrace() ?: error("Token matching trace is not available")
         return TracedParseResult(result, trace)
     }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
deleted file mode 100644
index 4566d3d..0000000
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
+++ /dev/null
@@ -1,131 +0,0 @@
-package me.alllex.parsus.parser
-
-import me.alllex.parsus.annotations.ExperimentalParsusApi
-import me.alllex.parsus.token.Token
-import me.alllex.parsus.token.TokenMatch
-import me.alllex.parsus.trace.TokenMatchingEvent
-import me.alllex.parsus.trace.TokenMatchingTrace
-
-/**
- * Lexer is responsible for [finding][findMatch] token-matches in the given position
- * in the input string.
- */
-@OptIn(ExperimentalParsusApi::class)
-internal class Lexer(
-    val input: String,
-    private val tokens: List<Token>,
-    traceTokenMatching: Boolean = false,
-) {
-
-    private val ignoredTokens = tokens.filter { it.ignored }
-    private val tokensByFirstChar: Map<Char, List<Token>>
-//    private var cachedFromIndex: Int = -1
-//    private var cachedTokenMatch: TokenMatch? = null
-
-    private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null
-
-    init {
-        tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
-        val unknownFirstCharTokens = mutableListOf<Token>()
-        for (token in tokens) {
-            val firstChars = token.firstChars
-            if (firstChars.isEmpty()) {
-                // If the token first char is unknown, then the first char heuristic cannot be applied.
-                // Therefore, we assume that such tokens can start with any character and put them in appropriate buckets
-                // to ensure the token priority correctness.
-                unknownFirstCharTokens += token
-                tokensByFirstChar.values.forEach { it += token }
-            } else {
-                for (c in firstChars) {
-                    tokensByFirstChar.getOrPut(c) { unknownFirstCharTokens.toMutableList() }
-                        .add(token)
-                }
-            }
-        }
-    }
-
-    internal fun getTokenMatchingTrace(): TokenMatchingTrace? {
-        return traceEvents?.let { TokenMatchingTrace(input, it) }
-    }
-
-    fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
-        var pos = fromIndex
-        while (true) {
-            matchImpl(pos, targetToken)?.let { return it }
-
-            val preIgnorePos = pos
-            for (ignoredToken in ignoredTokens) {
-                val ignoredMatch = matchImpl(pos, ignoredToken)
-                if (ignoredMatch != null) {
-                    pos = ignoredMatch.offset + ignoredMatch.length
-                    break
-                }
-            }
-
-            if (preIgnorePos == pos) {
-                // No ignored tokens matched, so we can't find the target token
-                return null
-            }
-        }
-        // The loop will exit via a mismatch, because no tokens can match "after the end of input"
-    }
-
-    fun findMatch(fromIndex: Int): TokenMatch? {
-//        if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
-//            return cachedTokenMatch
-//        }
-
-        val foundTokenMatch = findMatchIgnoring(fromIndex)
-//        cachedFromIndex = fromIndex
-//        cachedTokenMatch = foundTokenMatch
-        return foundTokenMatch
-    }
-
-    private fun findMatchIgnoring(fromIndex: Int): TokenMatch? {
-        var pos = fromIndex
-        while (true) {
-            val lex = findMatchImpl(pos) ?: return null
-            if (lex.token.ignored) {
-                pos = lex.offset + lex.length
-                continue
-            }
-
-            return lex
-        }
-    }
-
-    private fun findMatchImpl(fromIndex: Int): TokenMatch? {
-        if (fromIndex < input.length) {
-            val nextChar = input[fromIndex]
-            val byFirstChar = tokensByFirstChar[nextChar].orEmpty()
-            for (token in byFirstChar) {
-                matchImpl(fromIndex, token)?.let { return it }
-            }
-        }
-
-        for (token in tokens) {
-            matchImpl(fromIndex, token)?.let { return it }
-        }
-        return null
-    }
-
-    private fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
-        val length = token.match(input, fromIndex)
-        if (length == 0) {
-            traceMismatch(token, fromIndex)
-            return null
-        }
-
-        val match = TokenMatch(token, fromIndex, length)
-        traceMatch(token, match)
-        return match
-    }
-
-    private fun traceMismatch(token: Token, offset: Int) {
-        traceEvents?.add(TokenMatchingEvent(token, offset, null))
-    }
-
-    private fun traceMatch(token: Token, match: TokenMatch) {
-        traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
-    }
-}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
index c002631..edb8c01 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
@@ -2,6 +2,7 @@ package me.alllex.parsus.parser
 
 import me.alllex.parsus.token.Token
 import me.alllex.parsus.token.TokenMatch
+import me.alllex.parsus.tokenizer.Tokenizer
 import kotlin.coroutines.Continuation
 import kotlin.coroutines.CoroutineContext
 import kotlin.coroutines.EmptyCoroutineContext
@@ -15,7 +16,7 @@ import kotlin.coroutines.intrinsics.suspendCoroutineUninterceptedOrReturn
  * For each [run][runParser] a new context must be created.
  */
 internal class ParsingContext(
-    private val lexer: Lexer,
+    private val tokenizer: Tokenizer,
     private val debugMode: Boolean = false
 ) : ParsingScope {
@@ -37,13 +38,12 @@ internal class ParsingContext(
         return result.getOrThrow() as ParseResult<T>
     }
 
-    override val TokenMatch.text: String get() = lexer.input.substring(offset, offset + length)
+    override val TokenMatch.text: String get() = tokenizer.input.substring(offset, offset + length)
 
     override val currentOffset: Int get() = position
 
-    // TODO: clean up
     override val currentToken: TokenMatch?
-        get() = null // lexer.findMatch(position)
+        get() = tokenizer.findContextFreeMatch(position)
 
     override suspend fun <R> Parser<R>.invoke(): R = parse()
 
@@ -59,8 +59,8 @@ internal class ParsingContext(
 
     override fun tryParse(token: Token): ParseResult<TokenMatch> {
         val fromIndex = this.position
-        val match = lexer.findMatchOf(fromIndex, token)
+        val match = tokenizer.findMatchOf(fromIndex, token)
             ?: return UnmatchedToken(token, fromIndex)
         // TODO: clean up, as this should not happen anymore
         if (match.token != token) return MismatchedToken(token, match)
         this.position = match.offset + match.length
         return ParsedValue(match)
diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/AbstractTokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/AbstractTokenizer.kt
new file mode 100644
index 0000000..0f7e3a3
--- /dev/null
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/AbstractTokenizer.kt
@@ -0,0 +1,41 @@
+package me.alllex.parsus.tokenizer
+
+import me.alllex.parsus.annotations.ExperimentalParsusApi
+import me.alllex.parsus.token.Token
+import me.alllex.parsus.token.TokenMatch
+import me.alllex.parsus.trace.TokenMatchingEvent
+import me.alllex.parsus.trace.TokenMatchingTrace
+
+@OptIn(ExperimentalParsusApi::class)
+internal abstract class AbstractTokenizer(
+    override val input: String,
+    protected val tokens: List<Token>,
+    traceTokenMatching: Boolean = false,
+) : Tokenizer {
+
+    private val traceEvents: MutableList<TokenMatchingEvent>? = if (traceTokenMatching) mutableListOf() else null
+
+    override fun getTokenMatchingTrace(): TokenMatchingTrace? {
+        return traceEvents?.let { TokenMatchingTrace(input, it) }
+    }
+
+    protected fun matchImpl(fromIndex: Int, token: Token): TokenMatch? {
+        val length = token.match(input, fromIndex)
+        if (length == 0) {
+            traceMismatch(token, fromIndex)
+            return null
+        }
+
+        val match = TokenMatch(token, fromIndex, length)
+        traceMatch(token, match)
+        return match
+    }
+
+    private fun traceMismatch(token: Token, offset: Int) {
+        traceEvents?.add(TokenMatchingEvent(token, offset, null))
+    }
+
+    private fun traceMatch(token: Token, match: TokenMatch) {
+        traceEvents?.add(TokenMatchingEvent(token, match.offset, match))
+    }
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/EagerTokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/EagerTokenizer.kt
new file mode 100644
index 0000000..65f565c
--- /dev/null
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/EagerTokenizer.kt
@@ -0,0 +1,88 @@
+package me.alllex.parsus.tokenizer
+
+import me.alllex.parsus.token.Token
+import me.alllex.parsus.token.TokenMatch
+
+/**
+ * This tokenizer eagerly tries to match tokens from the input,
+ * based on the full token set ordered by priority.
+ * It deterministically matches tokens from the input,
+ * not taking into account tokens expected by parsers.
+ */
+internal class EagerTokenizer(
+    input: String,
+    tokens: List<Token>,
+    traceTokenMatching: Boolean = false,
+) : AbstractTokenizer(input, tokens, traceTokenMatching) {
+
+    private val tokensByFirstChar: Map<Char, List<Token>>
+    private var cachedFromIndex: Int = -1
+    private var cachedTokenMatch: TokenMatch? = null
+
+    init {
+        tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
+        val unknownFirstCharTokens = mutableListOf<Token>()
+        for (token in tokens) {
+            val firstChars = token.firstChars
+            if (firstChars.isEmpty()) {
+                // If the token first char is unknown, then the first char heuristic cannot be applied.
+                // Therefore, we assume that such tokens can start with any character and put them in appropriate buckets
+                // to ensure the token priority correctness.
+                unknownFirstCharTokens += token
+                tokensByFirstChar.values.forEach { it += token }
+            } else {
+                for (c in firstChars) {
+                    tokensByFirstChar.getOrPut(c) { unknownFirstCharTokens.toMutableList() }
+                        .add(token)
+                }
+            }
+        }
+    }
+
+    override fun findContextFreeMatch(fromIndex: Int): TokenMatch? {
+        return findMatchCaching(fromIndex)
+    }
+
+    override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
+        return findMatchCaching(fromIndex)
+    }
+
+    private fun findMatchCaching(fromIndex: Int): TokenMatch? {
+        if (fromIndex == cachedFromIndex && cachedTokenMatch != null) {
+            return cachedTokenMatch
+        }
+
+        val foundTokenMatch = findMatchIgnoring(fromIndex)
+        cachedFromIndex = fromIndex
+        cachedTokenMatch = foundTokenMatch
+        return foundTokenMatch
+    }
+
+    private fun findMatchIgnoring(fromIndex: Int): TokenMatch? {
+        var pos = fromIndex
+        while (true) {
+            val lex = findMatchImpl(pos) ?: return null
+            if (lex.token.ignored) {
+                pos = lex.offset + lex.length
+                continue
+            }
+
+            return lex
+        }
+    }
+
+    private fun findMatchImpl(fromIndex: Int): TokenMatch? {
+        if (fromIndex < input.length) {
+            val nextChar = input[fromIndex]
+            val byFirstChar = tokensByFirstChar[nextChar].orEmpty()
+            for (token in byFirstChar) {
+                matchImpl(fromIndex, token)?.let { return it }
+            }
+        }
+
+        for (token in tokens) {
+            matchImpl(fromIndex, token)?.let { return it }
+        }
+        return null
+    }
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
new file mode 100644
index 0000000..9d204c7
--- /dev/null
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
@@ -0,0 +1,42 @@
+package me.alllex.parsus.tokenizer
+
+import me.alllex.parsus.token.Token
+import me.alllex.parsus.token.TokenMatch
+
+/**
+ * Scannerless tokenizer tries to parse the target token at the given position.
+ *
+ * It treats the target token as having higher priority than all other tokens.
+ */
+internal class ScannerlessTokenizer(
+    input: String,
+    tokens: List<Token>,
+    traceTokenMatching: Boolean = false,
+): AbstractTokenizer(input, tokens, traceTokenMatching) {
+
+    private val ignoredTokens = tokens.filter { it.ignored }
+
+    override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null
+
+    override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
+        var pos = fromIndex
+        while (true) {
+            matchImpl(pos, targetToken)?.let { return it }
+
+            val preIgnorePos = pos
+            for (ignoredToken in ignoredTokens) {
+                val ignoredMatch = matchImpl(pos, ignoredToken)
+                if (ignoredMatch != null) {
+                    pos = ignoredMatch.offset + ignoredMatch.length
+                    break
+                }
+            }
+
+            if (preIgnorePos == pos) {
+                // No ignored tokens matched, so we can't find the target token
+                return null
+            }
+        }
+        // The loop will exit via a mismatch, because no tokens can match "after the end of input"
+    }
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/Tokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/Tokenizer.kt
new file mode 100644
index 0000000..4f4ca77
--- /dev/null
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/Tokenizer.kt
@@ -0,0 +1,14 @@
+package me.alllex.parsus.tokenizer
+
+import me.alllex.parsus.annotations.ExperimentalParsusApi
+import me.alllex.parsus.token.Token
+import me.alllex.parsus.token.TokenMatch
+import me.alllex.parsus.trace.TokenMatchingTrace
+
+@OptIn(ExperimentalParsusApi::class)
+internal interface Tokenizer {
+    val input: String
+    fun getTokenMatchingTrace(): TokenMatchingTrace?
+    fun findContextFreeMatch(fromIndex: Int): TokenMatch?
+    fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch?
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt b/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt
index d1ffae3..b66fa4e 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt
@@ -77,7 +77,7 @@ fun formatTokenMatchingTrace(
         sb.append(" ".repeat(lookBehind + 1))
         sb.append(matchSymbol.repeat(matchLength.coerceAtLeast(1)))
         sb.append(" [$offset").append(if (match != null) " - ${offset + matchLength - 1}" else "")
-            .append("] ").append(event.token)
+            .append("] ").append(event.token.name?.let { "$it " } ?: "").append(event.token)
         sb.appendLine()
     }
     return sb.toString()
diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
index 6d323b8..e6a389f 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
@@ -5,10 +5,9 @@ import assertk.assertions.isEqualTo
 import me.alllex.parsus.annotations.ExperimentalParsusApi
 import me.alllex.parsus.parser.*
 import me.alllex.parsus.token.literalToken
+import me.alllex.parsus.token.regexToken
 import me.alllex.parsus.trace.formatTokenMatchingTrace
 import me.alllex.parsus.tree.SyntaxTree
-import me.alllex.parsus.tree.lexeme
-import me.alllex.parsus.tree.plus
 import kotlin.test.Test
 
 @OptIn(ExperimentalParsusApi::class)
@@ -17,39 +16,55 @@ class TokenMatchingTraceTest {
     @Test
     fun tokenMatchingTraceIsFormatted() {
         object : Grammar<SyntaxTree>() {
+            @Suppress("unused")
+            val ws by regexToken("\\s+", ignored = true)
             val a by literalToken("a")
             val b by literalToken("b")
             val cd by literalToken("cd")
-            val ab by parser { node(lexeme(a) + lexeme(b)) }
-            override val root by ab * parlex(cd) map { (v1, v2) -> node(v1, v2) }
+            val ef by literalToken("ef")
+            val aOrB by parlex(a) or parlex(b)
+            val cdOrEf by parlex(cd) or parlex(ef)
+            val p by aOrB * cdOrEf map { (v1, v2) -> node(v1, v2) }
+            override val root by oneOrMore(p) map { node(it) }
         }.run {
-            val input = "abcd"
+            val input = "aefbcd"
             val tracedResult = parseTracingTokenMatching(input)
-            assertThat(tracedResult.result).isEqualTo(ParsedValue(node(node(a.lex(0), b.lex(1)), cd.lex(2))))
+            assertThat(tracedResult.result).isEqualTo(ParsedValue(
+                node(node(a.lex(0), ef.lex(1)), node(b.lex(3), cd.lex(4))))
+            )
             val formattedTrace = formatTokenMatchingTrace(tracedResult.trace)
+            println(formattedTrace)
             assertThat("\n" + formattedTrace).isEqualTo(
                 """
-__________
-······abcd
-      x [0] Token(EOF)
-__________
-······abcd
-      ^ [0 - 0] LiteralToken('a')
-__________
-·····abcd·
-      x [1] Token(EOF)
-__________
-·····abcd·
-      ^ [1 - 1] LiteralToken('b')
-__________
-····abcd··
-      x [2] Token(EOF)
-__________
-····abcd··
-      ^^ [2 - 3] LiteralToken('cd')
-__________
-··abcd····
-      ^ [4 - 4] Token(EOF)
+____________
+······aefbcd
+      ^ [0 - 0] a LiteralToken('a')
+____________
+·····aefbcd·
+      x [1] cd LiteralToken('cd')
+      x [1] ws RegexToken(ws [\s+] [ignored])
+____________
+·····aefbcd·
+      ^^ [1 - 2] ef LiteralToken('ef')
+____________
+···aefbcd···
+      x [3] a LiteralToken('a')
+      x [3] ws RegexToken(ws [\s+] [ignored])
+____________
+···aefbcd···
+      ^ [3 - 3] b LiteralToken('b')
+____________
+··aefbcd····
+      ^^ [4 - 5] cd LiteralToken('cd')
+____________
+…efbcd······
+      x [6] a LiteralToken('a')
+      x [6] ws RegexToken(ws [\s+] [ignored])
+      x [6] b LiteralToken('b')
+      x [6] ws RegexToken(ws [\s+] [ignored])
+____________
+…efbcd······
+      ^ [6 - 6] EOF Token(EOF)
 """
             )
         }
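A note on the seam introduced by this refactoring: `ParsingContext` now talks only to the `Tokenizer` interface, and `findContextFreeMatch` is the hook that decides whether eager, parser-independent matching is available (`ScannerlessTokenizer` answers `null`, which is also why `EagerChoiceParser` must not be paired with it). As a rough illustration of how small a strategy against this seam can be — a hypothetical tokenizer, not part of the patch, that consults only the requested token and supports no ignored tokens:

```
// Hypothetical minimal strategy built on AbstractTokenizer (illustrative only).
internal class DirectTokenizer(
    input: String,
    tokens: List<Token>,
) : AbstractTokenizer(input, tokens) {
    // No context-free answer: choice parsers must probe their alternatives themselves.
    override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null
    // Try exactly the token the parser asked for, at exactly this position.
    override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? =
        matchImpl(fromIndex, targetToken)
}
```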
From b33bc5b41fdfa745e6258f48edd0e49f45536969 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 21:40:53 +0200
Subject: [PATCH 05/11] Add tests that check support for ignored token parsing

---
 .../kotlin/me/alllex/parsus/TokenTests.kt | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
index d89ee04..50cc966 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
@@ -1,12 +1,12 @@
 package me.alllex.parsus
 
 import assertk.assertions.isEqualTo
-import me.alllex.parsus.parser.Grammar
-import me.alllex.parsus.parser.map
-import me.alllex.parsus.parser.or
+import me.alllex.parsus.parser.*
 import me.alllex.parsus.token.TokenMatch
 import me.alllex.parsus.token.literalToken
 import me.alllex.parsus.token.regexToken
+import me.alllex.parsus.tree.SyntaxTree
+import me.alllex.parsus.tree.lexeme
 import kotlin.test.Test
 
 class TokenTests {
@@ -35,4 +35,34 @@ class TokenTests {
         }
     }
 
+    @Test
+    fun explicitIgnoredTokenParsing() {
+        object : Grammar<SyntaxTree>() {
+            val ws by regexToken("\\s+", ignored = true)
+            val a by literalToken("a")
+            override val root by parser {
+                val a1 = lexeme(a)
+                val w = lexeme(ws)
+                val a2 = lexeme(a)
+                node(a1, w, a2)
+            }
+        }.run {
+            assertParsed("a a").isEqualTo(node(a.lex("a", 0), ws.lex(" ", 1), a.lex("a", 2)))
+            assertParsed(" a a ").isEqualTo(node(a.lex("a", 1), ws.lex(" ", 2), a.lex("a", 3)))
+            assertNotParsed("aa").failedWithUnmatchedToken(ws, 1)
+            assertNotParsed(" aa").failedWithUnmatchedToken(ws, 2)
+        }
+
+        object : Grammar<SyntaxTree>() {
+            val ws by regexToken("\\s+", ignored = true)
+            val a by literalToken("a")
+            override val root by parlex(a) and (-ws * parlex(a)) map { node(it.first, it.second) }
+        }.run {
+            assertParsed("a a").isEqualTo(node(a.lex("a", 0), a.lex("a", 2)))
+            assertParsed(" a a ").isEqualTo(node(a.lex("a", 1), a.lex("a", 3)))
+            assertNotParsed("aa").failedWithUnmatchedToken(ws, 1)
+            assertNotParsed(" aa").failedWithUnmatchedToken(ws, 2)
+        }
+    }
+
 }
From 22cb43ef2cff2bac4a4eff9ae73c8d23cf895d15 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:02:41 +0200
Subject: [PATCH 06/11] Scannerless: cache ignored token last matching result

---
 .../parsus/tokenizer/ScannerlessTokenizer.kt | 51 ++++++++++++++-----
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
index 9d204c7..fc13179 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
@@ -16,27 +16,54 @@ internal class ScannerlessTokenizer(
 
     private val ignoredTokens = tokens.filter { it.ignored }
 
+    private var cachedIgnoredFromIndex: Int = -1
+    private var cachedIgnoredTokenMatch: TokenMatch? = null
+
     override fun findContextFreeMatch(fromIndex: Int): TokenMatch? = null
 
     override fun findMatchOf(fromIndex: Int, targetToken: Token): TokenMatch? {
         var pos = fromIndex
         while (true) {
-            matchImpl(pos, targetToken)?.let { return it }
-
-            val preIgnorePos = pos
-            for (ignoredToken in ignoredTokens) {
-                val ignoredMatch = matchImpl(pos, ignoredToken)
-                if (ignoredMatch != null) {
-                    pos = ignoredMatch.offset + ignoredMatch.length
-                    break
-                }
-            }
+            matchTarget(pos, targetToken)?.let { return it }
 
-            if (preIgnorePos == pos) {
-                // No ignored tokens matched, so we can't find the target token
+            val ignoredMatch = matchIgnored(pos)
+            @Suppress("LiftReturnOrAssignment")
+            if (ignoredMatch != null) {
+                val posAfterIgnored = ignoredMatch.offset + ignoredMatch.length
+                if (posAfterIgnored > pos) {
+                    pos = posAfterIgnored
+                    continue
+                } else {
+                    // An ignored token matched, but it did not advance the position.
+                    // This should not happen normally, but this is a safeguard.
+                    return null
+                }
+            } else {
+                // No ignored tokens matched at the current position either,
+                // so it is a mismatch overall
                 return null
             }
         }
         // The loop will exit via a mismatch, because no tokens can match "after the end of input"
     }
+
+    private fun matchIgnored(fromIndex: Int): TokenMatch? {
+        if (fromIndex == cachedIgnoredFromIndex) {
+            return cachedIgnoredTokenMatch
+        }
+
+        var match: TokenMatch? = null
+        for (ignoredToken in ignoredTokens) {
+            match = matchImpl(fromIndex, ignoredToken)
+            if (match != null) {
+                break
+            }
+        }
+
+        cachedIgnoredFromIndex = fromIndex
+        cachedIgnoredTokenMatch = match
+        return match
+    }
+
+    private fun matchTarget(pos: Int, targetToken: Token) = matchImpl(pos, targetToken)
 }

From 2d1e1a56c85c732731131dd268b319629fe46dd3 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:33:22 +0200
Subject: [PATCH 07/11] Scannerless: cache last ignored mismatch separately

---
 .../parsus/tokenizer/ScannerlessTokenizer.kt | 29 +++++++++++++++++--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
index fc13179..d747dec 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/tokenizer/ScannerlessTokenizer.kt
@@ -12,10 +12,24 @@ internal class ScannerlessTokenizer(
     input: String,
     tokens: List<Token>,
     traceTokenMatching: Boolean = false,
-): AbstractTokenizer(input, tokens, traceTokenMatching) {
+) : AbstractTokenizer(input, tokens, traceTokenMatching) {
 
     private val ignoredTokens = tokens.filter { it.ignored }
 
+    // We cache one mismatch and one match of ignored tokens.
+    // This is for the frequent case when there is exactly one ignored token before the target token.
+    // Example:
+    //   parser = t1 or t2 or t3, ws = ignored whitespace
+    //   input = " t3"
+    // In this example, t1 will fail to match at 0, but ws will match at 0, so we cache the match.
+    // Then t1 will try to match at 1, but it will fail again, so we try ignored tokens again,
+    // but this time we get a mismatch, which we cache separately. This fails the t1 branch of the parser.
+    // Now, we backtrack and try t2 at 0, which fails.
+    // But we can avoid rematching ws at 0, because we cached this match.
+    // Then we try t2 at position 1, which fails. But we don't retry ws, because we cached the mismatch.
+    // In the last t3 branch, we try t3 at 0, which fails, but then we skip rematching ws at 0,
+    // because it is still cached. Then t3 succeeds at 1, and parsing succeeds.
+    private var cacheIgnoredMismatchFromIndex = -1
     private var cachedIgnoredFromIndex: Int = -1
     private var cachedIgnoredTokenMatch: TokenMatch? = null
 
@@ -48,6 +62,11 @@ internal class ScannerlessTokenizer(
     }
 
     private fun matchIgnored(fromIndex: Int): TokenMatch? {
+        require(fromIndex >= 0) { "fromIndex must be non-negative, but was $fromIndex" }
+
+        if (fromIndex == cacheIgnoredMismatchFromIndex) {
+            return null
+        }
         if (fromIndex == cachedIgnoredFromIndex) {
             return cachedIgnoredTokenMatch
         }
@@ -60,8 +79,12 @@ internal class ScannerlessTokenizer(
             }
         }
 
-        cachedIgnoredFromIndex = fromIndex
-        cachedIgnoredTokenMatch = match
+        if (match == null) {
+            cacheIgnoredMismatchFromIndex = fromIndex
+        } else {
+            cachedIgnoredFromIndex = fromIndex
+            cachedIgnoredTokenMatch = match
+        }
         return match
     }
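The comment above is the heart of this commit, so a self-contained model of the two-slot cache may help; a single shared slot would thrash, because backtracking alternatives keep probing two positions alternately (the start of the ignored run and the position right after it). Illustrative only — the real tokenizer caches `TokenMatch` objects, not lengths:

```
// Models the ignored-token cache: one slot for the last matching position,
// one for the last mismatching position.
class IgnoredTokenCache(private val compute: (Int) -> Int?) {
    private var missAt = -1   // last position where no ignored token matched
    private var hitAt = -1    // last position where an ignored token matched
    private var hitLength = 0
    fun matchIgnored(pos: Int): Int? {
        if (pos == missAt) return null      // cached mismatch
        if (pos == hitAt) return hitLength  // cached match
        val len = compute(pos)
        if (len == null) missAt = pos else { hitAt = pos; hitLength = len }
        return len
    }
}
```

In the `" t3"` walkthrough from the comment, `hitAt` stays pinned to 0 and `missAt` to 1 across all three alternatives, so the ignored tokens are matched against the input only twice in total.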
From 3368259b5b5eebc84895e6acc1e7ef495bcfad19 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:33:43 +0200
Subject: [PATCH 08/11] Fix TokenMatchingTraceTest

---
 .../alllex/parsus/TokenMatchingTraceTest.kt | 103 +++++++++++++-----
 1 file changed, 73 insertions(+), 30 deletions(-)

diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
index e6a389f..507d72d 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenMatchingTraceTest.kt
@@ -27,44 +27,87 @@ class TokenMatchingTraceTest {
             val p by aOrB * cdOrEf map { (v1, v2) -> node(v1, v2) }
             override val root by oneOrMore(p) map { node(it) }
         }.run {
-            val input = "aefbcd"
+            val input = " a ef  b cd "
             val tracedResult = parseTracingTokenMatching(input)
             assertThat(tracedResult.result).isEqualTo(ParsedValue(
-                node(node(a.lex(0), ef.lex(1)), node(b.lex(3), cd.lex(4))))
+                node(node(a.lex(1), ef.lex(3)), node(b.lex(7), cd.lex(9))))
             )
             val formattedTrace = formatTokenMatchingTrace(tracedResult.trace)
             println(formattedTrace)
             assertThat("\n" + formattedTrace).isEqualTo(
                 """
+__________________
+······␣a␣ef␣␣b␣cd␣
+      x [0] a LiteralToken('a')
+__________________
+······␣a␣ef␣␣b␣cd␣
+      ^ [0 - 0] ws RegexToken(ws [\s+] [ignored])
+__________________
+·····␣a␣ef␣␣b␣cd␣·
+      ^ [1 - 1] a LiteralToken('a')
+__________________
+····␣a␣ef␣␣b␣cd␣··
+      x [2] cd LiteralToken('cd')
+__________________
+····␣a␣ef␣␣b␣cd␣··
+      ^ [2 - 2] ws RegexToken(ws [\s+] [ignored])
+__________________
+···␣a␣ef␣␣b␣cd␣···
+      x [3] cd LiteralToken('cd')
+      x [3] ws RegexToken(ws [\s+] [ignored])
+__________________
+····␣a␣ef␣␣b␣cd␣··
+      x [2] ef LiteralToken('ef')
+__________________
+···␣a␣ef␣␣b␣cd␣···
+      ^^ [3 - 4] ef LiteralToken('ef')
+__________________
+·␣a␣ef␣␣b␣cd␣·····
+      x [5] a LiteralToken('a')
+__________________
+·␣a␣ef␣␣b␣cd␣·····
+      ^^ [5 - 6] ws RegexToken(ws [\s+] [ignored])
+__________________
+…␣ef␣␣b␣cd␣·······
+      x [7] a LiteralToken('a')
+      x [7] ws RegexToken(ws [\s+] [ignored])
+__________________
+·␣a␣ef␣␣b␣cd␣·····
+      x [5] b LiteralToken('b')
+__________________
+…␣ef␣␣b␣cd␣·······
+      ^ [7 - 7] b LiteralToken('b')
+__________________
+…ef␣␣b␣cd␣········
+      x [8] cd LiteralToken('cd')
+__________________
+…ef␣␣b␣cd␣········
+      ^ [8 - 8] ws RegexToken(ws [\s+] [ignored])
+__________________
+…f␣␣b␣cd␣·········
+      ^^ [9 - 10] cd LiteralToken('cd')
+__________________
+…␣b␣cd␣···········
+      x [11] a LiteralToken('a')
+__________________
+…␣b␣cd␣···········
+      ^ [11 - 11] ws RegexToken(ws [\s+] [ignored])
+__________________
+…b␣cd␣············
+      x [12] a LiteralToken('a')
+      x [12] ws RegexToken(ws [\s+] [ignored])
+__________________
+…␣b␣cd␣···········
+      x [11] b LiteralToken('b')
+__________________
+…b␣cd␣············
+      x [12] b LiteralToken('b')
+__________________
+…␣b␣cd␣···········
+      x [11] EOF Token(EOF)
+__________________
+…b␣cd␣············
+      ^ [12 - 12] EOF Token(EOF)
 """
             )
         }
From 915edc84836a0e78280d00df514080dbc157da64 Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:36:39 +0200
Subject: [PATCH 09/11] Add naive JSON grammar test

---
 .../kotlin/me/alllex/parsus/GrammarTests.kt | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt b/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
index 34ecc98..f223a7b 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/GrammarTests.kt
@@ -4,6 +4,7 @@ import assertk.assertThat
 import assertk.assertions.isEqualTo
 import me.alllex.parsus.parser.*
 import me.alllex.parsus.token.literalToken
+import me.alllex.parsus.token.regexToken
 import me.alllex.parsus.tree.SyntaxTree
 import me.alllex.parsus.tree.lexeme
 import kotlin.test.Test
@@ -80,4 +81,56 @@ class GrammarTests {
         }
     }
 
+    @Test
+    fun naiveJsonGrammarTest() {
+        NaiveJsonGrammar.run {
+            assertParsed("""{${'\n'}"a": 1,${'\n'}"b": {"c":false}${'\n'}}""").isEqualTo(
+                Json.Obj(
+                    mapOf(
+                        "a" to Json.Num(1.0),
+                        "b" to Json.Obj(mapOf("c" to Json.Bool(false)))
+                    )
+                )
+            )
+        }
+    }
+
+    sealed class Json {
+        object Null : Json() {
+            override fun toString(): String = "Null"
+        }
+
+        data class Bool(val value: Boolean) : Json()
+        data class Num(val value: Double) : Json()
+        data class Str(val value: String) : Json()
+        data class Arr(val values: List<Json>) : Json()
+        data class Obj(val values: Map<String, Json>) : Json()
+    }
+
+    object NaiveJsonGrammar : Grammar<Json>() {
+        init {
+            regexToken("\\s+", ignored = true)
+        }
+
+        private val comma by literalToken(",")
+        private val colon by literalToken(":")
+        private val lbrace by literalToken("{")
+        private val rbrace by literalToken("}")
+        private val lbracket by literalToken("[")
+        private val rbracket by literalToken("]")
+        private val str by regexToken("\"[^\\\\\"]*(\\\\[\"nrtbf\\\\][^\\\\\"]*)*\"") map { it.text.run { substring(1, lastIndex) } }
+        private val jsonTrue by literalToken("true") map { Json.Bool(true) }
+        private val jsonFalse by literalToken("false") map { Json.Bool(false) }
+        private val jsonNull by literalToken("null") map Json.Null
+        private val jsonNum by regexToken("-?(?:0|[1-9]\\d*)(?:\\.\\d+)?(?:[eE][+-]?\\d+)?") map { Json.Num(it.text.toDouble()) }
+        private val jsonStr by str map { Json.Str(it) }
+
+        private val keyValue by str * -colon and ref(::jsonValue) map { it.toPair() }
+        private val jsonObj by -lbrace * separated(keyValue, comma) * -rbrace map { Json.Obj(it.toMap()) }
+
+        private val jsonArr by -lbracket * separated(ref(::jsonValue), comma) * -rbracket map { Json.Arr(it) }
+        private val jsonValue: Parser<Json> by jsonNull or jsonTrue or jsonFalse or jsonNum or jsonStr or jsonArr or jsonObj
+        override val root by jsonValue
+    }
+
 }
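A quick usage sketch for the grammar above (hypothetical, not part of the test suite): thanks to `separated` and the `ref(::jsonValue)` recursion, nested values parse into the expected tree, with ignored whitespace handled by the scannerless tokenizer.

```
// Assuming the NaiveJsonGrammar defined in the patch above:
val value = GrammarTests.NaiveJsonGrammar.parseOrThrow("""{"a": [1, true, null]}""")
// value == Json.Obj(mapOf("a" to Json.Arr(listOf(Json.Num(1.0), Json.Bool(true), Json.Null))))
```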
From 30669a60af56e3e2b0a23101a8842f27e604eeaa Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:39:41 +0200
Subject: [PATCH 10/11] Extend the test that validates token priority is driven by parsers

---
 .../kotlin/me/alllex/parsus/TokenTests.kt | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
index 50cc966..8a5e0dd 100644
--- a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
@@ -2,6 +2,7 @@ package me.alllex.parsus
 
 import assertk.assertions.isEqualTo
 import me.alllex.parsus.parser.*
+import me.alllex.parsus.token.EofToken
 import me.alllex.parsus.token.TokenMatch
 import me.alllex.parsus.token.literalToken
 import me.alllex.parsus.token.regexToken
@@ -25,14 +26,31 @@ class TokenTests {
     @Test
     fun tokenPriorityIsDrivenByParser() {
         object : Grammar<TokenMatch>() {
-            val single by literalToken("<")
+            // double declared first
             val double by literalToken("<<")
+            val single by literalToken("<")
+            override val root by double or single
+        }.run {
+            assertParsed("<<").isEqualTo(TokenMatch(double, 0, 2))
+        }
 
+        object : Grammar<TokenMatch>() {
+            val single by literalToken("<")
+            val double by literalToken("<<")
             // even though single token is declared first, it is not matched first
             override val root by double or single
         }.run {
             assertParsed("<<").isEqualTo(TokenMatch(double, 0, 2))
         }
+
+        object : Grammar<TokenMatch>() {
+            val single by literalToken("<")
+            val double by literalToken("<<")
+            // if the order in the parser is "wrong", then the parsing will fail too
+            override val root by single or double
+        }.run {
+            assertNotParsed("<<").failedWithUnmatchedToken(EofToken, 1)
+        }
     }

From 4f6501dbf2fc418293e073b3996be46075ea886d Mon Sep 17 00:00:00 2001
From: Alex Semin
Date: Thu, 5 Oct 2023 22:46:27 +0200
Subject: [PATCH 11/11] Deprecate currentToken

---
 src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt | 1 +
 src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
index 7183f63..f467280 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
@@ -35,6 +35,7 @@ internal class EagerChoiceParser<T>(
 
     private val unknownFirstTokenParsers = parsers.filter { it.hasUnknownFirstTokens() }
 
+    @Suppress("DEPRECATION")
     override suspend fun ParsingScope.parse(): T {
         val currentToken = currentToken?.token ?: fail(NoMatchingToken(currentOffset))
         val parsers = parsersByFirstToken[currentToken] ?: unknownFirstTokenParsers
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt
index 7437166..cc8c5e7 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingScope.kt
@@ -45,6 +45,7 @@ interface ParsingScope {
     /**
      * The token at the current offset in the input.
      */
+    @Deprecated("The new \"scannerless\" parsing approach does not eagerly tokenize the input. The `currentToken` is always null.")
     val currentToken: TokenMatch?
 
     /**