Skip to content

Commit

Permalink
Fix false positive for token first char heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
alllex committed Aug 21, 2023
1 parent 3ef14b2 commit 22e1993
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 12 deletions.
9 changes: 3 additions & 6 deletions src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,12 @@ internal class ChoiceParser<out T>(
val pendingUnknownFirstTokens = mutableListOf<Parser<T>>()
for (parser in parsers) {
if (parser.hasUnknownFirstTokens()) {
values.forEach { it.add(parser) }
pendingUnknownFirstTokens += parser
values.forEach { it += parser }
} else {
for (token in parser.firstTokens) {
val parsersForToken = getOrPut(token) { mutableListOf() }
if (parsersForToken.isEmpty()) {
parsersForToken += pendingUnknownFirstTokens
}
parsersForToken.add(parser)
val parsersForToken = getOrPut(token) { pendingUnknownFirstTokens.toMutableList() }
parsersForToken += parser
}
}
}
Expand Down
20 changes: 14 additions & 6 deletions src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,20 @@ internal class Lexer(

init {
tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
val unknownFirstCharTokens = mutableListOf<Token>()
for (token in tokens) {
val firstChars = token.firstChars
for (c in firstChars) {
tokensByFirstChar.getOrPut(c) { mutableListOf() }.add(token)
if (firstChars.isEmpty()) {
// If the token's first char is unknown, the first-char heuristic cannot be applied to it.
// Therefore, we assume that such a token can start with any character and add it to every existing bucket
// (and seed every bucket created later with it) so that token priority is preserved.
unknownFirstCharTokens += token
tokensByFirstChar.values.forEach { it += token }
} else {
for (c in firstChars) {
tokensByFirstChar.getOrPut(c) { unknownFirstCharTokens.toMutableList() }
.add(token)
}
}
}
}
Expand Down Expand Up @@ -54,14 +64,12 @@ internal class Lexer(
if (fromIndex < input.length) {
val nextChar = input[fromIndex]
val byFirstChar = tokensByFirstChar[nextChar].orEmpty()
for (i in byFirstChar.indices) {
val token = byFirstChar[i]
for (token in byFirstChar) {
matchImpl(fromIndex, token)?.let { return it }
}
}

for (i in tokens.indices) {
val token = tokens[i]
for (token in tokens) {
matchImpl(fromIndex, token)?.let { return it }
}
return null
Expand Down
24 changes: 24 additions & 0 deletions src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package me.alllex.parsus

import assertk.assertions.isEqualTo
import me.alllex.parsus.parser.Grammar
import me.alllex.parsus.parser.map
import me.alllex.parsus.parser.or
import me.alllex.parsus.token.literalToken
import me.alllex.parsus.token.regexToken
import kotlin.test.Test

/**
 * Tests covering how tokens are matched when several of them could apply to the same input.
 */
class TokenTests {

    /**
     * A regex token matching "abba" is declared before a literal token "ab", which is a
     * prefix of the same input. Parsing "abba" must yield the value mapped from the regex
     * token (1), not the one mapped from the literal token (2).
     *
     * NOTE(review): the test name treats the earlier-declared token as the higher-priority
     * one, so the declaration order of `r` and `ab` below must not be changed.
     */
    @Test
    fun literalTokenThatPrefixesRegexTokenWithHigherPriority() {
        val grammar = object : Grammar<Int>() {
            // Declared first on purpose: see the note about declaration order above.
            val r by regexToken("abba") map 1
            val ab by literalToken("ab") map 2
            override val root by r or ab
        }

        grammar.assertParsed("abba").isEqualTo(1)
    }

}

0 comments on commit 22e1993

Please sign in to comment.