Skip to content

Commit

Permalink
More tests and fixes for gen lexer word boundary
Browse files Browse the repository at this point in the history
Third time's the charm!

The correct word boundary test when the position is NOT at the beginning
and NOT at the end is to always test the character before the current
position and the character at the current position.

In the first version, it tested the character at the current position
and the next character after the current position.

In the second version, it kept the first version's check and, in addition,
tested the character before the current position and the character at the
current position — so it performed both checks, one of them redundant.

The correct solution — testing only the character before the current position
and the character at the current position — is simpler and easier to reason about.

Added more conformance tests to cover the changes, including more cases
to ensure \b does NOT match when it should not.
  • Loading branch information
klondikedragon authored and alecthomas committed Oct 28, 2022
1 parent fb225ea commit 6ca58cf
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 18 deletions.
21 changes: 4 additions & 17 deletions cmd/participle/gen_lexer_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,15 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
syntax.OpBeginText, syntax.OpEndText,
syntax.OpBeginLine, syntax.OpEndLine:
fmt.Fprintf(w, "var l, u rune = -1, -1\n")
fmt.Fprintf(w, "var checkPrevChar = false\n")
fmt.Fprintf(w, "if p == 0 {\n")
fmt.Fprintf(w, " if p < len(s) {\n")
decodeRune(w, "0", "u", "_")
fmt.Fprintf(w, " }\n")
fmt.Fprintf(w, "} else if p == len(s) {\n")
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n")
fmt.Fprintf(w, "} else {\n")
fmt.Fprintf(w, " checkPrevChar = true\n")
fmt.Fprintf(w, " var ln int\n")
decodeRune(w, "p", "l", "ln")
fmt.Fprintf(w, " if p+ln <= len(s) {\n")
decodeRune(w, "p+ln", "u", "_")
fmt.Fprintf(w, " }\n")
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
decodeRune(w, "p", "u", "_")
fmt.Fprintf(w, "}\n")
fmt.Fprintf(w, "op := syntax.EmptyOpContext(l, u)\n")
lut := map[syntax.Op]string{
Expand All @@ -315,16 +312,6 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
syntax.OpEndLine: "EmptyEndLine",
}
fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
// If this isn't the start or end of the string, we also have to check if we match
// the preceding character (zero length op could have matched right before)
fmt.Fprintf(w, "if checkPrevChar {\n")
// decode the character immediately previous to this one (conditional logic above
// guarantees that p is > 0)
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
decodeRune(w, "p", "u", "_")
fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n")
fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op])
fmt.Fprintf(w, "}\n")
fmt.Fprintf(w, "return -1\n")

case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
Expand Down
36 changes: 35 additions & 1 deletion lexer/internal/conformance/conformance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
// Use this to test \b at very start of the string!
{"WordBoundaryTest", `\bWBTEST:`, lexer.Push("WordBoundaryTest")},
},
"ExprTest": {
{"ExprString", `"`, lexer.Push("ExprString")},
Expand Down Expand Up @@ -61,6 +62,7 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
},
"WordBoundaryTest": {
{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
{`WBGroupKeyword`, `(?:90|0)\b`, nil},
{"Slash", `/`, nil},
{"Ident", `\w+`, nil},
{"Whitespace", `\s+`, nil},
Expand Down Expand Up @@ -160,16 +162,48 @@ func testLexer(t *testing.T, lex lexer.Definition) {
{"Whitespace", " "},
{"Ident", "world"},
}},
{"WordBoundarySlash2", `WBTEST:abc/xyz`, []token{
{"WBKeyword", "abc"},
{"Slash", "/"},
{"WBKeyword", "xyz"},
}},
{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
{"Ident", "abchello"},
{"Whitespace", " "},
{"WBKeyword", "xyz"},
{"Whitespace", " "},
{"Ident", "world"},
}},
// Case to ensure \b doesn't match even if only one character after token would match \b
{"WordBoundaryNoMatch1", `WBTEST:abc1 xyz1`, []token{
{"Ident", "abc1"},
{"Whitespace", " "},
{"Ident", "xyz1"},
}},
{"WordBoundaryNoMatch2", `WBTEST:abc12 xyz12`, []token{
{"Ident", "abc12"},
{"Whitespace", " "},
{"Ident", "xyz12"},
}},
{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
{"WBKeyword", "xyz"},
}},
{"WordBoundaryGroupMatch", `WBTEST:hello 90/0 world`, []token{
{"Ident", "hello"},
{"Whitespace", " "},
{"WBGroupKeyword", "90"},
{"Slash", "/"},
{"WBGroupKeyword", "0"},
{"Whitespace", " "},
{"Ident", "world"},
}},
{"WordBoundaryGroupNoMatch", `WBTEST:hello 900 world`, []token{
{"Ident", "hello"},
{"Whitespace", " "},
{"Ident", "900"},
{"Whitespace", " "},
{"Ident", "world"},
}},
}
symbols := lexer.SymbolsByRune(lex)
for _, test := range tests {
Expand Down

0 comments on commit 6ca58cf

Please sign in to comment.