diff --git a/cmd/participle/gen_lexer_cmd.go b/cmd/participle/gen_lexer_cmd.go
index 97f83647..edc0fdcc 100644
--- a/cmd/participle/gen_lexer_cmd.go
+++ b/cmd/participle/gen_lexer_cmd.go
@@ -292,18 +292,15 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 		syntax.OpBeginText, syntax.OpEndText,
 		syntax.OpBeginLine, syntax.OpEndLine:
 		fmt.Fprintf(w, "var l, u rune = -1, -1\n")
-		fmt.Fprintf(w, "var checkPrevChar = false\n")
 		fmt.Fprintf(w, "if p == 0 {\n")
+		fmt.Fprintf(w, " if p < len(s) {\n")
 		decodeRune(w, "0", "u", "_")
+		fmt.Fprintf(w, " }\n")
 		fmt.Fprintf(w, "} else if p == len(s) {\n")
 		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n")
 		fmt.Fprintf(w, "} else {\n")
-		fmt.Fprintf(w, " checkPrevChar = true\n")
-		fmt.Fprintf(w, " var ln int\n")
-		decodeRune(w, "p", "l", "ln")
-		fmt.Fprintf(w, " if p+ln <= len(s) {\n")
-		decodeRune(w, "p+ln", "u", "_")
-		fmt.Fprintf(w, " }\n")
+		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
+		decodeRune(w, "p", "u", "_")
 		fmt.Fprintf(w, "}\n")
 		fmt.Fprintf(w, "op := syntax.EmptyOpContext(l, u)\n")
 		lut := map[syntax.Op]string{
@@ -315,16 +312,6 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
 			syntax.OpEndLine: "EmptyEndLine",
 		}
 		fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
-		// If this isn't the start or end of the string, we also have to check if we match
-		// the preceding character (zero length op could have matched right before)
-		fmt.Fprintf(w, "if checkPrevChar {\n")
-		// decode the character immediately previous to this one (conditional logic above
-		// guarantees that p is > 0)
-		fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
-		decodeRune(w, "p", "u", "_")
-		fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n")
-		fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op])
-		fmt.Fprintf(w, "}\n")
 		fmt.Fprintf(w, "return -1\n")
 
 	case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
diff --git a/lexer/internal/conformance/conformance_test.go b/lexer/internal/conformance/conformance_test.go
index c39a7521..839bbc1d 100644
--- a/lexer/internal/conformance/conformance_test.go
+++ b/lexer/internal/conformance/conformance_test.go
@@ -19,7 +19,8 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
 		{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
 		{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
 		{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
-		{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
+		// Use this to test \b at the very start of the string!
+		{"WordBoundaryTest", `\bWBTEST:`, lexer.Push("WordBoundaryTest")},
 	},
 	"ExprTest": {
 		{"ExprString", `"`, lexer.Push("ExprString")},
@@ -61,6 +62,7 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
 	},
 	"WordBoundaryTest": {
 		{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
+		{`WBGroupKeyword`, `(?:90|0)\b`, nil},
 		{"Slash", `/`, nil},
 		{"Ident", `\w+`, nil},
 		{"Whitespace", `\s+`, nil},
@@ -160,6 +162,11 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 			{"Whitespace", " "},
 			{"Ident", "world"},
 		}},
+		{"WordBoundarySlash2", `WBTEST:abc/xyz`, []token{
+			{"WBKeyword", "abc"},
+			{"Slash", "/"},
+			{"WBKeyword", "xyz"},
+		}},
 		{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
 			{"Ident", "abchello"},
 			{"Whitespace", " "},
@@ -167,9 +174,36 @@ func testLexer(t *testing.T, lex lexer.Definition) {
 			{"Whitespace", " "},
 			{"Ident", "world"},
 		}},
+		// Case to ensure \b doesn't match even if only one character after the token would match \b
+		{"WordBoundaryNoMatch1", `WBTEST:abc1 xyz1`, []token{
+			{"Ident", "abc1"},
+			{"Whitespace", " "},
+			{"Ident", "xyz1"},
+		}},
+		{"WordBoundaryNoMatch2", `WBTEST:abc12 xyz12`, []token{
+			{"Ident", "abc12"},
+			{"Whitespace", " "},
+			{"Ident", "xyz12"},
+		}},
 		{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
 			{"WBKeyword", "xyz"},
 		}},
+		{"WordBoundaryGroupMatch", `WBTEST:hello 90/0 world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"WBGroupKeyword", "90"},
+			{"Slash", "/"},
+			{"WBGroupKeyword", "0"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
+		{"WordBoundaryGroupNoMatch", `WBTEST:hello 900 world`, []token{
+			{"Ident", "hello"},
+			{"Whitespace", " "},
+			{"Ident", "900"},
+			{"Whitespace", " "},
+			{"Ident", "world"},
+		}},
 	}
 	symbols := lexer.SymbolsByRune(lex)
 	for _, test := range tests {