Skip to content

Commit

Permalink
More tests and fixes for gen lexer word boundary
Browse files Browse the repository at this point in the history
Third time's the charm!

The correct word boundary test when the position is NOT at the beginning
and NOT at the end is to always test the character before the current
position and the character at the current position.

In the first version, it tested the character at the current position
and the next character after the current position.

In the second version, it kept the first version's check and, in addition,
tested the character before the current position and the character at the
current position — so it performed both checks, one of them redundant.

The correct solution — testing only the character before the current position
and the character at the current position — is simpler and easier to reason about.

Added more conformance tests to cover the changes, including more cases
to ensure \b does NOT match when it should not.
  • Loading branch information
klondikedragon authored and alecthomas committed Oct 28, 2022
1 parent fb225ea commit 6ca58cf
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 18 deletions.
21 changes: 4 additions & 17 deletions cmd/participle/gen_lexer_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,15 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
syntax.OpBeginText, syntax.OpEndText,
syntax.OpBeginLine, syntax.OpEndLine:
fmt.Fprintf(w, "var l, u rune = -1, -1\n")
fmt.Fprintf(w, "var checkPrevChar = false\n")
fmt.Fprintf(w, "if p == 0 {\n")
fmt.Fprintf(w, " if p < len(s) {\n")
decodeRune(w, "0", "u", "_")
fmt.Fprintf(w, " }\n")
fmt.Fprintf(w, "} else if p == len(s) {\n")
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s)\n")
fmt.Fprintf(w, "} else {\n")
fmt.Fprintf(w, " checkPrevChar = true\n")
fmt.Fprintf(w, " var ln int\n")
decodeRune(w, "p", "l", "ln")
fmt.Fprintf(w, " if p+ln <= len(s) {\n")
decodeRune(w, "p+ln", "u", "_")
fmt.Fprintf(w, " }\n")
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
decodeRune(w, "p", "u", "_")
fmt.Fprintf(w, "}\n")
fmt.Fprintf(w, "op := syntax.EmptyOpContext(l, u)\n")
lut := map[syntax.Op]string{
Expand All @@ -315,16 +312,6 @@ func generateRegexMatch(w io.Writer, lexerName, name, pattern string) error {
syntax.OpEndLine: "EmptyEndLine",
}
fmt.Fprintf(w, "if op & syntax.%s != 0 { return p }\n", lut[re.Op])
// If this isn't the start or end of the string, we also have to check if we match
// the preceding character (zero length op could have matched right before)
fmt.Fprintf(w, "if checkPrevChar {\n")
// decode the character immediately previous to this one (conditional logic above
// guarantees that p is > 0)
fmt.Fprintf(w, " l, _ = utf8.DecodeLastRuneInString(s[0:p])\n")
decodeRune(w, "p", "u", "_")
fmt.Fprintf(w, " op := syntax.EmptyOpContext(l, u)\n")
fmt.Fprintf(w, " if op & syntax.%s != 0 { return p }\n", lut[re.Op])
fmt.Fprintf(w, "}\n")
fmt.Fprintf(w, "return -1\n")

case syntax.OpCapture: // capturing subexpression with index Cap, optional name Name
Expand Down
36 changes: 35 additions & 1 deletion lexer/internal/conformance/conformance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
{"ExprTest", `EXPRTEST:`, lexer.Push("ExprTest")},
{"LiteralTest", `LITTEST:`, lexer.Push("LiteralTest")},
{"CaseInsensitiveTest", `CITEST:`, lexer.Push("CaseInsensitiveTest")},
{"WordBoundaryTest", `WBTEST:`, lexer.Push("WordBoundaryTest")},
// Use this to test \b at very start of the string!
{"WordBoundaryTest", `\bWBTEST:`, lexer.Push("WordBoundaryTest")},
},
"ExprTest": {
{"ExprString", `"`, lexer.Push("ExprString")},
Expand Down Expand Up @@ -61,6 +62,7 @@ var conformanceLexer = lexer.MustStateful(lexer.Rules{
},
"WordBoundaryTest": {
{`WBKeyword`, `\b(?:abc|xyz)\b`, nil},
{`WBGroupKeyword`, `(?:90|0)\b`, nil},
{"Slash", `/`, nil},
{"Ident", `\w+`, nil},
{"Whitespace", `\s+`, nil},
Expand Down Expand Up @@ -160,16 +162,48 @@ func testLexer(t *testing.T, lex lexer.Definition) {
{"Whitespace", " "},
{"Ident", "world"},
}},
{"WordBoundarySlash2", `WBTEST:abc/xyz`, []token{
{"WBKeyword", "abc"},
{"Slash", "/"},
{"WBKeyword", "xyz"},
}},
{"WordBoundaryWhitespace", `WBTEST:abchello xyz world`, []token{
{"Ident", "abchello"},
{"Whitespace", " "},
{"WBKeyword", "xyz"},
{"Whitespace", " "},
{"Ident", "world"},
}},
// Case to ensure \b doesn't match even if only one character after token would match \b
{"WordBoundaryNoMatch1", `WBTEST:abc1 xyz1`, []token{
{"Ident", "abc1"},
{"Whitespace", " "},
{"Ident", "xyz1"},
}},
{"WordBoundaryNoMatch2", `WBTEST:abc12 xyz12`, []token{
{"Ident", "abc12"},
{"Whitespace", " "},
{"Ident", "xyz12"},
}},
{"WordBoundaryStartEnd", `WBTEST:xyz`, []token{
{"WBKeyword", "xyz"},
}},
{"WordBoundaryGroupMatch", `WBTEST:hello 90/0 world`, []token{
{"Ident", "hello"},
{"Whitespace", " "},
{"WBGroupKeyword", "90"},
{"Slash", "/"},
{"WBGroupKeyword", "0"},
{"Whitespace", " "},
{"Ident", "world"},
}},
{"WordBoundaryGroupNoMatch", `WBTEST:hello 900 world`, []token{
{"Ident", "hello"},
{"Whitespace", " "},
{"Ident", "900"},
{"Whitespace", " "},
{"Ident", "world"},
}},
}
symbols := lexer.SymbolsByRune(lex)
for _, test := range tests {
Expand Down

0 comments on commit 6ca58cf

Please sign in to comment.