Merge pull request #55 from josharian/utf8-woes

fix unicodeFoldTransformer byte consumed counts
lithammer · May 9, 2023 · 6053418 · 6053418
2 parents e05c30e + d52caf1
commit 6053418
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 4 deletions.
diff --git a/fuzzy/fuzzy.go b/fuzzy/fuzzy.go
@@ -250,22 +250,30 @@ func stringTransform(s string, t transform.Transformer) (transformed string) {
 type unicodeFoldTransformer struct{ transform.NopResetter }
 
 func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
-	n := 0
 	// Converting src to a string allocates.
 	// In theory, it need not; see https://go.dev/issue/27148.
 	// It is possible to write this loop using utf8.DecodeRune
 	// and thereby avoid allocations, but it is noticeably slower.
 	// So just let's wait for the compiler to get smarter.
 	for _, r := range string(src) {
+		size := utf8.RuneLen(r)
+		if r == utf8.RuneError {
+			// Ambiguous: range yields RuneError for an invalid byte (1 byte
+			// consumed) and for a validly encoded U+FFFD (3 bytes consumed).
+			// Re-decode on this rare path to get the true width.
+			_, size = utf8.DecodeRune(src[nSrc:])
+		}
 		r = unicode.ToLower(r)
 		x := utf8.RuneLen(r)
-		if x > len(dst[n:]) {
+		if x > len(dst[nDst:]) {
 			err = transform.ErrShortDst
 			break
 		}
-		n += utf8.EncodeRune(dst[n:], r)
+		nDst += utf8.EncodeRune(dst[nDst:], r)
+		// Count the source bytes only once the rune is actually in dst.
+		nSrc += size
 	}
-	return n, n, err
+	return nDst, nSrc, err
 }
 
 type nopTransformer struct{ transform.NopResetter }

diff --git a/fuzzy/fuzzy_test.go b/fuzzy/fuzzy_test.go
@@ -47,6 +47,8 @@ var fuzzyTests = []struct {
 	{"イ", "イカ", true, 1},
 	{"limón", "limon", false, -1},
 	{"kitten", "setting", false, -1},
+	{"\xffinvalid UTF-8\xff", "", false, -1}, // invalid UTF-8
+	{"Ⱦ", "", false, -1},                     // uppercase and lowercase runes have different UTF-8 encoding lengths
 }
 
 func TestFuzzyMatch(t *testing.T) {