From d52caf1404fa50739ba1ecee4ffc4b2b45d16d80 Mon Sep 17 00:00:00 2001
From: Josh Bleecher Snyder <josharian@gmail.com>
Date: Thu, 4 May 2023 11:30:55 -0700
Subject: [PATCH] fix unicodeFoldTransformer byte consumed counts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

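Transform used to report the number of bytes written to dst as the
number of bytes consumed from src. The two differ when src contains
invalid UTF-8, or a rune whose lowercase form has a different UTF-8
encoded length (e.g. Ⱦ). Count consumed and written bytes separately.

This costs some CPU time, but of course it's better than panicking.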

Fixes #54

goos: darwin
goarch: arm64
pkg: github.com/lithammer/fuzzysearch/fuzzy
                              │      a      │                 b                  │
                              │   sec/op    │   sec/op     vs base               │
Match-8                         16.08n ± 2%   15.99n ± 1%       ~ (p=0.361 n=10)
MatchBigLate-8                  1.005µ ± 1%   1.005µ ± 0%       ~ (p=0.861 n=10)
MatchBigEarly-8                 12.60n ± 1%   12.56n ± 1%       ~ (p=0.641 n=10)
MatchFold-8                     136.0n ± 1%   144.5n ± 1%  +6.25% (p=0.000 n=10)
MatchFoldBigLate-8              7.071µ ± 5%   7.520µ ± 1%  +6.36% (p=0.000 n=10)
MatchFoldBigEarly-8             6.093µ ± 2%   6.560µ ± 3%  +7.67% (p=0.000 n=10)
RankMatch-8                     17.67n ± 1%   17.56n ± 1%       ~ (p=0.319 n=10)
RankMatchBigLate-8              1.008µ ± 2%   1.008µ ± 1%       ~ (p=0.870 n=10)
RankMatchBigEarly-8             1.228µ ± 1%   1.228µ ± 2%       ~ (p=0.821 n=10)
LevenshteinDistance-8           55.63n ± 1%   55.45n ± 2%       ~ (p=0.225 n=10)
LevenshteinDistanceBigLate-8    20.44µ ± 2%   20.44µ ± 3%       ~ (p=0.739 n=10)
LevenshteinDistanceBigEarly-8   20.47µ ± 3%   20.43µ ± 1%       ~ (p=0.280 n=10)
geomean                         539.4n        547.4n       +1.48%
---
 fuzzy/fuzzy.go      | 16 ++++++++++++----
 fuzzy/fuzzy_test.go |  2 ++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/fuzzy/fuzzy.go b/fuzzy/fuzzy.go
index 98b06be..cee5f90 100644
--- a/fuzzy/fuzzy.go
+++ b/fuzzy/fuzzy.go
@@ -250,22 +250,30 @@ func stringTransform(s string, t transform.Transformer) (transformed string) {
 type unicodeFoldTransformer struct{ transform.NopResetter }
 
 func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
-	n := 0
 	// Converting src to a string allocates.
 	// In theory, it need not; see https://go.dev/issue/27148.
 	// It is possible to write this loop using utf8.DecodeRune
 	// and thereby avoid allocations, but it is noticeably slower.
 	// So just let's wait for the compiler to get smarter.
 	for _, r := range string(src) {
+		if r == utf8.RuneError {
+			// Go spec for ranging over a string says:
+			// If the iteration encounters an invalid UTF-8 sequence,
+			// the second value will be 0xFFFD, the Unicode replacement character,
+			// and the next iteration will advance a single byte in the string.
+			nSrc++
+		} else {
+			nSrc += utf8.RuneLen(r)
+		}
 		r = unicode.ToLower(r)
 		x := utf8.RuneLen(r)
-		if x > len(dst[n:]) {
+		if x > len(dst[nDst:]) {
 			err = transform.ErrShortDst
 			break
 		}
-		n += utf8.EncodeRune(dst[n:], r)
+		nDst += utf8.EncodeRune(dst[nDst:], r)
 	}
-	return n, n, err
+	return nDst, nSrc, err
 }
 
 type nopTransformer struct{ transform.NopResetter }
diff --git a/fuzzy/fuzzy_test.go b/fuzzy/fuzzy_test.go
index ffecfed..c744fee 100644
--- a/fuzzy/fuzzy_test.go
+++ b/fuzzy/fuzzy_test.go
@@ -47,6 +47,8 @@ var fuzzyTests = []struct {
 	{"イ", "イカ", true, 1},
 	{"limón", "limon", false, -1},
 	{"kitten", "setting", false, -1},
+	{"\xffinvalid UTF-8\xff", "", false, -1}, // invalid UTF-8
+	{"Ⱦ", "", false, -1},                     // uppercase and lowercase runes have different UTF-8 encoding lengths
 }
 
 func TestFuzzyMatch(t *testing.T) {