From d52caf1404fa50739ba1ecee4ffc4b2b45d16d80 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Thu, 4 May 2023 11:30:55 -0700 Subject: [PATCH] fix unicodeFoldTransformer byte consumed counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This costs some CPU time, but of course it's better than panicking. Fixes #54 goos: darwin goarch: arm64 pkg: github.com/lithammer/fuzzysearch/fuzzy │ a │ b │ │ sec/op │ sec/op vs base │ Match-8 16.08n ± 2% 15.99n ± 1% ~ (p=0.361 n=10) MatchBigLate-8 1.005µ ± 1% 1.005µ ± 0% ~ (p=0.861 n=10) MatchBigEarly-8 12.60n ± 1% 12.56n ± 1% ~ (p=0.641 n=10) MatchFold-8 136.0n ± 1% 144.5n ± 1% +6.25% (p=0.000 n=10) MatchFoldBigLate-8 7.071µ ± 5% 7.520µ ± 1% +6.36% (p=0.000 n=10) MatchFoldBigEarly-8 6.093µ ± 2% 6.560µ ± 3% +7.67% (p=0.000 n=10) RankMatch-8 17.67n ± 1% 17.56n ± 1% ~ (p=0.319 n=10) RankMatchBigLate-8 1.008µ ± 2% 1.008µ ± 1% ~ (p=0.870 n=10) RankMatchBigEarly-8 1.228µ ± 1% 1.228µ ± 2% ~ (p=0.821 n=10) LevenshteinDistance-8 55.63n ± 1% 55.45n ± 2% ~ (p=0.225 n=10) LevenshteinDistanceBigLate-8 20.44µ ± 2% 20.44µ ± 3% ~ (p=0.739 n=10) LevenshteinDistanceBigEarly-8 20.47µ ± 3% 20.43µ ± 1% ~ (p=0.280 n=10) geomean 539.4n 547.4n +1.48% --- fuzzy/fuzzy.go | 16 ++++++++++++---- fuzzy/fuzzy_test.go | 2 ++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fuzzy/fuzzy.go b/fuzzy/fuzzy.go index 98b06be..cee5f90 100644 --- a/fuzzy/fuzzy.go +++ b/fuzzy/fuzzy.go @@ -250,22 +250,30 @@ func stringTransform(s string, t transform.Transformer) (transformed string) { type unicodeFoldTransformer struct{ transform.NopResetter } func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { - n := 0 // Converting src to a string allocates. // In theory, it need not; see https://go.dev/issue/27148. // It is possible to write this loop using utf8.DecodeRune // and thereby avoid allocations, but it is noticeably slower. // So just let's wait for the compiler to get smarter. for _, r := range string(src) { + if r == utf8.RuneError { + // Go spec for ranging over a string says: + // If the iteration encounters an invalid UTF-8 sequence, + // the second value will be 0xFFFD, the Unicode replacement character, + // and the next iteration will advance a single byte in the string. + nSrc++ + } else { + nSrc += utf8.RuneLen(r) + } r = unicode.ToLower(r) x := utf8.RuneLen(r) - if x > len(dst[n:]) { + if x > len(dst[nDst:]) { err = transform.ErrShortDst break } - n += utf8.EncodeRune(dst[n:], r) + nDst += utf8.EncodeRune(dst[nDst:], r) } - return n, n, err + return nDst, nSrc, err } type nopTransformer struct{ transform.NopResetter } diff --git a/fuzzy/fuzzy_test.go b/fuzzy/fuzzy_test.go index ffecfed..c744fee 100644 --- a/fuzzy/fuzzy_test.go +++ b/fuzzy/fuzzy_test.go @@ -47,6 +47,8 @@ var fuzzyTests = []struct { {"イ", "イカ", true, 1}, {"limón", "limon", false, -1}, {"kitten", "setting", false, -1}, + {"\xffinvalid UTF-8\xff", "", false, -1}, // invalid UTF-8 + {"Ⱦ", "", false, -1}, // uppercase and lowercase runes have different UTF-8 encoding lengths } func TestFuzzyMatch(t *testing.T) {