Skip to content

Commit

Permalink
fix unicodeFoldTransformer byte consumed counts
Browse files Browse the repository at this point in the history
This costs some CPU time, but of course it's better than panicking.

Fixes #54

goos: darwin
goarch: arm64
pkg: github.com/lithammer/fuzzysearch/fuzzy
                              │      a      │                 b                  │
                              │   sec/op    │   sec/op     vs base               │
Match-8                         16.08n ± 2%   15.99n ± 1%       ~ (p=0.361 n=10)
MatchBigLate-8                  1.005µ ± 1%   1.005µ ± 0%       ~ (p=0.861 n=10)
MatchBigEarly-8                 12.60n ± 1%   12.56n ± 1%       ~ (p=0.641 n=10)
MatchFold-8                     136.0n ± 1%   144.5n ± 1%  +6.25% (p=0.000 n=10)
MatchFoldBigLate-8              7.071µ ± 5%   7.520µ ± 1%  +6.36% (p=0.000 n=10)
MatchFoldBigEarly-8             6.093µ ± 2%   6.560µ ± 3%  +7.67% (p=0.000 n=10)
RankMatch-8                     17.67n ± 1%   17.56n ± 1%       ~ (p=0.319 n=10)
RankMatchBigLate-8              1.008µ ± 2%   1.008µ ± 1%       ~ (p=0.870 n=10)
RankMatchBigEarly-8             1.228µ ± 1%   1.228µ ± 2%       ~ (p=0.821 n=10)
LevenshteinDistance-8           55.63n ± 1%   55.45n ± 2%       ~ (p=0.225 n=10)
LevenshteinDistanceBigLate-8    20.44µ ± 2%   20.44µ ± 3%       ~ (p=0.739 n=10)
LevenshteinDistanceBigEarly-8   20.47µ ± 3%   20.43µ ± 1%       ~ (p=0.280 n=10)
geomean                         539.4n        547.4n       +1.48%
  • Loading branch information
josharian committed May 4, 2023
1 parent 24e57ae commit d52caf1
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
16 changes: 12 additions & 4 deletions fuzzy/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,22 +250,30 @@ func stringTransform(s string, t transform.Transformer) (transformed string) {
// unicodeFoldTransformer is a transform.Transformer whose Transform method
// lowercases its input rune by rune with unicode.ToLower. It has no fields of
// its own; the embedded transform.NopResetter supplies the Reset method.
type unicodeFoldTransformer struct{ transform.NopResetter }

func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := 0
// Converting src to a string allocates.
// In theory, it need not; see https://go.dev/issue/27148.
// It is possible to write this loop using utf8.DecodeRune
// and thereby avoid allocations, but it is noticeably slower.
// So just let's wait for the compiler to get smarter.
for _, r := range string(src) {
if r == utf8.RuneError {
// Go spec for ranging over a string says:
// If the iteration encounters an invalid UTF-8 sequence,
// the second value will be 0xFFFD, the Unicode replacement character,
// and the next iteration will advance a single byte in the string.
nSrc++
} else {
nSrc += utf8.RuneLen(r)
}
r = unicode.ToLower(r)
x := utf8.RuneLen(r)
if x > len(dst[n:]) {
if x > len(dst[nDst:]) {
err = transform.ErrShortDst
break
}
n += utf8.EncodeRune(dst[n:], r)
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return n, n, err
return nDst, nSrc, err
}

// nopTransformer embeds transform.NopResetter, like unicodeFoldTransformer.
// NOTE(review): its Transform method is outside this excerpt; the name
// suggests it passes input through unchanged — confirm against the full file.
type nopTransformer struct{ transform.NopResetter }
Expand Down
2 changes: 2 additions & 0 deletions fuzzy/fuzzy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ var fuzzyTests = []struct {
{"イ", "イカ", true, 1},
{"limón", "limon", false, -1},
{"kitten", "setting", false, -1},
{"\xffinvalid UTF-8\xff", "", false, -1}, // invalid UTF-8
{"Ⱦ", "", false, -1}, // uppercase and lowercase runes have different UTF-8 encoding lengths
}

func TestFuzzyMatch(t *testing.T) {
Expand Down

0 comments on commit d52caf1

Please sign in to comment.