Skip to content

Commit

Permalink
Merge pull request #55 from josharian/utf8-woes
Browse files Browse the repository at this point in the history
fix unicodeFoldTransformer byte consumed counts
  • Loading branch information
lithammer authored May 9, 2023
2 parents e05c30e + d52caf1 commit 6053418
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
16 changes: 12 additions & 4 deletions fuzzy/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,22 +250,30 @@ func stringTransform(s string, t transform.Transformer) (transformed string) {
type unicodeFoldTransformer struct{ transform.NopResetter }

func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := 0
// Converting src to a string allocates.
// In theory, it need not; see https://go.dev/issue/27148.
// It is possible to write this loop using utf8.DecodeRune
// and thereby avoid allocations, but it is noticeably slower.
// So just let's wait for the compiler to get smarter.
for _, r := range string(src) {
if r == utf8.RuneError {
// Go spec for ranging over a string says:
// If the iteration encounters an invalid UTF-8 sequence,
// the second value will be 0xFFFD, the Unicode replacement character,
// and the next iteration will advance a single byte in the string.
nSrc++
} else {
nSrc += utf8.RuneLen(r)
}
r = unicode.ToLower(r)
x := utf8.RuneLen(r)
if x > len(dst[n:]) {
if x > len(dst[nDst:]) {
err = transform.ErrShortDst
break
}
n += utf8.EncodeRune(dst[n:], r)
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return n, n, err
return nDst, nSrc, err
}

// nopTransformer embeds transform.NopResetter, which supplies a no-op Reset.
// Its remaining methods are not visible in this excerpt.
// NOTE(review): the name suggests it passes input through unchanged — confirm
// against its Transform method.
type nopTransformer struct{ transform.NopResetter }
Expand Down
2 changes: 2 additions & 0 deletions fuzzy/fuzzy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ var fuzzyTests = []struct {
{"イ", "イカ", true, 1},
{"limón", "limon", false, -1},
{"kitten", "setting", false, -1},
{"\xffinvalid UTF-8\xff", "", false, -1}, // invalid UTF-8
{"Ⱦ", "", false, -1}, // uppercase and lowercase runes have different UTF-8 encoding lengths
}

func TestFuzzyMatch(t *testing.T) {
Expand Down

0 comments on commit 6053418

Please sign in to comment.