Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add normalized and normalized-fold varieties of match/find/rank #16

Merged
merged 2 commits into from
Dec 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: Go
on: [push]
on: [push, pull_request]
jobs:
test:
strategy:
Expand Down
126 changes: 103 additions & 23 deletions fuzzy/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,48 @@
package fuzzy

import (
"bytes"
"unicode"
"unicode/utf8"

"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)

var noop = func(r rune) rune { return r }
var foldTransformer = unicodeFoldTransformer{}
var noopTransformer = transform.Nop
var normalizeTransformer = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
var normalizeFoldTransformer = transform.Chain(normalizeTransformer, foldTransformer)

// Match returns true if source matches target using a fuzzy-searching
// algorithm. Note that it doesn't implement Levenshtein distance (see
// RankMatch instead), but rather a simplified version where there's no
// approximation. The method will return true only if each character in the
// source can be found in the target and occurs after the preceding matches.
func Match(source, target string) bool {
return match(source, target, noop)
return match(source, target, noopTransformer)
}

// MatchFold is a case-insensitive version of Match.
func MatchFold(source, target string) bool {
return match(source, target, unicode.ToLower)
return match(source, target, foldTransformer)
}

// MatchNormalized is a unicode-normalized version of Match. Both source and
// target are passed through normalizeTransformer (NFD decomposition, removal
// of nonspacing marks in unicode.Mn, NFC recomposition) before the fuzzy
// comparison, so runes that differ only by such marks compare as equal.
func MatchNormalized(source, target string) bool {
return match(source, target, normalizeTransformer)
}

func match(source, target string, fn func(rune) rune) bool {
// MatchNormalizedFold is a unicode-normalized and case-insensitive version of Match.
// Both strings are transformed with normalizeFoldTransformer, which chains
// normalizeTransformer (mark-stripping normalization) with foldTransformer
// (per-rune unicode.ToLower), before the fuzzy comparison.
func MatchNormalizedFold(source, target string) bool {
return match(source, target, normalizeFoldTransformer)
}

func match(source, target string, transformer transform.Transformer) bool {
source = stringTransform(source, transformer)
target = stringTransform(target, transformer)

lenDiff := len(target) - len(source)

if lenDiff < 0 {
Expand All @@ -37,7 +58,7 @@ func match(source, target string, fn func(rune) rune) bool {
Outer:
for _, r1 := range source {
for i, r2 := range target {
if fn(r1) == fn(r2) {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
}
Expand All @@ -50,19 +71,29 @@ Outer:

// Find will return a list of strings in targets that fuzzy matches source.
func Find(source string, targets []string) []string {
return find(source, targets, noop)
return find(source, targets, noopTransformer)
}

// FindFold is a case-insensitive version of Find.
func FindFold(source string, targets []string) []string {
return find(source, targets, unicode.ToLower)
return find(source, targets, foldTransformer)
}

func find(source string, targets []string, fn func(rune) rune) []string {
// FindNormalized is a unicode-normalized version of Find. Each entry of
// targets is tested against source using normalizeTransformer; the entries
// collected are the original target strings, not their normalized forms.
func FindNormalized(source string, targets []string) []string {
return find(source, targets, normalizeTransformer)
}

// FindNormalizedFold is a unicode-normalized and case-insensitive version of Find.
// Each entry of targets is tested against source using
// normalizeFoldTransformer; the entries collected are the original target
// strings, not their transformed forms.
func FindNormalizedFold(source string, targets []string) []string {
return find(source, targets, normalizeFoldTransformer)
}

func find(source string, targets []string, transformer transform.Transformer) []string {
var matches []string

for _, target := range targets {
if match(source, target, fn) {
if match(source, target, transformer) {
matches = append(matches, target)
}
}
Expand All @@ -77,21 +108,34 @@ func find(source string, targets []string, fn func(rune) rune) []string {
// the Levenshtein calculation, only deletions need be considered, required
// additions and substitutions would fail the match test.
func RankMatch(source, target string) int {
return rank(source, target, noop)
return rank(source, target, noopTransformer)
}

// RankMatchFold is a case-insensitive version of RankMatch.
func RankMatchFold(source, target string) int {
return rank(source, target, unicode.ToLower)
return rank(source, target, foldTransformer)
}

func rank(source, target string, fn func(rune) rune) int {
// RankMatchNormalized is a unicode-normalized version of RankMatch. It
// delegates to rank with normalizeTransformer.
//
// NOTE(review): rank compares the byte lengths of the raw strings before it
// applies the transformer, so a source that is longer than target only
// because of combining marks is reported as -1 even though the normalized
// forms would match — confirm whether this is intended.
func RankMatchNormalized(source, target string) int {
return rank(source, target, normalizeTransformer)
}

// RankMatchNormalizedFold is a unicode-normalized and case-insensitive version of RankMatch.
// It delegates to rank with normalizeFoldTransformer.
//
// NOTE(review): rank compares the byte lengths of the raw strings before it
// applies the transformer, so inputs whose byte length changes under
// normalization or case folding may be rejected with -1 even though their
// transformed forms would match — confirm whether this is intended.
func RankMatchNormalizedFold(source, target string) int {
return rank(source, target, normalizeFoldTransformer)
}

func rank(source, target string, transformer transform.Transformer) int {
lenDiff := len(target) - len(source)

if lenDiff < 0 {
return -1
}

source = stringTransform(source, transformer)
target = stringTransform(target, transformer)

if lenDiff == 0 && source == target {
return 0
}
Expand All @@ -101,7 +145,7 @@ func rank(source, target string, fn func(rune) rune) int {
Outer:
for _, r1 := range source {
for i, r2 := range target {
if fn(r1) == fn(r2) {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
} else {
Expand All @@ -120,23 +164,29 @@ Outer:
// RankFind is similar to Find, except it will also rank all matches using
// Levenshtein distance.
func RankFind(source string, targets []string) Ranks {
var r Ranks

for index, target := range targets {
if match(source, target, noop) {
distance := LevenshteinDistance(source, target)
r = append(r, Rank{source, target, distance, index})
}
}
return r
return rankFind(source, targets, noopTransformer)
}

// RankFindFold is a case-insensitive version of RankFind. Targets are
// selected by matching with foldTransformer (per-rune unicode.ToLower); the
// distance recorded for each match is LevenshteinDistance of the original,
// untransformed source and target.
func RankFindFold(source string, targets []string) Ranks {
return rankFind(source, targets, foldTransformer)
}

// RankFindNormalized is a unicode-normalized version of RankFind. Targets
// are selected by matching with normalizeTransformer; the distance recorded
// for each match is LevenshteinDistance of the original, untransformed source
// and target.
func RankFindNormalized(source string, targets []string) Ranks {
return rankFind(source, targets, normalizeTransformer)
}

// RankFindNormalizedFold is a unicode-normalized and case-insensitive version of RankFind.
// Targets are selected by matching with normalizeFoldTransformer; the
// distance recorded for each match is LevenshteinDistance of the original,
// untransformed source and target.
func RankFindNormalizedFold(source string, targets []string) Ranks {
return rankFind(source, targets, normalizeFoldTransformer)
}

func rankFind(source string, targets []string, transformer transform.Transformer) Ranks {
var r Ranks

for index, target := range targets {
if match(source, target, unicode.ToLower) {
if match(source, target, transformer) {
distance := LevenshteinDistance(source, target)
r = append(r, Rank{source, target, distance, index})
}
Expand Down Expand Up @@ -171,3 +221,33 @@ func (r Ranks) Swap(i, j int) {
func (r Ranks) Less(i, j int) bool {
return r[i].Distance < r[j].Distance
}

// stringTransform runs s through t and returns the result. If the
// transformation reports an error, the input string is returned unchanged so
// that callers always get a usable value.
func stringTransform(s string, t transform.Transformer) (transformed string) {
	result, _, err := transform.String(t, s)
	if err != nil {
		// Best effort: fall back to the untouched input rather than
		// surfacing the transformation error.
		return s
	}

	return result
}

// unicodeFoldTransformer is a stateless transform.Transformer that lower-cases
// its input one rune at a time using unicode.ToLower.
type unicodeFoldTransformer struct{}

// Transform writes the lower-cased form of src into dst.
//
// nDst is the number of bytes written to dst and nSrc the number of bytes
// consumed from src. The two are tracked separately because lower-casing can
// change a rune's UTF-8 length (for example U+212A KELVIN SIGN, three bytes,
// becomes 'k', one byte), and because each invalid input byte is replaced by
// U+FFFD, which is three bytes long.
//
// Only whole runes are ever written. If the next rune does not fit in the
// remaining space of dst, transform.ErrShortDst is returned with the counts
// up to the last complete rune. If src ends in the middle of a multi-byte
// sequence and atEOF is false, transform.ErrShortSrc is returned so the caller
// can supply the remaining bytes.
func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	rd := bytes.NewReader(src)
	for {
		r, size, readErr := rd.ReadRune()
		if readErr != nil {
			// A bytes.Reader only fails with io.EOF: all of src is consumed.
			return nDst, nSrc, nil
		}

		// (RuneError, 1) means an invalid byte or a truncated sequence. A
		// truncated tail may still be completed by more input, so ask for it
		// instead of emitting U+FFFD prematurely.
		if r == utf8.RuneError && size == 1 && !atEOF && !utf8.FullRune(src[nSrc:]) {
			return nDst, nSrc, transform.ErrShortSrc
		}

		lower := unicode.ToLower(r)
		if utf8.RuneLen(lower) > len(dst)-nDst {
			return nDst, nSrc, transform.ErrShortDst
		}

		nDst += utf8.EncodeRune(dst[nDst:], lower)
		nSrc += size
	}
}

// Reset implements transform.Transformer. The transformer keeps no state, so
// there is nothing to reset.
func (unicodeFoldTransformer) Reset() {}
Loading