Skip to content

Commit

Permalink
Merge pull request #14 from f1monkey/feature/refactoring
Browse files Browse the repository at this point in the history
Feature/refactoring

Remove unused code
Improve overall performance: +20% speed, -36% memory consumption, -98% allocation count according to benchmarks
  • Loading branch information
cyradin authored Jun 18, 2024
2 parents e3e4943 + 1a0e302 commit 419df6d
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 52 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,10 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
goos: linux
goarch: amd64
pkg: github.com/f1monkey/spellchecker
cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics
Benchmark_Norvig1-16 242 4861057 ns/op 74.07 success_percent 200.0 success_words 270.0 total_words 1643485 B/op 88241 allocs/op
cpu: 13th Gen Intel(R) Core(TM) i9-13980HX
Benchmark_Norvig1-32 294 3876229 ns/op 74.07 success_percent 200.0 success_words 270.0 total_words 918275 B/op 2150 allocs/op
PASS
ok github.com/f1monkey/spellchecker 3.343s
ok github.com/f1monkey/spellchecker 3.378s
```

#### [Test set 2](http://norvig.com/spell-testset2.txt):
Expand All @@ -107,8 +107,8 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
goos: linux
goarch: amd64
pkg: github.com/f1monkey/spellchecker
cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics
Benchmark_Norvig2-16 150 7226006 ns/op 70.00 success_percent 280.0 success_words 400.0 total_words 2389231 B/op 129486 allocs/op
cpu: 13th Gen Intel(R) Core(TM) i9-13980HX
Benchmark_Norvig2-32 198 6102429 ns/op 70.00 success_percent 280.0 success_words 400.0 total_words 1327385 B/op 3121 allocs/op
PASS
ok github.com/f1monkey/spellchecker 3.244s
ok github.com/f1monkey/spellchecker 3.895s
```
61 changes: 19 additions & 42 deletions dictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@ import (
"encoding/gob"
"math"
"sort"
"sync"
"sync/atomic"

"github.com/agnivade/levenshtein"
"github.com/f1monkey/bitmap"
)

type dictionary struct {
mtx sync.RWMutex

maxErrors int
alphabet alphabet
nextID func() uint32
Expand Down Expand Up @@ -46,25 +43,16 @@ func newDictionary(ab string, maxErrors int) (*dictionary, error) {

// id returns the ID of the word, or 0 if the word is not found
func (d *dictionary) id(word string) uint32 {
d.mtx.RLock()
defer d.mtx.RUnlock()

return d.ids[word]
}

// has checks whether the word is present in the dictionary
func (d *dictionary) has(word string) bool {
d.mtx.RLock()
defer d.mtx.RUnlock()

return d.ids[word] > 0
}

// add puts the word into the dictionary
func (d *dictionary) add(word string) (uint32, error) {
d.mtx.Lock()
defer d.mtx.Unlock()

id := d.nextID()
d.ids[word] = id

Expand All @@ -79,9 +67,6 @@ func (d *dictionary) add(word string) (uint32, error) {

// inc increases the word occurrence counter
func (d *dictionary) inc(id uint32) {
d.mtx.Lock()
defer d.mtx.Unlock()

_, ok := d.counts[id]
if !ok {
return
Expand All @@ -94,16 +79,13 @@ type match struct {
Score float64
}

func (d *dictionary) Find(word string, n int) []match {
d.mtx.RLock()
defer d.mtx.RUnlock()

func (d *dictionary) find(word string, n int) []match {
if d.maxErrors <= 0 {
return nil
}

bm := d.alphabet.encode([]rune(word))
candidates := d.getCandidates(word, bm, 1)
candidates := d.getCandidates(word, bm)
result := calcScores([]rune(word), candidates)

if len(result) < n {
Expand All @@ -119,14 +101,11 @@ type сandidate struct {
Count int
}

func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt int) []сandidate {
checked := make(map[uint64]struct{}, d.alphabet.len()*2)

func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32) []сandidate {
result := make([]сandidate, 0, 50)

// "exact match" OR "candidate has all the same letters as the word but in different order"
key := sum(bmSrc)
checked[key] = struct{}{}
ids := d.index[key]
for _, id := range ids {
docWord, ok := d.words[id]
Expand All @@ -151,7 +130,7 @@ func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt in
}

// @todo perform phonetic analysis with early termination here
for bm := range d.computeCandidateBitmaps(word, bmSrc) {
for bm := range d.computeCandidateBitmaps(bmSrc) {
ids := d.index[bm]
for _, id := range ids {
docWord, ok := d.words[id]
Expand All @@ -174,30 +153,34 @@ func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt in
return result
}

func (d *dictionary) computeCandidateBitmaps(word string, bmSrc bitmap.Bitmap32) map[uint64]struct{} {
func (d *dictionary) computeCandidateBitmaps(bmSrc bitmap.Bitmap32) map[uint64]struct{} {
bitmaps := make(map[uint64]struct{}, d.alphabet.len()*5)
bmSrc = bmSrc.Clone()

var i, j uint32
// flip one bit
for i := 0; i < d.alphabet.len(); i++ {
bit := uint32(i)
bmCandidate := bmSrc.Clone()
bmCandidate.Xor(bit)
for i = 0; i < uint32(d.alphabet.len()); i++ {
bmSrc.Xor(i)

// flip one more bit to be able to fix:
// - two deletions ("rang" => "orange")
// - replacements ("problam" => "problem")
for j := 0; j < d.alphabet.len(); j++ {
bit := uint32(j)
bmCandidate := bmCandidate.Clone()
bmCandidate.Xor(bit)
key := sum(bmCandidate)
for j = 0; j < uint32(d.alphabet.len()); j++ {
if i == j {
continue
}

bmSrc.Xor(j)
key := sum(bmSrc)
bmSrc.Xor(j) // restore the flipped bit
if len(d.index[key]) == 0 {
continue
}
bitmaps[key] = struct{}{}
}

key := sum(bmCandidate)
key := sum(bmSrc)
bmSrc.Xor(i) // restore the flipped bit
if len(d.index[key]) == 0 {
continue
}
Expand Down Expand Up @@ -249,9 +232,6 @@ type dictData struct {
}

func (d *dictionary) MarshalBinary() ([]byte, error) {
d.mtx.Lock()
defer d.mtx.Unlock()

data := &dictData{
Alphabet: d.alphabet,
IDs: d.ids,
Expand All @@ -271,9 +251,6 @@ func (d *dictionary) MarshalBinary() ([]byte, error) {
}

func (d *dictionary) UnmarshalBinary(data []byte) error {
d.mtx.Lock()
defer d.mtx.Unlock()

dictData := &dictData{}
err := gob.NewDecoder(bytes.NewBuffer(data)).Decode(dictData)
if err != nil {
Expand Down
8 changes: 4 additions & 4 deletions spellchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ func (s *Spellchecker) Fix(word string) (string, error) {
return word, nil
}

hits := s.dict.Find(word, 1)
hits := s.dict.find(word, 1)
if len(hits) == 0 {
return word, fmt.Errorf("%w: %s", ErrUnknownWord, word)
return word, ErrUnknownWord
}

return hits[0].Value, nil
Expand All @@ -113,9 +113,9 @@ func (s *Spellchecker) Suggest(word string, n int) ([]string, error) {
return []string{word}, nil
}

hits := s.dict.Find(word, n)
hits := s.dict.find(word, n)
if len(hits) == 0 {
return []string{word}, fmt.Errorf("%w: %s", ErrUnknownWord, word)
return []string{word}, ErrUnknownWord
}

result := make([]string, len(hits))
Expand Down

0 comments on commit 419df6d

Please sign in to comment.