Merge pull request #14 from f1monkey/feature/refactoring

Feature/refactoring: remove unused code; improve overall performance (+20% speed, -36% memory consumption, -98% allocation count according to benchmarks).
f1monkey · Jun 18, 2024 · 419df6d · 419df6d
2 parents e3e4943 + 1a0e302
commit 419df6d
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -93,10 +93,10 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
 goos: linux
 goarch: amd64
 pkg: github.com/f1monkey/spellchecker
-cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics     
-Benchmark_Norvig1-16    	     242	   4861057 ns/op	        74.07 success_percent	       200.0 success_words	       270.0 total_words	 1643485 B/op	   88241 allocs/op
+cpu: 13th Gen Intel(R) Core(TM) i9-13980HX
+Benchmark_Norvig1-32    	     294	   3876229 ns/op	        74.07 success_percent	       200.0 success_words	       270.0 total_words	  918275 B/op	    2150 allocs/op
 PASS
-ok  	github.com/f1monkey/spellchecker	3.343s
+ok  	github.com/f1monkey/spellchecker	3.378s
 ```
 
 #### [Test set 2](http://norvig.com/spell-testset2.txt):
@@ -107,8 +107,8 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
 goos: linux
 goarch: amd64
 pkg: github.com/f1monkey/spellchecker
-cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics     
-Benchmark_Norvig2-16    	     150	   7226006 ns/op	        70.00 success_percent	       280.0 success_words	       400.0 total_words	 2389231 B/op	  129486 allocs/op
+cpu: 13th Gen Intel(R) Core(TM) i9-13980HX
+Benchmark_Norvig2-32    	     198	   6102429 ns/op	        70.00 success_percent	       280.0 success_words	       400.0 total_words	 1327385 B/op	    3121 allocs/op
 PASS
-ok  	github.com/f1monkey/spellchecker	3.244s
+ok  	github.com/f1monkey/spellchecker	3.895s
 ```
diff --git a/dictionary.go b/dictionary.go
@@ -6,16 +6,13 @@ import (
 	"encoding/gob"
 	"math"
 	"sort"
-	"sync"
 	"sync/atomic"
 
 	"github.com/agnivade/levenshtein"
 	"github.com/f1monkey/bitmap"
 )
 
 type dictionary struct {
-	mtx sync.RWMutex
-
 	maxErrors int
 	alphabet  alphabet
 	nextID    func() uint32
@@ -46,25 +43,16 @@ func newDictionary(ab string, maxErrors int) (*dictionary, error) {
 
 // id get ID of the word. Returns 0 if not found
 func (d *dictionary) id(word string) uint32 {
-	d.mtx.RLock()
-	defer d.mtx.RUnlock()
-
 	return d.ids[word]
 }
 
 // has check if the word is present in the dictionary
 func (d *dictionary) has(word string) bool {
-	d.mtx.RLock()
-	defer d.mtx.RUnlock()
-
 	return d.ids[word] > 0
 }
 
 // add puts the word to the dictionary
 func (d *dictionary) add(word string) (uint32, error) {
-	d.mtx.Lock()
-	defer d.mtx.Unlock()
-
 	id := d.nextID()
 	d.ids[word] = id
 
@@ -79,9 +67,6 @@ func (d *dictionary) add(word string) (uint32, error) {
 
 // inc increase word occurence counter
 func (d *dictionary) inc(id uint32) {
-	d.mtx.Lock()
-	defer d.mtx.Unlock()
-
 	_, ok := d.counts[id]
 	if !ok {
 		return
@@ -94,16 +79,13 @@ type match struct {
 	Score float64
 }
 
-func (d *dictionary) Find(word string, n int) []match {
-	d.mtx.RLock()
-	defer d.mtx.RUnlock()
-
+func (d *dictionary) find(word string, n int) []match {
 	if d.maxErrors <= 0 {
 		return nil
 	}
 
 	bm := d.alphabet.encode([]rune(word))
-	candidates := d.getCandidates(word, bm, 1)
+	candidates := d.getCandidates(word, bm)
 	result := calcScores([]rune(word), candidates)
 
 	if len(result) < n {
@@ -119,14 +101,11 @@ type сandidate struct {
 	Count    int
 }
 
-func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt int) []сandidate {
-	checked := make(map[uint64]struct{}, d.alphabet.len()*2)
-
+func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32) []сandidate {
 	result := make([]сandidate, 0, 50)
 
 	// "exact match" OR "candidate has all the same letters as the word but in different order"
 	key := sum(bmSrc)
-	checked[key] = struct{}{}
 	ids := d.index[key]
 	for _, id := range ids {
 		docWord, ok := d.words[id]
@@ -151,7 +130,7 @@ func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt in
 	}
 
 	// @todo perform phonetic analysis with early termination here
-	for bm := range d.computeCandidateBitmaps(word, bmSrc) {
+	for bm := range d.computeCandidateBitmaps(bmSrc) {
 		ids := d.index[bm]
 		for _, id := range ids {
 			docWord, ok := d.words[id]
@@ -174,30 +153,34 @@ func (d *dictionary) getCandidates(word string, bmSrc bitmap.Bitmap32, errCnt in
 	return result
 }
 
-func (d *dictionary) computeCandidateBitmaps(word string, bmSrc bitmap.Bitmap32) map[uint64]struct{} {
+func (d *dictionary) computeCandidateBitmaps(bmSrc bitmap.Bitmap32) map[uint64]struct{} {
 	bitmaps := make(map[uint64]struct{}, d.alphabet.len()*5)
+	bmSrc = bmSrc.Clone()
 
+	var i, j uint32
 	// swap one bit
-	for i := 0; i < d.alphabet.len(); i++ {
-		bit := uint32(i)
-		bmCandidate := bmSrc.Clone()
-		bmCandidate.Xor(bit)
+	for i = 0; i < uint32(d.alphabet.len()); i++ {
+		bmSrc.Xor(i)
 
 		// swap one more bit to be able to fix:
 		// - two deletions ("rang" => "orange")
 		// - replacements ("problam" => "problem")
-		for j := 0; j < d.alphabet.len(); j++ {
-			bit := uint32(j)
-			bmCandidate := bmCandidate.Clone()
-			bmCandidate.Xor(bit)
-			key := sum(bmCandidate)
+		for j = 0; j < uint32(d.alphabet.len()); j++ {
+			if i == j {
+				continue
+			}
+
+			bmSrc.Xor(j)
+			key := sum(bmSrc)
+			bmSrc.Xor(j) // return back the changed bit
 			if len(d.index[key]) == 0 {
 				continue
 			}
 			bitmaps[key] = struct{}{}
 		}
 
-		key := sum(bmCandidate)
+		key := sum(bmSrc)
+		bmSrc.Xor(i) // return back the changed bit
 		if len(d.index[key]) == 0 {
 			continue
 		}
@@ -249,9 +232,6 @@ type dictData struct {
 }
 
 func (d *dictionary) MarshalBinary() ([]byte, error) {
-	d.mtx.Lock()
-	defer d.mtx.Unlock()
-
 	data := &dictData{
 		Alphabet:  d.alphabet,
 		IDs:       d.ids,
@@ -271,9 +251,6 @@ func (d *dictionary) MarshalBinary() ([]byte, error) {
 }
 
 func (d *dictionary) UnmarshalBinary(data []byte) error {
-	d.mtx.Lock()
-	defer d.mtx.Unlock()
-
 	dictData := &dictData{}
 	err := gob.NewDecoder(bytes.NewBuffer(data)).Decode(dictData)
 	if err != nil {

diff --git a/spellchecker.go b/spellchecker.go
@@ -96,9 +96,9 @@ func (s *Spellchecker) Fix(word string) (string, error) {
 		return word, nil
 	}
 
-	hits := s.dict.Find(word, 1)
+	hits := s.dict.find(word, 1)
 	if len(hits) == 0 {
-		return word, fmt.Errorf("%w: %s", ErrUnknownWord, word)
+		return word, ErrUnknownWord
 	}
 
 	return hits[0].Value, nil
@@ -113,9 +113,9 @@ func (s *Spellchecker) Suggest(word string, n int) ([]string, error) {
 		return []string{word}, nil
 	}
 
-	hits := s.dict.Find(word, n)
+	hits := s.dict.find(word, n)
 	if len(hits) == 0 {
-		return []string{word}, fmt.Errorf("%w: %s", ErrUnknownWord, word)
+		return []string{word}, ErrUnknownWord
 	}
 
 	result := make([]string, len(hits))