Skip to content

Commit

Permalink
Merge pull request #12 from f1monkey/feature/bigger-bitmap
Browse files Browse the repository at this point in the history
Feature/bigger bitmap
  • Loading branch information
cyradin authored Feb 9, 2024
2 parents 5fcd7df + b6f7ade commit c92aacd
Show file tree
Hide file tree
Showing 12 changed files with 137 additions and 287 deletions.
31 changes: 17 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@

Yet another spellchecker written in Go.

* [Features](#features)
* [Installation](#installation)
* [Usage](#usage)
* [Benchmarks](#benchmarks)
- [Spellchecker](#spellchecker)
- [Features:](#features)
- [Installation](#installation)
- [Usage](#usage)
- [Benchmarks](#benchmarks)
- [Test set 1:](#test-set-1)
- [Test set 2:](#test-set-2)

## Features:
- very small database: approximately 1 MB for 30,000 unique words
Expand All @@ -23,10 +26,10 @@ $ go get -v github.com/f1monkey/spellchecker
```go
func main() {
// Create new instance
sc, err := spellchecker.New(spellchecker.Alphabet{
Letters: "abcdefghijklmnopqrstuvwxyz1234567890",
Length: 36,
}, spellchecker.WithMaxErrors(2))
sc, err := spellchecker.New(
"abcdefghijklmnopqrstuvwxyz1234567890", // allowed symbols; any other symbols will be ignored
spellchecker.WithMaxErrors(2),
)
if err != nil {
panic(err)
}
Expand Down Expand Up @@ -90,10 +93,10 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
goos: linux
goarch: amd64
pkg: github.com/f1monkey/spellchecker
cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz
Benchmark_Norvig1-12 100 10721930 ns/op 74.07 success_percent 200.0 success_words 270.0 total_words 1085913 B/op 2063 allocs/op
cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics
Benchmark_Norvig1-16 242 4861057 ns/op 74.07 success_percent 200.0 success_words 270.0 total_words 1643485 B/op 88241 allocs/op
PASS
ok github.com/f1monkey/spellchecker 1.910s
ok github.com/f1monkey/spellchecker 3.343s
```

#### [Test set 2](http://norvig.com/spell-testset2.txt):
Expand All @@ -104,8 +107,8 @@ Running tool: /usr/local/go/bin/go test -benchmem -run=^$ -bench ^Benchmark_Norv
goos: linux
goarch: amd64
pkg: github.com/f1monkey/spellchecker
cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz
Benchmark_Norvig2-12 72 13977916 ns/op 70.00 success_percent 280.0 success_words 400.0 total_words 1573316 B/op 3050 allocs/op
cpu: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics
Benchmark_Norvig2-16 150 7226006 ns/op 70.00 success_percent 280.0 success_words 400.0 total_words 2389231 B/op 129486 allocs/op
PASS
ok github.com/f1monkey/spellchecker 1.874s
ok github.com/f1monkey/spellchecker 3.244s
```
41 changes: 13 additions & 28 deletions alphabet.go
Original file line number Diff line number Diff line change
@@ -1,53 +1,38 @@
package spellchecker

import "fmt"

type Alphabet struct {
// Letters to use in alphabet. Duplicates are not allowed
Letters string
// Length bit count to encode alphabet
// If it is less than rune count in letters then
// several letters will be encoded as one bit.
// It slightly decreases database size
// but drastically reduces search performance in large dictionaries
Length int
}
import (
"fmt"

var DefaultAlphabet = Alphabet{
Letters: "abcdefghijklmnopqrstuvwxyz",
Length: 26,
}
"github.com/f1monkey/bitmap"
)

const DefaultAlphabet = "abcdefghijklmnopqrstuvwxyz"

type alphabet map[rune]uint32

// newAlphabet creates a new alphabet instance
func newAlphabet(str string, length int) (alphabet, error) {
func newAlphabet(str string) (alphabet, error) {
runes := []rune(str)
if len(runes) == 0 {
return nil, fmt.Errorf("unable to use empty string as alphabet")
}

if length > 63 {
return nil, fmt.Errorf("alphabets longer than 63 are not supported (yet?)")
return nil, fmt.Errorf("unable to use empty string as an alphabet")
}

result := make(alphabet, length)
result := make(alphabet, len(runes))
for i, s := range runes {
index := i % length
if _, ok := result[s]; ok {
return nil, fmt.Errorf("duplicate symbol %q at position %d", s, i)
}
result[s] = uint32(index)
result[s] = uint32(i)
}

return result, nil
}

func (a alphabet) encode(word []rune) bitmap {
var b bitmap
func (a alphabet) encode(word []rune) bitmap.Bitmap32 {
var b bitmap.Bitmap32
for _, letter := range word {
if index, ok := a[letter]; ok {
b.or(index)
b.Set(index)
}
}

Expand Down
17 changes: 6 additions & 11 deletions alphabet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,35 @@ package spellchecker
import (
"testing"

"github.com/f1monkey/bitmap"
"github.com/stretchr/testify/require"
)

func Test_newAlphabet(t *testing.T) {
t.Run("must not allow an empty string to be the alphabet", func(t *testing.T) {
result, err := newAlphabet("", 3)
result, err := newAlphabet("")
require.Error(t, err)
require.Nil(t, result)
})

t.Run("must create a valid map from the string", func(t *testing.T) {
result, err := newAlphabet("abc", 3)
result, err := newAlphabet("abc")
require.NoError(t, err)
require.Equal(t, result, alphabet{'a': 0, 'b': 1, 'c': 2})
})

t.Run("must return error if alphabet length is greater than max", func(t *testing.T) {
result, err := newAlphabet("abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя01234", 64)
require.Error(t, err)
require.Nil(t, result)
})

t.Run("must not allow duplicate symbols in alphabet", func(t *testing.T) {
result, err := newAlphabet("abb", 3)
result, err := newAlphabet("abb")
require.Error(t, err)
require.Nil(t, result)
})
}

func Test_alphabet_encode(t *testing.T) {
ab, err := newAlphabet("abcd", 4)
ab, err := newAlphabet("abcd")
require.NoError(t, err)

word := []rune("aab")
result := ab.encode(word)
require.Equal(t, bitmap(3), result)
require.Equal(t, bitmap.Bitmap32{3}, result)
}
29 changes: 0 additions & 29 deletions bitmap.go

This file was deleted.

113 changes: 0 additions & 113 deletions bitmap_test.go

This file was deleted.

Loading

0 comments on commit c92aacd

Please sign in to comment.