-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit bf2c409
Showing
6 changed files
with
284 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.idea | ||
.vscode | ||
|
||
/vendor/ | ||
|
||
__debug_bin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# ngrams | ||
|
||
N-gram generation library written in Go | ||
|
||
## Install | ||
|
||
```bash | ||
$ go get -v github.com/cyradin/ngrams | ||
``` | ||
|
||
## Usage | ||
|
||
```go | ||
result, err := ngrams.From("orange", 3) | ||
fmt.Println(result, err) // [ora ran ang nge] <nil> | ||
|
||
result, err = ngrams.MakeRange("word", 2, 3) | ||
fmt.Println(result, err) // [wo or rd wor ord] <nil> | ||
``` | ||
|
||
|
||
## Benchmark | ||
|
||
``` | ||
goos: linux | ||
goarch: amd64 | ||
pkg: github.com/cyradin/ngrams | ||
cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz | ||
Benchmark_MakeRange-12 493305 2446 ns/op 1440 B/op 31 allocs/op | ||
Benchmark_From_6_3-12 2563634 456.1 ns/op 224 B/op 6 allocs/op | ||
Benchmark_FromRunes_6_3-12 3208182 413.3 ns/op 224 B/op 6 allocs/op | ||
PASS | ||
coverage: 80.9% of statements | ||
ok github.com/cyradin/ngrams 4.593s | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
module github.com/cyradin/ngrams | ||
|
||
go 1.12 | ||
|
||
require github.com/stretchr/testify v1.8.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= | ||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | ||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= | ||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= | ||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= | ||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= | ||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= | ||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= | ||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= | ||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package ngrams | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
) | ||
|
||
// ErrParam is the sentinel error wrapped (via %w) by the argument-validation
// errors that MakeRange and FromRunes return, so callers can detect them with
// errors.Is(err, ErrParam). Its text is a prefix that the wrapping error
// completes with the details of the offending argument.
var ErrParam = fmt.Errorf("invalid value of") | ||
|
||
// MakeRange generates ngrams with len=min..max | ||
func MakeRange(word string, min, max int) ([]string, error) { | ||
if min < 1 { | ||
return nil, fmt.Errorf("%w min: cannot be < 1", ErrParam) | ||
} | ||
if max < 1 { | ||
return nil, fmt.Errorf("%w min: cannot be < 1", ErrParam) | ||
} | ||
if min > max { | ||
return nil, fmt.Errorf("%w min: cannot be > max", ErrParam) | ||
} | ||
|
||
if word == "" { | ||
return nil, nil | ||
} | ||
|
||
runes := []rune(word) | ||
wordLen := len(runes) | ||
maxN := minOf(wordLen, max) | ||
result := make([]string, 0, ngramCnt(len(runes), min, maxN)) | ||
for n := min; n <= maxN; n++ { | ||
items, err := FromRunes(runes, n) | ||
if err != nil { | ||
return nil, err | ||
} | ||
result = append(result, items...) | ||
} | ||
|
||
return result, nil | ||
} | ||
|
||
// From generates ngrams from a string with len=n | ||
func From(word string, n int) ([]string, error) { | ||
return FromRunes([]rune(word), n) | ||
} | ||
|
||
// FromRunes generates ngrams from a rune slice with len=n | ||
func FromRunes(runes []rune, n int) ([]string, error) { | ||
if n < 1 { | ||
return nil, fmt.Errorf("%w: n cannot be < 1", ErrParam) | ||
} | ||
|
||
if n > len(runes) { | ||
return nil, nil | ||
} | ||
|
||
if n == len(runes) { | ||
return []string{string(runes)}, nil | ||
} | ||
|
||
cnt := ngramCnt(len(runes), n, n) | ||
builders := make([]strings.Builder, cnt) | ||
|
||
for i, r := range runes { | ||
for j := i - n + 1; j <= i; j++ { | ||
if j < 0 { | ||
continue | ||
} | ||
if j >= cnt { | ||
break | ||
} | ||
|
||
builders[j].WriteRune(r) | ||
} | ||
} | ||
|
||
result := make([]string, cnt) | ||
for i, b := range builders { | ||
result[i] = b.String() | ||
} | ||
|
||
return result, nil | ||
} | ||
|
||
// ngramCnt reports how many n-grams a sequence of l items yields when every
// length from min to max (inclusive) is generated. A length n contributes
// l-n+1 n-grams; lengths greater than l contribute none, so the result is
// never negative for min >= 1 and is safe to use as a slice length or
// capacity. It returns 0 when min > max.
func ngramCnt(l, min, max int) int {
	// Lengths beyond l cannot produce any n-gram; ignoring them keeps the
	// total from being dragged to zero or below.
	if max > l {
		max = l
	}

	cnt := 0
	for n := min; n <= max; n++ {
		cnt += l - n + 1
	}
	return cnt
}
|
||
// minOf returns the smaller of a and b (either one when they are equal).
func minOf(a, b int) int {
	smaller := a
	if b < smaller {
		smaller = b
	}
	return smaller
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package ngrams | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func Benchmark_MakeRange(b *testing.B) { | ||
for i := 0; i < b.N; i++ { | ||
MakeRange("orange", 1, 5) | ||
} | ||
} | ||
|
||
func Test_MakeRange(t *testing.T) { | ||
t.Run("must return error if min < 1", func(t *testing.T) { | ||
result, err := MakeRange("word", 0, 5) | ||
require.Error(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return error if max < 1", func(t *testing.T) { | ||
result, err := MakeRange("word", 5, 0) | ||
require.Error(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return error if min > max", func(t *testing.T) { | ||
result, err := MakeRange("word", 2, 1) | ||
require.Error(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return nil for an empty word", func(t *testing.T) { | ||
result, err := MakeRange("", 1, 5) | ||
require.NoError(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return valid ngrams", func(t *testing.T) { | ||
t.Run("min=3,max=3", func(t *testing.T) { | ||
result, err := MakeRange("orange", 3, 3) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"ora", "ran", "ang", "nge"}, result) | ||
}) | ||
|
||
t.Run("min=1,max=5", func(t *testing.T) { | ||
result, err := MakeRange("orange", 1, 5) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{ | ||
"o", "r", "a", "n", "g", "e", | ||
"or", "ra", "an", "ng", "ge", | ||
"ora", "ran", "ang", "nge", | ||
"oran", "rang", "ange", | ||
"orang", "range", | ||
}, result) | ||
}) | ||
}) | ||
} | ||
|
||
func Benchmark_From_6_3(b *testing.B) { | ||
for i := 0; i < b.N; i++ { | ||
From("qwerty", 3) | ||
} | ||
} | ||
|
||
func Benchmark_FromRunes_6_3(b *testing.B) { | ||
runes := []rune("qwerty") | ||
for i := 0; i < b.N; i++ { | ||
FromRunes(runes, 3) | ||
} | ||
} | ||
|
||
func Test_From(t *testing.T) { | ||
t.Run("must return error if n < 1", func(t *testing.T) { | ||
result, err := From("qwe", 0) | ||
require.Error(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return nil if n > word len", func(t *testing.T) { | ||
result, err := From("qwe", 5) | ||
require.NoError(t, err) | ||
require.Nil(t, result) | ||
}) | ||
|
||
t.Run("must return word if n == word len", func(t *testing.T) { | ||
result, err := From("qwe", 3) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"qwe"}, result) | ||
}) | ||
|
||
t.Run("must return correct set of ngrams", func(t *testing.T) { | ||
t.Run("en", func(t *testing.T) { | ||
t.Run("len=4, n=3", func(t *testing.T) { | ||
result, err := From("word", 3) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"wor", "ord"}, result) | ||
}) | ||
t.Run("len=4, n=2", func(t *testing.T) { | ||
result, err := From("word", 2) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"wo", "or", "rd"}, result) | ||
}) | ||
t.Run("len=6, n=3", func(t *testing.T) { | ||
result, err := From("orange", 3) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"ora", "ran", "ang", "nge"}, result) | ||
}) | ||
}) | ||
|
||
t.Run("ru", func(t *testing.T) { | ||
t.Run("len=6, n=3", func(t *testing.T) { | ||
result, err := From("яблоко", 3) | ||
require.NoError(t, err) | ||
require.Equal(t, []string{"ябл", "бло", "лок", "око"}, result) | ||
}) | ||
}) | ||
}) | ||
} |