diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b218e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea +.vscode + +/vendor/ + +__debug_bin diff --git a/README.md b/README.md new file mode 100644 index 0000000..fc35ec7 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# ngrams + +N-grams generation library written in Go + +## Install + +```bash +$ go get -v github.com/cyradin/ngrams +``` + +## Usage + +```go + result, _ := ngrams.From("orange", 3) + fmt.Println(result) // [ora ran ang nge] + + result, _ = ngrams.MakeRange("word", 2, 3) + fmt.Println(result) // [wo or rd wor ord] +``` + + +## Benchmark + +``` +goos: linux +goarch: amd64 +pkg: github.com/cyradin/ngrams +cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Benchmark_MakeRange-12 493305 2446 ns/op 1440 B/op 31 allocs/op +Benchmark_From_6_3-12 2563634 456.1 ns/op 224 B/op 6 allocs/op +Benchmark_FromRunes_6_3-12 3208182 413.3 ns/op 224 B/op 6 allocs/op +PASS +coverage: 80.9% of statements +ok github.com/cyradin/ngrams 4.593s +``` \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..a1554fa --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/cyradin/ngrams + +go 1.12 + +require github.com/stretchr/testify v1.8.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..2ec90f7 --- /dev/null +++ b/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod
// ErrParam is the sentinel error wrapped (via %w) by every argument-validation
// error this package returns, so callers can detect it with errors.Is.
var ErrParam = fmt.Errorf("invalid value of")

// MakeRange generates the n-grams of word for every length n in min..max,
// ordered by increasing n and, within one n, by position in the word.
//
// Lengths are counted in runes, not bytes. Lengths greater than the number of
// runes in word are skipped. It returns (nil, nil) when word has fewer than
// min runes (which includes the empty word), and an error wrapping ErrParam
// when min < 1, max < 1 or min > max.
func MakeRange(word string, min, max int) ([]string, error) {
	if min < 1 {
		return nil, fmt.Errorf("%w min: cannot be < 1", ErrParam)
	}
	if max < 1 {
		return nil, fmt.Errorf("%w max: cannot be < 1", ErrParam)
	}
	if min > max {
		return nil, fmt.Errorf("%w min: cannot be > max", ErrParam)
	}

	runes := []rune(word)
	// min >= 1 here, so this also covers the empty word.
	if len(runes) < min {
		return nil, nil
	}

	// n-grams longer than the word do not exist; clamp the upper bound.
	maxN := minOf(len(runes), max)
	result := make([]string, 0, ngramCnt(len(runes), min, maxN))
	for n := min; n <= maxN; n++ {
		result = appendNgrams(result, runes, n)
	}

	return result, nil
}

// From generates the n-grams of length n (in runes) from word.
// See FromRunes for the exact contract.
func From(word string, n int) ([]string, error) {
	return FromRunes([]rune(word), n)
}

// FromRunes generates the n-grams of length n from runes, in order of their
// starting position.
//
// It returns (nil, nil) when n is greater than len(runes), and an error
// wrapping ErrParam when n < 1.
func FromRunes(runes []rune, n int) ([]string, error) {
	if n < 1 {
		return nil, fmt.Errorf("%w n: cannot be < 1", ErrParam)
	}
	if n > len(runes) {
		return nil, nil
	}

	result := make([]string, 0, ngramCnt(len(runes), n, n))
	return appendNgrams(result, runes, n), nil
}

// appendNgrams appends every window of n consecutive runes to dst, as a
// string, and returns the extended slice. Callers guarantee n >= 1.
func appendNgrams(dst []string, runes []rune, n int) []string {
	for start := 0; start+n <= len(runes); start++ {
		var b strings.Builder
		// n is a lower bound on the byte length; multi-byte runes grow it further.
		b.Grow(n)
		for _, r := range runes[start : start+n] {
			b.WriteRune(r)
		}
		dst = append(dst, b.String())
	}
	return dst
}

// ngramCnt returns how many n-grams a sequence of l runes has in total for
// all lengths n in min..max. Callers are expected to pass max <= l; an empty
// range (min > max) yields 0.
func ngramCnt(l, min, max int) int {
	cnt := 0
	for n := min; n <= max; n++ {
		cnt += l - n + 1
	}
	return cnt
}

// minOf returns the smaller of a and b.
func minOf(a, b int) int {
	if a > b {
		return b
	}
	return a
}
result, err := MakeRange("", 1, 5) + require.NoError(t, err) + require.Nil(t, result) + }) + + t.Run("must return valid ngrams", func(t *testing.T) { + t.Run("min=3,max=3", func(t *testing.T) { + result, err := MakeRange("orange", 3, 3) + require.NoError(t, err) + require.Equal(t, []string{"ora", "ran", "ang", "nge"}, result) + }) + + t.Run("min=1,max=5", func(t *testing.T) { + result, err := MakeRange("orange", 1, 5) + require.NoError(t, err) + require.Equal(t, []string{ + "o", "r", "a", "n", "g", "e", + "or", "ra", "an", "ng", "ge", + "ora", "ran", "ang", "nge", + "oran", "rang", "ange", + "orang", "range", + }, result) + }) + }) +} + +func Benchmark_From_6_3(b *testing.B) { + for i := 0; i < b.N; i++ { + From("qwerty", 3) + } +} + +func Benchmark_FromRunes_6_3(b *testing.B) { + runes := []rune("qwerty") + for i := 0; i < b.N; i++ { + FromRunes(runes, 3) + } +} + +func Test_From(t *testing.T) { + t.Run("must return error if n < 1", func(t *testing.T) { + result, err := From("qwe", 0) + require.Error(t, err) + require.Nil(t, result) + }) + + t.Run("must return nil if n > word len", func(t *testing.T) { + result, err := From("qwe", 5) + require.NoError(t, err) + require.Nil(t, result) + }) + + t.Run("must return word if n == word len", func(t *testing.T) { + result, err := From("qwe", 3) + require.NoError(t, err) + require.Equal(t, []string{"qwe"}, result) + }) + + t.Run("must return correct set of ngrams", func(t *testing.T) { + t.Run("en", func(t *testing.T) { + t.Run("len=4, n=3", func(t *testing.T) { + result, err := From("word", 3) + require.NoError(t, err) + require.Equal(t, []string{"wor", "ord"}, result) + }) + t.Run("len=4, n=2", func(t *testing.T) { + result, err := From("word", 2) + require.NoError(t, err) + require.Equal(t, []string{"wo", "or", "rd"}, result) + }) + t.Run("len=6, n=3", func(t *testing.T) { + result, err := From("orange", 3) + require.NoError(t, err) + require.Equal(t, []string{"ora", "ran", "ang", "nge"}, result) + }) + }) + + 
t.Run("ru", func(t *testing.T) { + t.Run("len=6, n=3", func(t *testing.T) { + result, err := From("яблоко", 3) + require.NoError(t, err) + require.Equal(t, []string{"ябл", "бло", "лок", "око"}, result) + }) + }) + }) +}