Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
cyradin committed Nov 9, 2022
0 parents commit bf2c409
Show file tree
Hide file tree
Showing 6 changed files with 284 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.idea
.vscode

/vendor/

__debug_bin
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ngrams

N-gram generation library written in Go.

## Install

```bash
$ go get -v github.com/cyradin/ngrams
```

## Usage

```go
result, err := ngrams.From("orange", 3)
fmt.Println(result, err) // [ora ran ang nge] <nil>

result, err = ngrams.MakeRange("word", 2, 3)
fmt.Println(result, err) // [wo or rd wor ord] <nil>
```


## Benchmark

```
goos: linux
goarch: amd64
pkg: github.com/cyradin/ngrams
cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz
Benchmark_MakeRange-12 493305 2446 ns/op 1440 B/op 31 allocs/op
Benchmark_From_6_3-12 2563634 456.1 ns/op 224 B/op 6 allocs/op
Benchmark_FromRunes_6_3-12 3208182 413.3 ns/op 224 B/op 6 allocs/op
PASS
coverage: 80.9% of statements
ok github.com/cyradin/ngrams 4.593s
```
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/cyradin/ngrams

go 1.12

require github.com/stretchr/testify v1.8.1
17 changes: 17 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
101 changes: 101 additions & 0 deletions ngrams.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package ngrams

import (
"fmt"
"strings"
)

// ErrParam is the base error wrapped (via %w) by the parameter-validation
// errors that MakeRange and FromRunes return. Its text is only a prefix; the
// wrapping error appends which parameter was rejected and why. Callers can
// detect any such failure with errors.Is(err, ErrParam).
var ErrParam = fmt.Errorf("invalid value of")

// MakeRange generates the n-grams of word for every length n in [min, max].
//
// The word is processed rune by rune, so multi-byte characters count as one
// position. N-grams are grouped by length in ascending order (all n-grams of
// length min first), and within a group they are ordered by start position.
// Lengths greater than the number of runes in word contribute nothing.
//
// It returns an error wrapping ErrParam when min < 1, max < 1 or min > max,
// and a nil slice with a nil error when word is empty.
func MakeRange(word string, min, max int) ([]string, error) {
	if min < 1 {
		return nil, fmt.Errorf("%w min: cannot be < 1", ErrParam)
	}
	if max < 1 {
		return nil, fmt.Errorf("%w max: cannot be < 1", ErrParam)
	}
	if min > max {
		return nil, fmt.Errorf("%w min: cannot be > max", ErrParam)
	}

	if word == "" {
		return nil, nil
	}

	runes := []rune(word)
	wordLen := len(runes)
	// An n-gram cannot be longer than the word, so clamp the upper bound.
	maxN := minOf(wordLen, max)
	// Pre-size the result so the appends below never reallocate.
	result := make([]string, 0, ngramCnt(wordLen, min, maxN))
	for n := min; n <= maxN; n++ {
		items, err := FromRunes(runes, n)
		if err != nil {
			return nil, err
		}
		result = append(result, items...)
	}

	return result, nil
}

// From splits word into runes and returns its n-grams of length n.
// Validation and the actual windowing are delegated to FromRunes, so the
// result and error are exactly what FromRunes reports for the decoded runes.
func From(word string, n int) ([]string, error) {
	runes := []rune(word)
	return FromRunes(runes, n)
}

// FromRunes generates the n-grams of length n from a rune slice.
//
// The result contains every window of n consecutive runes, ordered by start
// position. It returns a nil slice with a nil error when n is greater than
// len(runes), and an error wrapping ErrParam when n < 1.
func FromRunes(runes []rune, n int) ([]string, error) {
	if n < 1 {
		return nil, fmt.Errorf("%w n: cannot be < 1", ErrParam)
	}

	if n > len(runes) {
		return nil, nil
	}

	if n == len(runes) {
		// The whole input is the only n-gram; no windowing needed.
		return []string{string(runes)}, nil
	}

	result := make([]string, ngramCnt(len(runes), n, n))

	// A single Builder is reused for every window and is never copied:
	// the strings package forbids copying a non-zero Builder.
	var b strings.Builder
	for i := range result {
		// Reset detaches the buffer owned by the previously returned string,
		// so every n-gram ends up with its own storage.
		b.Reset()
		// Each rune needs at least one byte; multi-byte runes grow further.
		b.Grow(n)
		for _, r := range runes[i : i+n] {
			b.WriteRune(r)
		}
		result[i] = b.String()
	}

	return result, nil
}

// ngramCnt returns how many n-grams a sequence of l items yields when every
// length from min to max (inclusive) is generated: a sequence of l items has
// l-size+1 windows of a given size. The total is 0 when min > max.
func ngramCnt(l, min, max int) int {
	total := 0
	for size := min; size <= max; size++ {
		total += l + 1 - size
	}
	return total
}

// minOf returns the smaller of a and b.
func minOf(a, b int) int {
	if b < a {
		return b
	}
	return a
}
120 changes: 120 additions & 0 deletions ngrams_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package ngrams

import (
"testing"

"github.com/stretchr/testify/require"
)

// Benchmark_MakeRange measures generating all n-grams of lengths 1..5 for a
// six-letter word.
func Benchmark_MakeRange(b *testing.B) {
	const word = "orange"
	for iter := 0; iter < b.N; iter++ {
		// Only the cost of the call is measured; the result is not inspected.
		_, _ = MakeRange(word, 1, 5)
	}
}

// Test_MakeRange covers parameter validation, the empty-word shortcut and the
// ordering of the n-grams that MakeRange produces.
func Test_MakeRange(t *testing.T) {
	// Invalid (min, max) combinations must yield an error and no result.
	invalid := []struct {
		name     string
		min, max int
	}{
		{name: "must return error if min < 1", min: 0, max: 5},
		{name: "must return error if max < 1", min: 5, max: 0},
		{name: "must return error if min > max", min: 2, max: 1},
	}
	for _, tc := range invalid {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, err := MakeRange("word", tc.min, tc.max)
			require.Error(t, err)
			require.Nil(t, got)
		})
	}

	t.Run("must return nil for an empty word", func(t *testing.T) {
		got, err := MakeRange("", 1, 5)
		require.NoError(t, err)
		require.Nil(t, got)
	})

	t.Run("must return valid ngrams", func(t *testing.T) {
		valid := []struct {
			name     string
			min, max int
			want     []string
		}{
			{
				name: "min=3,max=3",
				min:  3,
				max:  3,
				want: []string{"ora", "ran", "ang", "nge"},
			},
			{
				name: "min=1,max=5",
				min:  1,
				max:  5,
				want: []string{
					"o", "r", "a", "n", "g", "e",
					"or", "ra", "an", "ng", "ge",
					"ora", "ran", "ang", "nge",
					"oran", "rang", "ange",
					"orang", "range",
				},
			},
		}
		for _, tc := range valid {
			tc := tc
			t.Run(tc.name, func(t *testing.T) {
				got, err := MakeRange("orange", tc.min, tc.max)
				require.NoError(t, err)
				require.Equal(t, tc.want, got)
			})
		}
	})
}

// Benchmark_From_6_3 measures From on a six-letter word with n=3, including
// the string-to-rune conversion done on every call.
func Benchmark_From_6_3(b *testing.B) {
	const word = "qwerty"
	for iter := 0; iter < b.N; iter++ {
		// Only the cost of the call is measured; the result is not inspected.
		_, _ = From(word, 3)
	}
}

// Benchmark_FromRunes_6_3 measures FromRunes on six runes with n=3. The rune
// slice is built once, before the loop, so the conversion is not part of the
// repeated work.
func Benchmark_FromRunes_6_3(b *testing.B) {
	input := []rune("qwerty")
	for iter := 0; iter < b.N; iter++ {
		// Only the cost of the call is measured; the result is not inspected.
		_, _ = FromRunes(input, 3)
	}
}

// Test_From covers validation of n, the n > len and n == len shortcuts, and
// the generated n-grams for ASCII and multi-byte (Cyrillic) input.
func Test_From(t *testing.T) {
	t.Run("must return error if n < 1", func(t *testing.T) {
		got, err := From("qwe", 0)
		require.Error(t, err)
		require.Nil(t, got)
	})

	t.Run("must return nil if n > word len", func(t *testing.T) {
		got, err := From("qwe", 5)
		require.NoError(t, err)
		require.Nil(t, got)
	})

	t.Run("must return word if n == word len", func(t *testing.T) {
		got, err := From("qwe", 3)
		require.NoError(t, err)
		require.Equal(t, []string{"qwe"}, got)
	})

	t.Run("must return correct set of ngrams", func(t *testing.T) {
		type ngramCase struct {
			name string
			word string
			n    int
			want []string
		}
		// Cases are grouped by the language of the input word; the group
		// name becomes the intermediate subtest name.
		groups := []struct {
			lang  string
			cases []ngramCase
		}{
			{
				lang: "en",
				cases: []ngramCase{
					{name: "len=4, n=3", word: "word", n: 3, want: []string{"wor", "ord"}},
					{name: "len=4, n=2", word: "word", n: 2, want: []string{"wo", "or", "rd"}},
					{name: "len=6, n=3", word: "orange", n: 3, want: []string{"ora", "ran", "ang", "nge"}},
				},
			},
			{
				lang: "ru",
				cases: []ngramCase{
					{name: "len=6, n=3", word: "яблоко", n: 3, want: []string{"ябл", "бло", "лок", "око"}},
				},
			},
		}

		for _, group := range groups {
			group := group
			t.Run(group.lang, func(t *testing.T) {
				for _, tc := range group.cases {
					tc := tc
					t.Run(tc.name, func(t *testing.T) {
						got, err := From(tc.word, tc.n)
						require.NoError(t, err)
						require.Equal(t, tc.want, got)
					})
				}
			})
		}
	})
}

0 comments on commit bf2c409

Please sign in to comment.