Skip to content

Commit

Permalink
feat: prebuilt gob-files for faster loading (#12)
Browse files Browse the repository at this point in the history
* tmp: ac[string][]index -> []string

* feat: ac[string]int -> [][]string

* tmp: using ahocorasick lookup

* tried using openacid/slim

* feat: added dghubble to the mix

* feat: scrapped any notion of a trie, but made a prebuilt datastructure instead

* fix: removed commented out package
  • Loading branch information
aaaton authored May 28, 2019
1 parent 64c3afa commit dd436e8
Show file tree
Hide file tree
Showing 29 changed files with 240 additions and 1,279 deletions.
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dicts/it/data.go filter=lfs diff=lfs merge=lfs -text
dicts/sv/data.go filter=lfs diff=lfs merge=lfs -text
dicts/de/data.go filter=lfs diff=lfs merge=lfs -text
dicts/en/data.go filter=lfs diff=lfs merge=lfs -text
dicts/es/data.go filter=lfs diff=lfs merge=lfs -text
dicts/fr/data.go filter=lfs diff=lfs merge=lfs -text
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
data

vendor
.vscode
# Testing and benchmarks
*.out
*.test
Expand Down
56 changes: 29 additions & 27 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,40 @@ SHELL:=/bin/bash
default: all
LANG=en
all:
go get -u github.com/jteeuwen/go-bindata/...
# go get -u github.com/jteeuwen/go-bindata/...
mkdir -p data
$(MAKE) en
$(MAKE) sv
$(MAKE) fr
$(MAKE) es
$(MAKE) de
$(MAKE) it
$(MAKE) en sv fr es de it

package-all:
$(MAKE) LANG=en package
$(MAKE) LANG=sv package
$(MAKE) LANG=fr package
$(MAKE) LANG=es package
$(MAKE) LANG=de package
$(MAKE) LANG=it package


en: LANG=en
en: download

sv: LANG=sv
sv: download

fr: LANG=fr
fr: download

es: LANG=es
es: download

de: LANG=de
de: download

it: LANG=it
it: download
en:
$(MAKE) LANG=en download package
sv:
$(MAKE) LANG=sv download package
fr:
$(MAKE) LANG=fr download package
es:
$(MAKE) LANG=es download package
de:
$(MAKE) LANG=de download package
it:
$(MAKE) LANG=it download package

download:
curl https://raw.githubusercontent.com/michmech/lemmatization-lists/master/lemmatization-$(LANG).txt > data/$(LANG)
go-bindata -o dicts/$(LANG)/$(LANG).go -pkg $(LANG) data/$(LANG)
go run dicts/cmd/generate_pack.go -locale $(LANG) > dicts/$(LANG)/pack.go

package:
# Packaging $(LANG)
go run dicts/cmd/gobify/gobify.go data/$(LANG) data/$(LANG).gob
go-bindata -o dicts/$(LANG)/data.go -pkg $(LANG) data/$(LANG).gob
go run dicts/cmd/genpack/genpack.go -locale $(LANG) > dicts/$(LANG)/pack.go
# ----------------

benchcmp:
# ensure no governor weirdness
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# GoLem

This project is a dictionary-based lemmatizer written in pure Go, without external dependencies.

### What?

A [lemmatizer](https://en.wikipedia.org/wiki/Lemmatisation) is a tool that finds the base form of words.

| Lang | Input | Output |
Expand All @@ -13,9 +15,11 @@ A [lemmatizer](https://en.wikipedia.org/wiki/Lemmatisation) is a tool that finds
It's based on the dictionaries found on [michmech/lemmatization-lists](https://github.com/michmech/lemmatization-lists), which are available under the [Open Database License](https://opendatacommons.org/licenses/odbl/summary/). This project would not be feasible without them.

### Languages

At the moment golem supports English, Swedish, French, Spanish, Italian & German, but adding another language should be no more trouble than getting the dictionary for that language; some of these are already available on lexiconista. Please let me know if there is something you would like to see in here, or fork the project and create a pull request.

### Basic usage

```golang
package main

Expand All @@ -27,7 +31,7 @@ import (
func main() {
// the language packages are available under golem/dicts
// "en" is for english
lemmatizer, err := golem.New(en.NewPackage())
lemmatizer, err := golem.New(en.New())
if err != nil {
panic(err)
}
Expand Down
6 changes: 4 additions & 2 deletions dicts/cmd/generate_pack.go → dicts/cmd/genpack/genpack.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,22 @@ func main() {
var packTemplate = ` // Code generated by golem/dicts/cmd/generate_pack.go DO NOT EDIT
package {{.Locale}}
import "github.com/aaaton/golem/dicts"
const locale = "{{.Locale}}"
// LanguagePack is an implementation of the generic golem.LanguagePack interface for {{.Locale}}
type LanguagePack struct {
}
// NewPackage creates a language pack
func NewPackage() *LanguagePack {
func New() dicts.LanguagePack {
return &LanguagePack{}
}
// GetResource returns the dictionary of lemmatized words
func (l *LanguagePack) GetResource() ([]byte, error) {
return Asset("data/" + locale)
return Asset("data/" + locale + ".gob")
}
// GetLocale returns the language name
Expand Down
107 changes: 107 additions & 0 deletions dicts/cmd/gobify/gobify.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package main

import (
"encoding/gob"
"fmt"
"io/ioutil"
"os"
"sort"
"strings"
)

// localStorage is the structure that main gob-encodes into the output file.
type localStorage struct {
// Lookup maps every lowercased word (inflected forms as well as the base
// forms themselves) to an index into Words.
Lookup map[string]int
// Words holds the distinct, sorted lists of base forms. Words whose lists
// are identical share a single entry.
Words [][]string
}

func main() {
if len(os.Args) != 3 {
fmt.Println("usage: cedar_creator [input] [output]")
os.Exit(1)
}
inName, outName := os.Args[1], os.Args[2]
f, err := os.Open(inName)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
defer f.Close()
b, err := ioutil.ReadAll(f)
if err != nil {
fmt.Println(err)
os.Exit(1)
}

ls := &localStorage{}
m := make(map[string][]string)
for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
parts := strings.Split(strings.TrimSpace(line), "\t")
if len(parts) == 2 {
base := strings.ToLower(parts[0])
form := strings.ToLower(parts[1])
add(m, form, base)
add(m, base, base)
} else {
fmt.Printf("the line >%s< is odd\n", line)
}
}
joined2Index := make(map[string]int)
var forms []string
ls.Lookup = make(map[string]int)
for k, v := range m {
lookup := strings.Join(v, "|")
index, ok := joined2Index[lookup]
if !ok {
index = len(ls.Words)
joined2Index[lookup] = index
ls.Words = append(ls.Words, v)
}
forms = append(forms, k)
ls.Lookup[k] = index
}

count := 0.0
for _, form := range forms {
if _, found := ls.Lookup[form]; !found {
count++
}
}
if count > 0 {
fmt.Printf("Couldn't find %f%% of the keys entered\n", count/float64(len(forms))*100)
os.Exit(1)
}

f, err = os.Create(outName)
if err != nil {
panic(err)
}
defer f.Close()
err = gob.NewEncoder(f).Encode(ls)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
fmt.Println("Words in dict:", len(forms))
fmt.Println("Saved to", outName, "and all is good")
}

// add registers value under key in m. A key seen for the first time gets a
// one-element list; otherwise value is inserted only if it is not already
// listed, and the list is re-sorted after the insertion.
func add(m map[string][]string, key, value string) {
	existing, present := m[key]
	if !present {
		m[key] = []string{value}
		return
	}
	if contains(existing, value) {
		return
	}
	existing = append(existing, value)
	sort.Strings(existing)
	m[key] = existing
}

// contains reports whether value is one of the elements of values.
func contains(values []string, value string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == value {
			return true
		}
	}
	return false
}
3 changes: 3 additions & 0 deletions dicts/de/data.go
Git LFS file not shown
237 changes: 0 additions & 237 deletions dicts/de/de.go

This file was deleted.

6 changes: 4 additions & 2 deletions dicts/de/pack.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
// Code generated by golem/dicts/cmd/generate_pack.go DO NOT EDIT
package de

import "github.com/aaaton/golem/dicts"

const locale = "de"

// LanguagePack is an implementation of the generic golem.LanguagePack interface for de
type LanguagePack struct {
}

// NewPackage creates a language pack
func NewPackage() *LanguagePack {
func New() dicts.LanguagePack {
return &LanguagePack{}
}

// GetResource returns the dictionary of lemmatized words
func (l *LanguagePack) GetResource() ([]byte, error) {
return Asset("data/" + locale)
return Asset("data/" + locale + ".gob")
}

// GetLocale returns the language name
Expand Down
7 changes: 7 additions & 0 deletions dicts/dict.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package dicts

// LanguagePack is what each language should implement.
type LanguagePack interface {
// GetResource returns the dictionary of lemmatized words as raw bytes, or
// an error if it cannot be obtained. The generated language packs return
// their embedded "data/<locale>.gob" asset here.
GetResource() ([]byte, error)
// GetLocale returns the language name.
GetLocale() string
}
3 changes: 3 additions & 0 deletions dicts/en/data.go
Git LFS file not shown
237 changes: 0 additions & 237 deletions dicts/en/en.go

This file was deleted.

6 changes: 4 additions & 2 deletions dicts/en/pack.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
// Code generated by golem/dicts/cmd/generate_pack.go DO NOT EDIT
package en

import "github.com/aaaton/golem/dicts"

const locale = "en"

// LanguagePack is an implementation of the generic golem.LanguagePack interface for en
type LanguagePack struct {
}

// NewPackage creates a language pack
func NewPackage() *LanguagePack {
func New() dicts.LanguagePack {
return &LanguagePack{}
}

// GetResource returns the dictionary of lemmatized words
func (l *LanguagePack) GetResource() ([]byte, error) {
return Asset("data/" + locale)
return Asset("data/" + locale + ".gob")
}

// GetLocale returns the language name
Expand Down
3 changes: 3 additions & 0 deletions dicts/es/data.go
Git LFS file not shown
237 changes: 0 additions & 237 deletions dicts/es/es.go

This file was deleted.

6 changes: 4 additions & 2 deletions dicts/es/pack.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
// Code generated by golem/dicts/cmd/generate_pack.go DO NOT EDIT
package es

import "github.com/aaaton/golem/dicts"

const locale = "es"

// LanguagePack is an implementation of the generic golem.LanguagePack interface for es
type LanguagePack struct {
}

// NewPackage creates a language pack
func NewPackage() *LanguagePack {
func New() dicts.LanguagePack {
return &LanguagePack{}
}

// GetResource returns the dictionary of lemmatized words
func (l *LanguagePack) GetResource() ([]byte, error) {
return Asset("data/" + locale)
return Asset("data/" + locale + ".gob")
}

// GetLocale returns the language name
Expand Down
3 changes: 3 additions & 0 deletions dicts/fr/data.go
Git LFS file not shown
Loading

0 comments on commit dd436e8

Please sign in to comment.