Skip to content

Commit

Permalink
Merge pull request #51 from ikawaha/develop
Browse files Browse the repository at this point in the history
Release candidate
  • Loading branch information
ikawaha authored Jun 15, 2024
2 parents bb78b99 + 922d2be commit a77f306
Show file tree
Hide file tree
Showing 22 changed files with 289 additions and 47 deletions.
8 changes: 8 additions & 0 deletions dict/builder/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@ const MaxInt16 = 1<<15 - 1

// Config represents the configuration of dictionary builder.
type Config struct {
name string
src string
paths []string
recordInfo *MorphRecordInfo
unkInfo *UnkRecordInfo
enc encoding.Encoding
dictInfo *dict.Info

MatrixDefFileName string
CharDefFileName string
Expand All @@ -40,6 +43,10 @@ func NewConfig(path string, other []string, enc encoding.Encoding, info *MorphRe
}
}

// AddDictInfo sets the dictionary info held by the configuration, replacing
// any previously set value. Build hands this value to the dictionary it
// creates.
func (c *Config) AddDictInfo(info *dict.Info) {
c.dictInfo = info
}

// Build builds a dictionary.
func Build(c *Config) (*dict.Dict, error) {
if c == nil {
Expand Down Expand Up @@ -72,6 +79,7 @@ func Build(c *Config) (*dict.Dict, error) {
ContentsMeta: c.recordInfo.Meta,
Contents: make([][]string, 0, len(records)),
}
ret.SetInfo(c.dictInfo)

// ConnectionTable
matrix, err := parseMatrixDefFile(c.paths[0] + "/" + c.MatrixDefFileName)
Expand Down
30 changes: 29 additions & 1 deletion dict/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ const (
CharDefDictFileName = "chardef.dict"
// UnkDictFileName is the default filename of an unknown dict.
UnkDictFileName = "unk.dict"
// DictInfoFileName is the file name of a dictionary info.
DictInfoFileName = "dict.info"
)

// Dict represents a dictionary of a tokenizer.
Expand All @@ -39,10 +41,15 @@ type Dict struct {
InvokeList InvokeList
GroupList GroupList
UnkDict UnkDict
dictInfo *Info
}

// SetInfo sets the dictionary info of d, replacing any previously set value.
func (d *Dict) SetInfo(info *Info) {
d.dictInfo = info
}

// CharacterCategory returns the category of a rune.
func (d Dict) CharacterCategory(r rune) byte {
func (d *Dict) CharacterCategory(r rune) byte {
if int(r) < len(d.CharCategory) {
return d.CharCategory[r]
}
Expand Down Expand Up @@ -125,6 +132,12 @@ func (d *Dict) loadUnkDict(r io.Reader) error {
return nil
}

// loadDictInfo reads the dictionary info from r and stores it in d.
//
// ReadDictInfo signals unreadable data by returning nil rather than an error,
// so in that case the stored info is nil and this loader still returns nil.
func (d *Dict) loadDictInfo(r io.Reader) error {
	d.dictInfo = ReadDictInfo(r)
	return nil
}

// LoadDictFile loads a dictionary from a file.
func LoadDictFile(path string) (d *Dict, err error) {
r, err := zip.OpenReader(path)
Expand Down Expand Up @@ -156,6 +169,7 @@ var loaders = map[string]dictionaryPartLoader{
ConnectionDictFileName: (*Dict).loadConnectionDict,
CharDefDictFileName: (*Dict).loadCharDefDict,
UnkDictFileName: (*Dict).loadUnkDict,
DictInfoFileName: (*Dict).loadDictInfo,
}

// Load loads a dictionary from a zipped reader.
Expand Down Expand Up @@ -193,6 +207,7 @@ var dictionaryPartFiles = []string{
ConnectionDictFileName,
CharDefDictFileName,
UnkDictFileName,
DictInfoFileName,
}

type dictionaryPartSaver func(Dict, io.Writer) error
Expand All @@ -206,6 +221,7 @@ var savers = map[string]dictionaryPartSaver{
ConnectionDictFileName: Dict.saveConnectionDict,
CharDefDictFileName: Dict.saveCharDefDict,
UnkDictFileName: Dict.saveUnkDict,
DictInfoFileName: Dict.saveInfo,
}

// Save saves a dictionary in a zipped format.
Expand Down Expand Up @@ -273,3 +289,15 @@ func (d Dict) saveUnkDict(w io.Writer) error {
_, err := d.UnkDict.WriteTo(w)
return err
}

// saveInfo writes the dictionary info of d to w.
//
// Nothing is written, and nil is returned, when d has no dictionary info.
func (d Dict) saveInfo(w io.Writer) error {
	info := d.dictInfo
	if info != nil {
		if _, err := info.WriteTo(w); err != nil {
			return err
		}
	}
	return nil
}

// Info returns the dictionary info of d. The result is nil when the
// dictionary carries no info.
func (d Dict) Info() *Info {
return d.dictInfo
}
50 changes: 50 additions & 0 deletions dict/dict_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package dict

import (
"bytes"
"encoding/gob"
"errors"
"io"
)

// Info holds descriptive metadata of a dictionary.
type Info struct {
	// Name is the name that identifies the dictionary.
	Name string
	// Src names the source of the dictionary.
	Src string
}

// ReadDictInfo decodes dictionary info from r. The expected input is two
// consecutive gob encoded strings, the name followed by the source, which is
// the layout produced by Info.WriteTo.
//
// It returns nil if r is nil or if either of the two values cannot be decoded.
func ReadDictInfo(r io.Reader) *Info {
	if r == nil {
		return nil
	}
	var (
		info    Info
		decoder = gob.NewDecoder(r)
	)
	// The fields are decoded in the same order WriteTo encodes them.
	for _, field := range []*string{&info.Name, &info.Src} {
		if err := decoder.Decode(field); err != nil {
			return nil
		}
	}
	return &info
}

// WriteTo implements the io.WriterTo interface. It writes the name and the
// source to w as two consecutive gob encoded strings and returns the number of
// bytes written.
//
// The values are encoded into an in-memory buffer first, so nothing is written
// to w if encoding fails. An error is returned when w is nil.
func (d Info) WriteTo(w io.Writer) (n int64, err error) {
	if w == nil {
		return 0, errors.New("given writer is nil")
	}
	buf := new(bytes.Buffer)
	encoder := gob.NewEncoder(buf)
	for _, field := range []string{d.Name, d.Src} {
		if err := encoder.Encode(field); err != nil {
			return 0, err
		}
	}
	return buf.WriteTo(w)
}
75 changes: 75 additions & 0 deletions dict/dict_info_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package dict

import (
"bytes"
"testing"
)

// TestDictName_golden checks that an Info written with WriteTo is restored
// unchanged by ReadDictInfo.
func TestDictName_golden(t *testing.T) {
	in := Info{Name: "test_dict"}

	// Get gob encoded dictionary info. Decoding is pointless if encoding
	// failed, so stop right away.
	var gobInfo bytes.Buffer
	if _, err := in.WriteTo(&gobInfo); err != nil {
		t.Fatalf("failed to get encoded name data: %v", err)
	}

	// Decode gob encoded dictionary info. ReadDictInfo returns nil on
	// failure, so guard before dereferencing the result.
	out := ReadDictInfo(&gobInfo)
	if out == nil {
		t.Fatal("failed to decode dictionary info: got nil")
	}

	// Assert that every field survived the round trip.
	if in != *out {
		t.Errorf("want %+v, got %+v", in, *out)
	}
}

// TestDictName_bad_input checks how ReadDictInfo handles an empty name, a nil
// reader and malformed gob data.
func TestDictName_bad_input(t *testing.T) {
	t.Run("empty name", func(t *testing.T) {
		in := Info{Name: ""}

		// Get gob encoded dictionary info.
		var gobInfo bytes.Buffer
		if _, err := in.WriteTo(&gobInfo); err != nil {
			t.Fatalf("failed to encode dict name: %v", err)
		}

		// Decode gob encoded dictionary info. ReadDictInfo returns nil on
		// failure, so guard before dereferencing the result.
		got := ReadDictInfo(&gobInfo)
		if got == nil {
			t.Fatal("empty name should still be decoded. got nil")
		}
		if got.Name != "" {
			t.Errorf("empty name should return empty name. got %v", got)
		}
	})

	t.Run("nil input", func(t *testing.T) {
		// Nil input should return nil.
		got := ReadDictInfo(nil)
		if got != nil {
			t.Errorf("nil input should return nil. got %v", got)
		}
	})

	t.Run("bad gob data", func(t *testing.T) {
		// Bad gob data should return nil.
		got := ReadDictInfo(bytes.NewReader([]byte{0x00}))
		if got != nil {
			t.Errorf("bad gob data should return nil. got %v", got)
		}
	})
}

// TestDictName_WriteTo checks that WriteTo rejects a nil writer with the
// expected error message.
func TestDictName_WriteTo(t *testing.T) {
	in := Info{Name: "test_dict"}

	// Nil writer should return error.
	_, err := in.WriteTo(nil)

	// Assert error. Stop here when there is none; calling err.Error() on a
	// nil error below would panic instead of reporting a test failure.
	if err == nil {
		t.Fatal("nil writer should return error")
	}
	// Assert error message.
	want := "given writer is nil"
	if got := err.Error(); want != got {
		t.Errorf("want %v, got %v", want, got)
	}
}
14 changes: 11 additions & 3 deletions dict/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,21 @@ func newTestDict(t *testing.T) *Dict {
[]string{"bb1", "bb2", "bb3"},
},
},
dictInfo: &Info{Name: "testDict", Src: "testDictSrc"},
}
}

// save <--> load
func Test_DictSaveLoad(t *testing.T) {
dict := newTestDict(t)

if nameDict := dict.dictInfo.Name; nameDict != "testDict" {
t.Fatalf("unexpected dict name, %v", nameDict)
}
if srcDict := dict.dictInfo.Src; srcDict != "testDictSrc" {
t.Fatalf("unexpected dict source, %v", srcDict)
}

var b bytes.Buffer
zw := zip.NewWriter(&b)
if err := dict.Save(zw); err != nil {
Expand All @@ -95,8 +103,8 @@ func Test_DictSaveLoad(t *testing.T) {
}

if !reflect.DeepEqual(dict, got) {
t.Errorf("want %+v, got %+v", dict, got)
fmt.Printf("%T\n", got.ContentsMeta)
fmt.Printf("%T\n", dict.ContentsMeta)
t.Errorf("\nwant %+v\ngot %+v\n", dict, got)
fmt.Printf("got type: %T\n", got.ContentsMeta)
fmt.Printf("want type: %T\n", dict.ContentsMeta)
}
}
5 changes: 2 additions & 3 deletions dict/unkdict.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"encoding/binary"
"fmt"
"io"
"io/ioutil"
"sort"
)

Expand Down Expand Up @@ -121,9 +120,9 @@ func ReadUnkDic(r io.Reader) (UnkDict, error) {
if err != nil {
return d, err
}
d.ContentsMeta =me
d.ContentsMeta = me

b, err := ioutil.ReadAll(r)
b, err := io.ReadAll(r)
if err != nil {
return d, err
}
Expand Down
20 changes: 11 additions & 9 deletions ipa/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,19 @@ import (
"github.com/ikawaha/kagome-dict/dict"
)

// DictName represents a dictionary name to identify.
// You can retrieve this name via the Name field of the value returned by the
// Info method of dict.Dict.
const DictName = "IPA"

type FeatureIndex = int

// Features are information given to a word, such as follows:
// 公園 名詞,一般,*,*,*,*,公園,コウエン,コーエン
// に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
// 行っ 動詞,自立,*,*,五段・カ行促音便,連用タ接続,行く,イッ,イッ
// た 助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
// EOS
const (
// Features are information given to a word, such as follows:
// 公園 名詞,一般,*,*,*,*,公園,コウエン,コーエン
// に 助詞,格助詞,一般,*,*,*,に,ニ,ニ
// 行っ 動詞,自立,*,*,五段・カ行促音便,連用タ接続,行く,イッ,イッ
// た 助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
// EOS

// POSHierarchy represents part-of-speech hierarchy
// e.g. Columns 動詞,自立,*,* are POSs which hierarchy depth is 4.
POSHierarchy = 4
Expand All @@ -35,7 +38,6 @@ const (
Pronunciation = 8
)


type systemDict struct {
once sync.Once
dict *dict.Dict
Expand Down Expand Up @@ -75,7 +77,7 @@ func loadDict(full bool) *dict.Dict {
panic(err)
}
r := bytes.NewReader(b)
zr,err := zip.NewReader(r, r.Size())
zr, err := zip.NewReader(r, r.Size())
if err != nil {
panic(err)
}
Expand Down
23 changes: 22 additions & 1 deletion ipa/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ func Test_LoadDictFile(t *testing.T) {
if err != nil {
t.Fatalf("unexpected error, %v", err)
}
info := d.Info()
if info == nil {
t.Fatalf("info is nil")
}
if want, got := DictName, info.Name; want != got {
t.Errorf("want %s, got %s", want, got)
}
if want, got := "mecab-ipadic-2.7.0-20070801+patch", info.Src; want != got {
t.Errorf("want %s, got %s", want, got)
}
if want, got := IPADictEntrySize, len(d.Morphs); want != got {
t.Errorf("want %d, got %d", want, got)
}
Expand Down Expand Up @@ -90,6 +100,17 @@ func Test_ContentsMeta(t *testing.T) {
}
}

// Test_Dict_get_dictionary_name checks that the embedded dictionary reports
// the expected name and source through its Info method.
func Test_Dict_get_dictionary_name(t *testing.T) {
	info := Dict().Info()
	// Info returns a pointer; fail cleanly instead of panicking on a nil
	// dereference when the dictionary carries no info.
	if info == nil {
		t.Fatal("info is nil")
	}
	if want, got := DictName, info.Name; want != got {
		t.Errorf("want %s, got %s", want, got)
	}
	if want, got := "mecab-ipadic-2.7.0-20070801+patch", info.Src; want != got {
		t.Errorf("want %s, got %s", want, got)
	}
}

/*
func Test_InflectionalType(t *testing.T) {
tnz, err := tokenizer.New(Dict())
Expand Down Expand Up @@ -251,4 +272,4 @@ func Test_FeatureIndex(t *testing.T) {
}
}
}
*/
*/
2 changes: 2 additions & 0 deletions ipa/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ module github.com/ikawaha/kagome-dict/ipa
go 1.19

require github.com/ikawaha/kagome-dict v1.0.10

replace github.com/ikawaha/kagome-dict => ../
Binary file modified ipa/ipa.dict
Binary file not shown.
Binary file added ipa/ipa.v1.0.10.dict
Binary file not shown.
Loading

0 comments on commit a77f306

Please sign in to comment.