From 72393cea38c1c3b7b1c4afb37a82caf43fb706cd Mon Sep 17 00:00:00 2001 From: Arijit Das Date: Thu, 23 Jan 2020 15:06:08 +0530 Subject: [PATCH] Add support for language in Exact Tokenizer --- posting/index.go | 24 ++++++++++++++++++--- query/common_test.go | 17 +-------------- testutil/client.go | 2 +- tok/tok.go | 50 +++++++++++++++++++++----------------------- tok/tokens.go | 2 +- worker/sort.go | 6 +++--- 6 files changed, 51 insertions(+), 50 deletions(-) diff --git a/posting/index.go b/posting/index.go index 31422194562..0430371d268 100644 --- a/posting/index.go +++ b/posting/index.go @@ -439,13 +439,18 @@ func (l *List) AddMutationWithIndex(ctx context.Context, edge *pb.DirectedEdge, } // deleteTokensFor deletes the index for the given attribute and token. -func deleteTokensFor(attr, tokenizerName string) error { +func deleteTokensFor(attr, tokenizerName string, hasLang bool) error { pk := x.ParsedKey{Attr: attr} prefix := pk.IndexPrefix() tokenizer, ok := tok.GetTokenizer(tokenizerName) if !ok { return errors.Errorf("Could not find valid tokenizer for %s", tokenizerName) } + if hasLang { + // We just need the tokenizer identifier for ExactTokenizer having language. + // It will be the same for all languages.
+ tokenizer = tok.GetLangTokenizer(tokenizer, "en") + } prefix = append(prefix, tokenizer.Identifier()) if err := pstore.DropPrefix(prefix); err != nil { return err @@ -785,7 +790,13 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error { glog.Infof("Deleting index for attr %s and tokenizers %s", rb.Attr, rebuildInfo.tokenizersToDelete) for _, tokenizer := range rebuildInfo.tokenizersToDelete { - if err := deleteTokensFor(rb.Attr, tokenizer); err != nil { + if err := deleteTokensFor(rb.Attr, tokenizer, false); err != nil { + return err + } + if tokenizer != "exact" { + continue + } + if err := deleteTokensFor(rb.Attr, tokenizer, true); err != nil { return err } } @@ -804,7 +815,13 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error { rebuildInfo.tokenizersToRebuild) // Before rebuilding, the existing index needs to be deleted. for _, tokenizer := range rebuildInfo.tokenizersToRebuild { - if err := deleteTokensFor(rb.Attr, tokenizer); err != nil { + if err := deleteTokensFor(rb.Attr, tokenizer, false); err != nil { + return err + } + if tokenizer != "exact" { + continue + } + if err := deleteTokensFor(rb.Attr, tokenizer, true); err != nil { return err } } @@ -824,6 +841,7 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error { Value: p.Value, Tid: types.TypeID(p.ValType), } + edge.Lang = string(p.LangTag) for { err := txn.addIndexMutations(ctx, &indexMutationInfo{ diff --git a/query/common_test.go b/query/common_test.go index 1f504394602..a3892a5c9ed 100644 --- a/query/common_test.go +++ b/query/common_test.go @@ -294,9 +294,7 @@ noindex_age : int . noindex_dob : datetime . noindex_alive : bool . noindex_salary : float . -language : [string] .\ -name_lang_index : string @index(exact) @lang . -name_index : string @index(exact) . +language : [string] . ` func populateCluster() { @@ -309,19 +307,6 @@ func populateCluster() { testutil.AssignUids(100000) err = addTriplesToCluster(` - - <10101> "zon"@sv . - <10101> "öffnen"@de . 
- <10102> "öppna"@sv . - <10102> "zumachen"@de . - - - <10103> "zon" . - <10104> "öffnen" . - <10105> "öppna" . - <10106> "zumachen" . - - <1> "Michonne" . <2> "King Lear" . <3> "Margaret" . diff --git a/testutil/client.go b/testutil/client.go index e734cc5e92e..4750398fd9b 100644 --- a/testutil/client.go +++ b/testutil/client.go @@ -67,7 +67,7 @@ func init() { return port } - grpcPort = getPort("TEST_PORT_ALPHA", 8180) + grpcPort = getPort("TEST_PORT_ALPHA", 9180) SockAddr = fmt.Sprintf("localhost:%d", grpcPort) SockAddrHttp = fmt.Sprintf("localhost:%d", grpcPort-1000) diff --git a/tok/tok.go b/tok/tok.go index 919097c2be2..771caa1284b 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -93,7 +93,6 @@ func init() { registerTokenizer(MonthTokenizer{}) registerTokenizer(DayTokenizer{}) registerTokenizer(ExactTokenizer{}) - registerTokenizer(LangTokenizer{}) registerTokenizer(BoolTokenizer{}) registerTokenizer(TrigramTokenizer{}) registerTokenizer(HashTokenizer{}) @@ -292,34 +291,24 @@ func (t TermTokenizer) Identifier() byte { return IdentTerm } func (t TermTokenizer) IsSortable() bool { return false } func (t TermTokenizer) IsLossy() bool { return true } -// ExactTokenizer returns the exact string as a token. -type ExactTokenizer struct{} - -func (t ExactTokenizer) Name() string { return "exact" } -func (t ExactTokenizer) Type() string { return "string" } -func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) { - if term, ok := v.(string); ok { - return []string{term}, nil - } - return nil, errors.Errorf("Exact indices only supported for string types") -} -func (t ExactTokenizer) Identifier() byte { return IdentExact } -func (t ExactTokenizer) IsSortable() bool { return true } -func (t ExactTokenizer) IsLossy() bool { return false } - -// LangTokenizer returns the exact string along with language prefix as a token. -type LangTokenizer struct { +// ExactTokenizer returns the exact string as a token. 
If a collator is provided for +// any language then it also adds the language in the prefix. +type ExactTokenizer struct { langBase string cl *collate.Collator buffer *collate.Buffer } -func (t LangTokenizer) Name() string { return "lang" } -func (t LangTokenizer) Type() string { return "string" } -func (t LangTokenizer) Tokens(v interface{}) ([]string, error) { +func (t ExactTokenizer) Name() string { return "exact" } +func (t ExactTokenizer) Type() string { return "string" } +func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) { val, ok := v.(string) if !ok { - return nil, errors.Errorf("Lang indices only supported for string types") + return nil, errors.Errorf("Exact indices only supported for string types") + } + + if t.cl == nil { + return []string{val}, nil } encodedTerm := t.cl.KeyFromString(t.buffer, val) @@ -332,10 +321,19 @@ func (t LangTokenizer) Tokens(v interface{}) ([]string, error) { t.buffer.Reset() return []string{string(term)}, nil } -func (t LangTokenizer) Identifier() byte { return IdentExactLang } -func (t LangTokenizer) IsSortable() bool { return true } -func (t LangTokenizer) IsLossy() bool { return false } -func (t LangTokenizer) Prefix() []byte { + +func (t ExactTokenizer) Identifier() byte { + if t.cl == nil { + return IdentExact + } + return IdentExactLang +} +func (t ExactTokenizer) IsSortable() bool { return true } +func (t ExactTokenizer) IsLossy() bool { return false } +func (t ExactTokenizer) Prefix() []byte { + if t.cl == nil { + return []byte{IdentExact} + } prefix := []byte{IdentExactLang} prefix = append(prefix, []byte(t.langBase)...)
prefix = append(prefix, IdentDelimiter) diff --git a/tok/tokens.go b/tok/tokens.go index 9c6a6a94968..f5d3e1ca8e3 100644 --- a/tok/tokens.go +++ b/tok/tokens.go @@ -42,7 +42,7 @@ func GetLangTokenizer(t Tokenizer, lang string) Tokenizer { if err != nil { langTag = enLangTag } - return LangTokenizer{langBase: LangBase(lang), cl: collate.New(langTag), + return ExactTokenizer{langBase: LangBase(lang), cl: collate.New(langTag), buffer: &collate.Buffer{}} } } diff --git a/worker/sort.go b/worker/sort.go index 30ef2f068a1..23c184442d8 100644 --- a/worker/sort.go +++ b/worker/sort.go @@ -225,13 +225,13 @@ func sortWithIndex(ctx context.Context, ts *pb.SortMessage) *sortresult { var prefix []byte if len(order.Langs) > 0 { - // Only one language is allowed. + // Only one language is allowed. lang := order.Langs[0] tokenizer = tok.GetLangTokenizer(tokenizer, lang) - langTokenizer, ok := tokenizer.(tok.LangTokenizer) + langTokenizer, ok := tokenizer.(*tok.ExactTokenizer) if !ok { return resultWithError(errors.Errorf( - "Invalid tokenizer for language %s.", lang)) + "Failed to get tokenizer for Attribute %s for language %s.", order.Attr, lang)) prefix = langTokenizer.Prefix() } else {