
Add support for language in Exact Tokenizer
Arijit Das committed Jan 23, 2020
1 parent 30d7fa6 commit 38601e0
Showing 6 changed files with 51 additions and 50 deletions.
24 changes: 21 additions & 3 deletions posting/index.go
@@ -439,13 +439,18 @@ func (l *List) AddMutationWithIndex(ctx context.Context, edge *pb.DirectedEdge,
}

// deleteTokensFor deletes the index for the given attribute and token.
func deleteTokensFor(attr, tokenizerName string) error {
func deleteTokensFor(attr, tokenizerName string, hasLang bool) error {
pk := x.ParsedKey{Attr: attr}
prefix := pk.IndexPrefix()
tokenizer, ok := tok.GetTokenizer(tokenizerName)
if !ok {
return errors.Errorf("Could not find valid tokenizer for %s", tokenizerName)
}
if hasLang {
// We only need the tokenizer identifier for the language-aware ExactTokenizer;
// it is the same for every language.
tokenizer = tok.GetLangTokenizer(tokenizer, "en")
}
prefix = append(prefix, tokenizer.Identifier())
if err := pstore.DropPrefix(prefix); err != nil {
return err
@@ -785,7 +790,13 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error {
glog.Infof("Deleting index for attr %s and tokenizers %s", rb.Attr,
rebuildInfo.tokenizersToDelete)
for _, tokenizer := range rebuildInfo.tokenizersToDelete {
if err := deleteTokensFor(rb.Attr, tokenizer); err != nil {
if err := deleteTokensFor(rb.Attr, tokenizer, false); err != nil {
return err
}
if tokenizer != "exact" {
continue
}
if err := deleteTokensFor(rb.Attr, tokenizer, true); err != nil {
return err
}
}
@@ -804,7 +815,13 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error {
rebuildInfo.tokenizersToRebuild)
// Before rebuilding, the existing index needs to be deleted.
for _, tokenizer := range rebuildInfo.tokenizersToRebuild {
if err := deleteTokensFor(rb.Attr, tokenizer); err != nil {
if err := deleteTokensFor(rb.Attr, tokenizer, false); err != nil {
return err
}
if tokenizer != "exact" {
continue
}
if err := deleteTokensFor(rb.Attr, tokenizer, true); err != nil {
return err
}
}
@@ -824,6 +841,7 @@ func rebuildIndex(ctx context.Context, rb *IndexRebuild) error {
Value: p.Value,
Tid: types.TypeID(p.ValType),
}
edge.Lang = string(p.LangTag)

for {
err := txn.addIndexMutations(ctx, &indexMutationInfo{
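
Index keys share a per-tokenizer prefix: the attribute's key prefix followed by the tokenizer's identifier byte. Because the language-aware exact index uses a different identifier (IdentExactLang instead of IdentExact), rebuildIndex now has to drop two prefixes for the "exact" tokenizer, which is what the second deleteTokensFor call with hasLang == true does. Below is a minimal, self-contained sketch of that idea; the identifier bytes and the "idx|" key header are placeholders, not the real x.ParsedKey layout.

package main

import "fmt"

const (
	identExact     byte = 0x02 // placeholder values; the real bytes are defined in tok/tok.go
	identExactLang byte = 0x03
)

// indexPrefix stands in for pk.IndexPrefix() plus tokenizer.Identifier():
// every key of one index shares <attr header><identifier byte>, so dropping
// that prefix removes exactly that tokenizer's index.
func indexPrefix(attr string, ident byte) []byte {
	return append([]byte("idx|"+attr+"|"), ident)
}

func main() {
	// hasLang == false and hasLang == true resolve to different identifier
	// bytes, hence the two delete passes for the "exact" tokenizer.
	fmt.Printf("plain pass: %q\n", indexPrefix("name", identExact))
	fmt.Printf("lang pass:  %q\n", indexPrefix("name", identExactLang))
}
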
17 changes: 1 addition & 16 deletions query/common_test.go
@@ -294,9 +294,7 @@ noindex_age : int .
noindex_dob : datetime .
noindex_alive : bool .
noindex_salary : float .
language : [string] .\
name_lang_index : string @index(exact) @lang .
name_index : string @index(exact) .
language : [string] .
`

func populateCluster() {
@@ -309,19 +307,6 @@ func populateCluster() {
testutil.AssignUids(100000)

err = addTriplesToCluster(`
<10101> <name_lang_index> "zon"@sv .
<10101> <name_lang_index> "öffnen"@de .
<10102> <name_lang_index> "öppna"@sv .
<10102> <name_lang_index> "zumachen"@de .
<10103> <name_index> "zon" .
<10104> <name_index> "öffnen" .
<10105> <name_index> "öppna" .
<10106> <name_index> "zumachen" .
<1> <name> "Michonne" .
<2> <name> "King Lear" .
<3> <name> "Margaret" .
2 changes: 1 addition & 1 deletion testutil/client.go
@@ -67,7 +67,7 @@ func init() {
return port
}

grpcPort = getPort("TEST_PORT_ALPHA", 8180)
grpcPort = getPort("TEST_PORT_ALPHA", 9180)
SockAddr = fmt.Sprintf("localhost:%d", grpcPort)
SockAddrHttp = fmt.Sprintf("localhost:%d", grpcPort-1000)

50 changes: 24 additions & 26 deletions tok/tok.go
@@ -93,7 +93,6 @@ func init() {
registerTokenizer(MonthTokenizer{})
registerTokenizer(DayTokenizer{})
registerTokenizer(ExactTokenizer{})
registerTokenizer(LangTokenizer{})
registerTokenizer(BoolTokenizer{})
registerTokenizer(TrigramTokenizer{})
registerTokenizer(HashTokenizer{})
@@ -292,34 +291,24 @@ func (t TermTokenizer) Identifier() byte { return IdentTerm }
func (t TermTokenizer) IsSortable() bool { return false }
func (t TermTokenizer) IsLossy() bool { return true }

// ExactTokenizer returns the exact string as a token.
type ExactTokenizer struct{}

func (t ExactTokenizer) Name() string { return "exact" }
func (t ExactTokenizer) Type() string { return "string" }
func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
if term, ok := v.(string); ok {
return []string{term}, nil
}
return nil, errors.Errorf("Exact indices only supported for string types")
}
func (t ExactTokenizer) Identifier() byte { return IdentExact }
func (t ExactTokenizer) IsSortable() bool { return true }
func (t ExactTokenizer) IsLossy() bool { return false }

// LangTokenizer returns the exact string along with language prefix as a token.
type LangTokenizer struct {
// ExactTokenizer returns the exact string as a token. If a collator is provided
// for a language, it also adds the language to the token prefix.
type ExactTokenizer struct {
langBase string
cl *collate.Collator
buffer *collate.Buffer
}

func (t LangTokenizer) Name() string { return "lang" }
func (t LangTokenizer) Type() string { return "string" }
func (t LangTokenizer) Tokens(v interface{}) ([]string, error) {
func (t ExactTokenizer) Name() string { return "exact" }
func (t ExactTokenizer) Type() string { return "string" }
func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
val, ok := v.(string)
if !ok {
return nil, errors.Errorf("Lang indices only supported for string types")
return nil, errors.Errorf("Exact indices only supported for string types")
}

if t.cl == nil {
return []string{val}, nil
}

encodedTerm := t.cl.KeyFromString(t.buffer, val)
@@ -332,10 +321,19 @@ func (t LangTokenizer) Tokens(v interface{}) ([]string, error) {
t.buffer.Reset()
return []string{string(term)}, nil
}
func (t LangTokenizer) Identifier() byte { return IdentExactLang }
func (t LangTokenizer) IsSortable() bool { return true }
func (t LangTokenizer) IsLossy() bool { return false }
func (t LangTokenizer) Prefix() []byte {

func (t ExactTokenizer) Identifier() byte {
if t.cl == nil {
return IdentExact
}
return IdentExactLang
}
func (t ExactTokenizer) IsSortable() bool { return true }
func (t ExactTokenizer) IsLossy() bool { return false }
func (t ExactTokenizer) Prefix() []byte {
if t.cl == nil {
return []byte{IdentExact}
}
prefix := []byte{IdentExactLang}
prefix = append(prefix, []byte(t.langBase)...)
prefix = append(prefix, IdentDelimiter)
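
The merged tokenizer branches on whether a collator is set: with cl == nil it behaves exactly like the old ExactTokenizer, and with a collator it emits a collation key prefixed by the base language, consistent with the Prefix() layout above. A self-contained sketch of that token construction follows; the delimiter byte is a placeholder, and the term assembly mirrors what the elided lines of Tokens appear to do, so treat it as an approximation rather than the exact implementation.

package main

import (
	"fmt"

	"golang.org/x/text/collate"
	"golang.org/x/text/language"
)

const identDelimiter byte = 0x1d // placeholder; the real constant lives in the tok package

// exactTokens mirrors ExactTokenizer.Tokens: a plain passthrough without a
// collator, and <langBase><delimiter><collation key> with one.
func exactTokens(langBase string, cl *collate.Collator, buf *collate.Buffer, val string) []string {
	if cl == nil {
		return []string{val}
	}
	encoded := cl.KeyFromString(buf, val)
	term := make([]byte, 0, len(langBase)+1+len(encoded))
	term = append(term, langBase...)
	term = append(term, identDelimiter)
	term = append(term, encoded...)
	buf.Reset()
	return []string{string(term)}
}

func main() {
	fmt.Printf("plain: %q\n", exactTokens("", nil, nil, "zon"))

	// Swedish collation orders "öppna" correctly relative to other Swedish terms.
	cl := collate.New(language.Swedish)
	token := exactTokens("sv", cl, &collate.Buffer{}, "öppna")
	fmt.Printf("sv token bytes: % x\n", token[0])
}
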
2 changes: 1 addition & 1 deletion tok/tokens.go
@@ -42,7 +42,7 @@ func GetLangTokenizer(t Tokenizer, lang string) Tokenizer {
if err != nil {
langTag = enLangTag
}
return LangTokenizer{langBase: LangBase(lang), cl: collate.New(langTag),
return ExactTokenizer{langBase: LangBase(lang), cl: collate.New(langTag),
buffer: &collate.Buffer{}}
}
return t
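
With LangTokenizer removed, GetLangTokenizer is now the only entry point for the language-aware variant. A hedged usage sketch against the tok package API shown in this diff (the import path is assumed from the repository layout):

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/tok"
)

func main() {
	exact, ok := tok.GetTokenizer("exact")
	if !ok {
		panic("exact tokenizer not registered")
	}
	// After this change the returned tokenizer is an ExactTokenizer carrying
	// a Swedish collator, rather than the removed LangTokenizer.
	sv := tok.GetLangTokenizer(exact, "sv")

	tokens, err := sv.Tokens("öppna")
	fmt.Println(tokens, err, sv.Identifier()) // Identifier() reports IdentExactLang here
}
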
6 changes: 3 additions & 3 deletions worker/sort.go
@@ -225,13 +225,13 @@ func sortWithIndex(ctx context.Context, ts *pb.SortMessage) *sortresult {

var prefix []byte
if len(order.Langs) > 0 {
// Only one language is allowed.
lang := order.Langs[0]
tokenizer = tok.GetLangTokenizer(tokenizer, lang)
langTokenizer, ok := tokenizer.(tok.LangTokenizer)
langTokenizer, ok := tokenizer.(tok.ExactTokenizer)
if !ok {
return resultWithError(errors.Errorf(
"Invalid tokenizer for language %s.", lang))
"Failed to get tokenizer for Attribute %s for language %s.", order.Attr, lang))
}
prefix = langTokenizer.Prefix()
} else {
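
sortWithIndex needs the language-specific key prefix, but Prefix() is defined on the concrete ExactTokenizer rather than on the Tokenizer interface, so the code must assert back to the concrete type after GetLangTokenizer. A minimal sketch of that pattern with simplified stand-in types and placeholder bytes:

package main

import "fmt"

// tokenizer is a cut-down stand-in for tok.Tokenizer.
type tokenizer interface {
	Identifier() byte
}

// exactTokenizer mimics the merged tok.ExactTokenizer.
type exactTokenizer struct{ langBase string }

func (exactTokenizer) Identifier() byte { return 0x03 } // placeholder for IdentExactLang

// Prefix is only reachable through the concrete type, not the interface.
func (t exactTokenizer) Prefix() []byte {
	p := []byte{0x03}
	p = append(p, t.langBase...)
	return append(p, 0x1d) // placeholder for IdentDelimiter
}

func main() {
	var t tokenizer = exactTokenizer{langBase: "de"}
	if et, ok := t.(exactTokenizer); ok {
		fmt.Printf("seek prefix: %q\n", et.Prefix())
	}
}
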
