Skip to content

Commit

Permalink
feat(lyra): removes diacritics to keep index smaller
Browse files Browse the repository at this point in the history
implements #75
  • Loading branch information
micheleriva committed Aug 4, 2022
1 parent 929a6cd commit e8396c3
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 11 deletions.
23 changes: 23 additions & 0 deletions packages/lyra/src/tokenizer/diacritics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
 * Lookup table used by {@link replaceDiacritics}.
 *
 * Each entry maps a set of accented characters from the Latin-1 Supplement
 * block (U+00C0–U+00FF) to the plain ASCII letter that replaces them.
 *
 * The patterns use `\uXXXX` escapes rather than legacy octal escapes
 * (e.g. `\300`): octal escapes in regular expressions are a deprecated
 * compatibility feature and become a syntax error as soon as the `u` flag
 * is added.
 */
const diacritics: ReadonlyArray<{ char: string; base: RegExp }> = [
  // À Á Â Ã Ä Å Æ
  { char: "A", base: /[\u00C0-\u00C6]/g },
  // à á â ã ä å æ
  { char: "a", base: /[\u00E0-\u00E6]/g },
  // È É Ê Ë
  { char: "E", base: /[\u00C8-\u00CB]/g },
  // è é ê ë
  { char: "e", base: /[\u00E8-\u00EB]/g },
  // Ì Í Î Ï
  { char: "I", base: /[\u00CC-\u00CF]/g },
  // ì í î ï
  { char: "i", base: /[\u00EC-\u00EF]/g },
  // Ò Ó Ô Õ Ö Ø — U+00D7 (×, multiplication sign) sits between Ö and Ø and
  // is deliberately excluded: it is not a letter.
  { char: "O", base: /[\u00D2-\u00D6\u00D8]/g },
  // ò ó ô õ ö ø — U+00F7 (÷, division sign) sits between ö and ø and is
  // deliberately excluded: it is not a letter.
  { char: "o", base: /[\u00F2-\u00F6\u00F8]/g },
  // Ù Ú Û Ü
  { char: "U", base: /[\u00D9-\u00DC]/g },
  // ù ú û ü
  { char: "u", base: /[\u00F9-\u00FC]/g },
  // Ý
  { char: "Y", base: /\u00DD/g },
  // ý ÿ
  { char: "y", base: /[\u00FD\u00FF]/g },
  // Ñ
  { char: "N", base: /\u00D1/g },
  // ñ
  { char: "n", base: /\u00F1/g },
  // Ç
  { char: "C", base: /\u00C7/g },
  // ç
  { char: "c", base: /\u00E7/g },
];

/**
 * Replaces accented Latin-1 letters in `str` with their unaccented ASCII
 * base letter (for example `"gâteaux"` becomes `"gateaux"`).
 *
 * Characters that are not listed in the lookup table — including the
 * multiplication (×) and division (÷) signs — are left untouched.
 *
 * @param str - The text to normalise.
 * @returns A copy of `str` with every mapped character replaced.
 */
export function replaceDiacritics(str: string): string {
  let result = str;
  for (const { char, base } of diacritics) {
    // String.prototype.replace with a global RegExp starts from index 0 on
    // every call, so sharing the RegExp objects between calls is safe.
    result = result.replace(base, char);
  }
  return result;
}
3 changes: 2 additions & 1 deletion packages/lyra/src/tokenizer/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Language } from "./languages";
import { replaceDiacritics } from "./diacritics";

const splitRegex: Record<Language, RegExp> = {
dutch: /[^a-z0-9_'-]+/gim,
Expand All @@ -16,7 +17,7 @@ export function tokenize(input: string, language: Language = "english") {
if (typeof input !== "string") return [input];

const splitRule = splitRegex[language];
const tokens = input.toLowerCase().split(splitRule);
const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
return Array.from(new Set(trim(tokens)));
}

Expand Down
20 changes: 10 additions & 10 deletions packages/lyra/tap-snapshots/tests/tokenizer.test.ts.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Array [
"ai",
"fait",
"des",
"gâteaux",
"gateaux",
]
`

Expand Down Expand Up @@ -100,13 +100,13 @@ Array [

exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in norwegian > Should tokenize and stem correctly in norwegian-O2 1`] = `
Array [
"å",
"a",
"sove",
"er",
"en",
"vanskelig",
"ting",
"når",
"nar",
"testene",
"mislykkes",
]
Expand All @@ -124,10 +124,10 @@ Array [
exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in portuguese > Should tokenize and stem correctly in portuguese-O2 1`] = `
Array [
"dormir",
"é",
"e",
"uma",
"coisa",
"difícil",
"dificil",
"quando",
"os",
"testes",
Expand Down Expand Up @@ -156,7 +156,7 @@ Array [

exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in spanish > Should tokenize and stem correctly in spanish-O1 1`] = `
Array [
"cociné",
"cocine",
"unos",
"pasteles",
]
Expand All @@ -179,7 +179,7 @@ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctl
Array [
"jag",
"lagade",
"några",
"nagra",
"kakor",
]
`
Expand All @@ -188,11 +188,11 @@ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctl
Array [
"att",
"sova",
"är",
"ar",
"en",
"svår",
"svar",
"sak",
"när",
"nar",
"testerna",
"misslyckas",
]
Expand Down

0 comments on commit e8396c3

Please sign in to comment.