From c49028e24cde6973773477e8f6869708c2ecc129 Mon Sep 17 00:00:00 2001 From: Paolo Insogna Date: Tue, 7 Mar 2023 12:59:25 +0100 Subject: [PATCH 1/5] feat: Rewritten with components architecture. --- .swcrc | 8 - benchmarks/src/typo-tolerant-search.js | 14 +- examples/with-react/index.html | 20 +- examples/with-react/src/App.css | 134 ++--- examples/with-react/src/App.tsx | 177 +++--- examples/with-react/src/assets/db.json | 48 +- examples/with-react/src/index.css | 38 +- examples/with-react/src/main.tsx | 6 +- examples/with-react/vite.config.ts | 6 +- package.json | 3 +- src/algorithms.ts | 93 ---- src/cjs/index.cts | 126 +++-- src/cjs/internals.cts | 2 +- src/components/algorithms.ts | 42 ++ src/components/defaults.ts | 44 ++ src/components/documents-store.ts | 84 +++ src/components/facets.ts | 97 ++++ src/components/filters.ts | 17 + src/components/hooks.ts | 45 ++ src/components/index.ts | 388 ++++++++++++++ src/{ => components}/levenshtein.ts | 28 +- src/components/sync-blocking-checker.ts | 54 ++ src/errors.ts | 172 +++--- src/facets.ts | 117 ---- src/filters.ts | 103 ---- src/index.ts | 14 +- src/insertion-checker.ts | 31 -- src/internals.ts | 6 +- src/methods/common.ts | 56 -- src/methods/create.ts | 210 +++++--- src/methods/docs.ts | 39 +- src/methods/hooks.ts | 37 -- src/methods/insert.ts | 296 +++------- src/methods/load.ts | 21 - src/methods/remove.ts | 161 +++--- src/methods/save.ts | 14 - src/methods/search.ts | 397 ++++---------- src/methods/serialization.ts | 23 + src/methods/update.ts | 30 ++ src/tokenizer/diacritics.ts | 2 +- src/tokenizer/index.ts | 234 +++----- src/tokenizer/languages.ts | 92 ++-- src/trees/avl.ts | 326 +++++++++++ src/trees/avl/index.ts | 248 --------- src/trees/avl/node.ts | 17 - src/trees/avl/utils.ts | 52 -- src/trees/{radix/index.ts => radix.ts} | 203 ++++--- src/trees/radix/node.ts | 58 -- src/types.ts | 335 ++++++++++++ src/types/facets.ts | 26 - src/types/filters.ts | 33 -- src/types/index.ts | 132 ----- src/utils.ts | 144 +++-- tests/algorithms.test.ts | 53 -- tests/boosting.test.ts | 21 +- tests/ci/playwright/browsers.spec.ts | 1 + tests/ci/playwright/index.html | 106 ++-- tests/ci/playwright/vite.config.js | 17 +- .../{lyra.dataset.test.ts => dataset.test.ts} | 19 +- tests/docs.test.ts | 24 +- tests/elapsed.test.ts | 20 +- tests/facets.test.ts | 99 ++-- tests/filters.test.ts | 243 +++++---- tests/insert.test.ts | 68 ++- tests/levenshtein.test.ts | 2 +- tests/{lyra.test.ts => main.test.ts} | 167 +++--- tests/remove.test.ts | 27 +- ...yra.edge.test.ts => serialization.test.ts} | 77 +-- tests/snapshots/events.json | 186 +++---- tests/tokenizer.test.ts | 506 +++++------------- tests/tree.avl.test.ts | 148 ++--- tests/{radix.test.ts => tree.radix.test.ts} | 4 +- tests/utils.test.ts | 24 +- tsconfig.json | 2 +- 74 files changed, 3421 insertions(+), 3496 deletions(-) delete mode 100644 src/algorithms.ts create mode 100644 src/components/algorithms.ts create mode 100644 src/components/defaults.ts create mode 100644 src/components/documents-store.ts create mode 100644 src/components/facets.ts create mode 100644 src/components/filters.ts create mode 100644 src/components/hooks.ts create mode 100644 src/components/index.ts rename src/{ => components}/levenshtein.ts (100%) create mode 100644 src/components/sync-blocking-checker.ts delete mode 100644 src/facets.ts delete mode 100644 src/filters.ts delete mode 100644 src/insertion-checker.ts delete mode 100644 src/methods/common.ts delete mode 100644 src/methods/hooks.ts delete mode 100644 src/methods/load.ts delete mode 100644 src/methods/save.ts create mode 100644 src/methods/serialization.ts create mode 100644 src/methods/update.ts create mode 100644 src/trees/avl.ts delete mode 100644 src/trees/avl/index.ts delete mode 100644 src/trees/avl/node.ts delete mode 100644 src/trees/avl/utils.ts rename src/trees/{radix/index.ts => radix.ts} (77%) delete mode 100644 src/trees/radix/node.ts create mode 100644 src/types.ts delete mode 100644 src/types/facets.ts delete mode 100644 src/types/filters.ts delete mode 100644 src/types/index.ts delete mode 100644 tests/algorithms.test.ts rename tests/{lyra.dataset.test.ts => dataset.test.ts} (89%) rename tests/{lyra.test.ts => main.test.ts} (87%) rename tests/{lyra.edge.test.ts => serialization.test.ts} (63%) rename tests/{radix.test.ts => tree.radix.test.ts} (98%) diff --git a/.swcrc b/.swcrc index dd1de6e60..ef696c79a 100644 --- a/.swcrc +++ b/.swcrc @@ -14,10 +14,6 @@ }, "experimental": { "keepImportAssertions": true - }, - "baseUrl": ".", - "paths": { - "@stemmer/*": ["./src/stemmer/*"] } }, "sourceMaps": true @@ -41,10 +37,6 @@ }, "experimental": { "keepImportAssertions": true - }, - "baseUrl": ".", - "paths": { - "@stemmer/*": ["./src/stemmer/*"] } }, "sourceMaps": true diff --git a/benchmarks/src/typo-tolerant-search.js b/benchmarks/src/typo-tolerant-search.js index 177109788..da9618781 100644 --- a/benchmarks/src/typo-tolerant-search.js +++ b/benchmarks/src/typo-tolerant-search.js @@ -1,5 +1,7 @@ import cronometro from "cronometro"; -import { create, insertBatch, search } from "../../dist/index.js"; +import { isMainThread } from "worker_threads"; +import { create, search, insertMultiple } from "../../dist/index.js"; +import { createTokenizer } from "../../dist/internals.js"; import { formattedEvents } from "./utils/dataset.js"; const db = await create({ @@ -22,15 +24,17 @@ const dbNoStemming = await create({ second: "string", }, }, - tokenizer: { - enableStemming: false, + components: { + tokenizer: await createTokenizer("english", { stemming: false }), }, }); const first30000Events = formattedEvents.slice(0, 30_000); -await insertBatch(db, first30000Events); -await insertBatch(dbNoStemming, first30000Events); +if (!isMainThread) { + await insertMultiple(db, first30000Events); + await insertMultiple(dbNoStemming, first30000Events); +} await cronometro({ 'search "beauty", default settings': () => { diff --git a/examples/with-react/index.html b/examples/with-react/index.html index 31772ec0a..22b2dbc06 100644 --- a/examples/with-react/index.html +++ b/examples/with-react/index.html @@ -1,13 +1,13 @@ - - - - - Lyra + React - - -
- - + + + + + Lyra + React + + +
+ + diff --git a/examples/with-react/src/App.css b/examples/with-react/src/App.css index cf3316cc0..12a9e1ae9 100644 --- a/examples/with-react/src/App.css +++ b/examples/with-react/src/App.css @@ -1,113 +1,113 @@ .main { - width: 80vw; - min-height: 100vh; - display: flex; - flex-direction: column; - align-items: center; - justify-content: center; + width: 80vw; + min-height: 100vh; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; } .top { - width: 100%; - display: flex; - align-items: center; - justify-content: space-between; - margin: 2rem 0; + width: 100%; + display: flex; + align-items: center; + justify-content: space-between; + margin: 2rem 0; } .title, .movieTitle { - width: 100%; - text-align: left; - font-size: 1.5rem; - margin: 0; + width: 100%; + text-align: left; + font-size: 1.5rem; + margin: 0; } .movieTitle { - font-size: 1.2rem; - margin-bottom: 1rem; + font-size: 1.2rem; + margin-bottom: 1rem; } .input { - outline: none; - border: none; - background-color: transparent; - border: 1px solid #d5b3ffaa; - border-radius: 1rem; - padding: 0.5rem 1rem; - width: 50%; - font-size: 1rem; - color: #eee; - transition: 0.15s; + outline: none; + border: none; + background-color: transparent; + border: 1px solid #d5b3ffaa; + border-radius: 1rem; + padding: 0.5rem 1rem; + width: 50%; + font-size: 1rem; + color: #eee; + transition: 0.15s; } .input:focus { - border-color: #d5b3ff; + border-color: #d5b3ff; } .button { - outline: none; - border: none; - background-color: transparent; - border: 1px solid #ef4444; - border-radius: 0.5rem; - padding: 0.5rem 1rem; - width: fit-content; - font-size: 1rem; - color: #eee; - cursor: pointer; - transition: 0.15s; + outline: none; + border: none; + background-color: transparent; + border: 1px solid #ef4444; + border-radius: 0.5rem; + padding: 0.5rem 1rem; + width: fit-content; + font-size: 1rem; + color: #eee; + cursor: pointer; + transition: 0.15s; } .button:hover { - color: #ef4444; + color: #ef4444; } .container { - width: 100%; - display: grid; - grid-template-columns: repeat(2, minmax(0, 1fr)); - column-gap: 1rem; - row-gap: 1rem; + width: 100%; + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + column-gap: 1rem; + row-gap: 1rem; } .movie { - padding: 1rem; - border-radius: 0.5rem; - border: 2px solid #d5b3ffaa; - cursor: pointer; - transition: 0.15s; + padding: 1rem; + border-radius: 0.5rem; + border: 2px solid #d5b3ffaa; + cursor: pointer; + transition: 0.15s; } .movie:hover { - border-color: #d5b3ff; + border-color: #d5b3ff; } .favLabel { - width: 100%; - text-align: right; - margin: 0; + width: 100%; + text-align: right; + margin: 0; } .details { - width: 100%; - display: flex; - flex-direction: column; - align-items: flex-end; - justify-content: center; + width: 100%; + display: flex; + flex-direction: column; + align-items: flex-end; + justify-content: center; } .sub { - margin: 0; - text-align: right; + margin: 0; + text-align: right; } @media (max-width: 700px) { - .container { - grid-template-columns: repeat(1, minmax(0, 1fr)); - } + .container { + grid-template-columns: repeat(1, minmax(0, 1fr)); + } - .input { - width: 90%; - } + .input { + width: 90%; + } } diff --git a/examples/with-react/src/App.tsx b/examples/with-react/src/App.tsx index 1580c3c41..be87cff0a 100644 --- a/examples/with-react/src/App.tsx +++ b/examples/with-react/src/App.tsx @@ -1,112 +1,103 @@ +import { create, insertMultiple, search } from "@lyrasearch/lyra"; import { useEffect, useRef, useState } from "react"; -import { create, insertBatch, search } from "@lyrasearch/lyra"; -import data from "./assets/db.json"; import "./App.css"; +import data from "./assets/db.json"; interface Movies { - id: string; - title: string; - director: string; - plot: string; - year: number; - isFavorite: boolean; + id: string; + title: string; + director: string; + plot: string; + year: number; + isFavorite: boolean; } function App() { - const searchInput = useRef(null); - const [isSearching, setIsSearching] = useState(false); - const [movies, setMovies] = useState(); + const searchInput = useRef(null); + const [isSearching, setIsSearching] = useState(false); + const [movies, setMovies] = useState(); - useEffect(() => { - // show all movies at the start - setMovies(data); - }, []); + useEffect(() => { + // show all movies at the start + setMovies(data); + }, []); - const handleSearch = async (e: React.KeyboardEvent) => { - if (e.key != "Enter") return; + const handleSearch = async (e: React.KeyboardEvent) => { + if (e.key != "Enter") return; - const db = await create({ - schema: { - id: "string", // users usually don't search this - title: "string", - director: "string", - plot: "string", // users usually don't search this - year: "number", // unsearchable - isFavorite: "boolean", // unsearchable - }, - }); + const db = await create({ + schema: { + id: "string", // users usually don't search this + title: "string", + director: "string", + plot: "string", // users usually don't search this + year: "number", // unsearchable + isFavorite: "boolean", // unsearchable + }, + }); - await insertBatch(db, data, { batchSize: data.length }); + await insertMultiple(db, data, { batchSize: data.length }); - const searchResult = await search(db, { - term: searchInput.current!.value, - properties: ["title", "director"], - tolerance: 1, // for typo tolerance - }); + const searchResult = await search(db, { + term: searchInput.current!.value, + properties: ["title", "director"], + tolerance: 1, // for typo tolerance + }); - let result = []; + let result = []; - for (let i = 0; i < searchResult.hits.length; i++) { - result.push(searchResult.hits[i].document); - } + for (let i = 0; i < searchResult.hits.length; i++) { + result.push(searchResult.hits[i].document); + } - setIsSearching(true); - setMovies(result); - }; + setIsSearching(true); + setMovies(result); + }; - return ( -
-
- {isSearching ? ( - <> -

- Searching for "{searchInput.current!.value}" -

- - - ) : ( - <> -

All Movies

- handleSearch(e)} - className="input" - /> - - )} -
-
- {movies?.length ? ( - <> - {movies.map((movie) => ( -
- {movie.isFavorite && ( -

- )} -

{movie.title}

-

{movie.plot}

-
- {movie.director} - {movie.year} -
-
- ))} - - ) : ( -

No movies found...

- )} -
-
- ); + return ( +
+
+ {isSearching ? ( + <> +

Searching for "{searchInput.current!.value}"

+ + + ) : ( + <> +

All Movies

+ handleSearch(e)} className="input" /> + + )} +
+
+ {movies?.length ? ( + <> + {movies.map(movie => ( +
+ {movie.isFavorite &&

} +

{movie.title}

+

{movie.plot}

+
+ {movie.director} + {movie.year} +
+
+ ))} + + ) : ( +

No movies found...

+ )} +
+
+ ); } export default App; diff --git a/examples/with-react/src/assets/db.json b/examples/with-react/src/assets/db.json index 0c16d2f9f..8335d3bc8 100644 --- a/examples/with-react/src/assets/db.json +++ b/examples/with-react/src/assets/db.json @@ -1,26 +1,26 @@ [ - { - "id": "1", - "title": "The Prestige", - "director": "Christopher Nolan", - "plot": "Two friends and fellow magicians become bitter enemies after a sudden tragedy. As they devote themselves to this rivalry, they make sacrifices that bring them fame but with terrible consequences.", - "year": 2006, - "isFavorite": true - }, - { - "id": "2", - "title": "Big Fish", - "director": "Tim Burton", - "plot": "Will Bloom returns home to care for his dying father, who had a penchant for telling unbelievable stories. After he passes away, Will tries to find out if his tales were really true.", - "year": 2004, - "isFavorite": true - }, - { - "id": "3", - "title": "Harry Potter and the Philosopher's Stone", - "director": "Chris Columbus", - "plot": "Harry Potter, an eleven-year-old orphan, discovers that he is a wizard and is invited to study at Hogwarts. Even as he escapes a dreary life and enters a world of magic, he finds trouble awaiting him.", - "year": 2001, - "isFavorite": false - } + { + "id": "1", + "title": "The Prestige", + "director": "Christopher Nolan", + "plot": "Two friends and fellow magicians become bitter enemies after a sudden tragedy. As they devote themselves to this rivalry, they make sacrifices that bring them fame but with terrible consequences.", + "year": 2006, + "isFavorite": true + }, + { + "id": "2", + "title": "Big Fish", + "director": "Tim Burton", + "plot": "Will Bloom returns home to care for his dying father, who had a penchant for telling unbelievable stories. After he passes away, Will tries to find out if his tales were really true.", + "year": 2004, + "isFavorite": true + }, + { + "id": "3", + "title": "Harry Potter and the Philosopher's Stone", + "director": "Chris Columbus", + "plot": "Harry Potter, an eleven-year-old orphan, discovers that he is a wizard and is invited to study at Hogwarts. Even as he escapes a dreary life and enters a world of magic, he finds trouble awaiting him.", + "year": 2001, + "isFavorite": false + } ] diff --git a/examples/with-react/src/index.css b/examples/with-react/src/index.css index 8da03067f..063350e34 100644 --- a/examples/with-react/src/index.css +++ b/examples/with-react/src/index.css @@ -2,13 +2,13 @@ html, body { - padding: 0; - margin: 0; - font-family: "Inter", sans-serif; - background-color: #1b1b1d; - color: #eee; - overflow-x: hidden; - scroll-behavior: smooth; + padding: 0; + margin: 0; + font-family: "Inter", sans-serif; + background-color: #1b1b1d; + color: #eee; + overflow-x: hidden; + scroll-behavior: smooth; } h1, @@ -18,25 +18,25 @@ h4, h5, h6, p { - margin: 0; - margin-bottom: 0.5rem; + margin: 0; + margin-bottom: 0.5rem; } a { - color: inherit; - text-decoration: none; + color: inherit; + text-decoration: none; } * { - box-sizing: border-box; + box-sizing: border-box; } #root { - padding: 0; - margin: 0; - width: 100vw; - min-height: 100vh; - display: flex; - align-items: center; - justify-content: center; + padding: 0; + margin: 0; + width: 100vw; + min-height: 100vh; + display: flex; + align-items: center; + justify-content: center; } diff --git a/examples/with-react/src/main.tsx b/examples/with-react/src/main.tsx index 7f8e89e6b..8b1ddb971 100644 --- a/examples/with-react/src/main.tsx +++ b/examples/with-react/src/main.tsx @@ -4,7 +4,7 @@ import App from "./App"; import "./index.css"; ReactDOM.createRoot(document.getElementById("root") as HTMLElement).render( - - - + + + , ); diff --git a/examples/with-react/vite.config.ts b/examples/with-react/vite.config.ts index 5a33944a9..9cc50ead1 100644 --- a/examples/with-react/vite.config.ts +++ b/examples/with-react/vite.config.ts @@ -1,7 +1,7 @@ -import { defineConfig } from 'vite' -import react from '@vitejs/plugin-react' +import { defineConfig } from "vite"; +import react from "@vitejs/plugin-react"; // https://vitejs.dev/config/ export default defineConfig({ plugins: [react()], -}) +}); diff --git a/package.json b/package.json index 834e69a17..8001e76be 100644 --- a/package.json +++ b/package.json @@ -118,7 +118,8 @@ "tape": "^5.6.1", "tcompare": "^6.0.0", "tsx": "^3.12.1", - "typescript": "^4.9.4" + "typescript": "^4.9.4", + "vite": "^4.1.4" }, "pnpm": { "peerDependencyRules": { diff --git a/src/algorithms.ts b/src/algorithms.ts deleted file mode 100644 index f3f8f3813..000000000 --- a/src/algorithms.ts +++ /dev/null @@ -1,93 +0,0 @@ -import type { BM25Params, TokenScore } from "./types/index.js"; -import * as ERRORS from "./errors.js"; - -// Adapted from https://github.com/lovasoa/fast_array_intersect -// MIT Licensed (https://github.com/lovasoa/fast_array_intersect/blob/master/LICENSE) -// while on tag https://github.com/lovasoa/fast_array_intersect/tree/v1.1.0 -export function intersectTokenScores(arrays: TokenScore[][]): TokenScore[] { - if (arrays.length === 0) { - return []; - } - - for (let i = 1; i < arrays.length; i++) { - if (arrays[i].length < arrays[0].length) { - const tmp = arrays[0]; - arrays[0] = arrays[i]; - arrays[i] = tmp; - } - } - - const set: Map = new Map(); - for (const elem of arrays[0]) { - set.set(elem[0], [1, elem[1]]); - } - - const arrLength = arrays.length; - for (let i = 1; i < arrLength; i++) { - let found = 0; - for (const elem of arrays[i]) { - /* c8 ignore next */ - const key = elem[0] ?? ""; - - const [count, score] = set.get(key) ?? [0, 0]; - if (count === i) { - set.set(key, [count + 1, score + elem[1]]); - found++; - } - } - - if (found === 0) { - return []; - } - } - - const result: TokenScore[] = []; - - for (const [token, [count, score]] of set) { - if (count === arrLength) { - result.push([token, score]); - } - } - - return result; -} - -export function prioritizeTokenScores(arrays: TokenScore[][], boost: number): TokenScore[] { - if (boost === 0) { - throw new Error(ERRORS.INVALID_BOOST_VALUE()); - } - - const tokenMap: Record = {}; - - const mapsLength = arrays.length; - for (let i = 0; i < mapsLength; i++) { - const arr = arrays[i]; - - const entriesLength = arr.length; - for (let j = 0; j < entriesLength; j++) { - const [token, score] = arr[j]; - const boostScore = score * boost; - - if (token in tokenMap) { - tokenMap[token] *= 1.5 + boostScore; - } else { - tokenMap[token] = boostScore; - } - } - } - - return Object.entries(tokenMap).sort((a, b) => b[1] - a[1]); -} - -export function BM25( - tf: number, - matchingCount: number, - docsCount: number, - fieldLength: number, - averageFieldLength: number, - BM25Params: BM25Params -) { - const { k, b, d } = BM25Params; - const idf = Math.log(1 + (docsCount - matchingCount + 0.5) / (matchingCount + 0.5)); - return idf * (d + tf * (k + 1)) / (tf + k * (1 - b + b * fieldLength / averageFieldLength)); -} \ No newline at end of file diff --git a/src/cjs/index.cts b/src/cjs/index.cts index 8af9d4cf1..e772bf90d 100644 --- a/src/cjs/index.cts +++ b/src/cjs/index.cts @@ -1,40 +1,47 @@ import type { create as esmCreate } from "../methods/create.js"; -import type { - insert as esmInsert, - insertBatch as esmInsertBatch, - insertWithHooks as esmInsertWithHooks, -} from "../methods/insert.js"; -import type { load as esmLoad } from "../methods/load.js"; -import type { remove as esmRemove } from "../methods/remove.js"; -import type { save as esmSave } from "../methods/save.js"; -import type { search as esmSearch } from "../methods/search.js"; import type { count as esmCount, getByID as esmGetByID } from "../methods/docs.js"; +import type { insert as esmInsert, insertMultiple as esminsertMultiple } from "../methods/insert.js"; +import type { remove as esmRemove, removeMultiple as esmRemoveMultiple } from "../methods/remove.js"; +import type { search as esmSearch } from "../methods/search.js"; +import type { load as esmLoad, save as esmSave } from "../methods/serialization.js"; +import type { update as esmUpdate, updateMultiple as esmUpdateMultiple } from "../methods/update.js"; export interface LyraExport { + count: typeof esmCount; create: typeof esmCreate; + getByID: typeof esmGetByID; insert: typeof esmInsert; - insertWithHooks: typeof esmInsertWithHooks; - insertBatch: typeof esmInsertBatch; + insertMultiple: typeof esminsertMultiple; + load: typeof esmLoad; remove: typeof esmRemove; - search: typeof esmSearch; + removeMultiple: typeof esmRemoveMultiple; save: typeof esmSave; - load: typeof esmLoad; - count: typeof esmCount; - getByID: typeof esmGetByID; + search: typeof esmSearch; } export type RequireCallback = (err: Error | undefined, lyra?: LyraExport) => void; +let _esmCount: typeof esmCount; let _esmCreate: typeof esmCreate; +let _esmGetByID: typeof esmGetByID; let _esmInsert: typeof esmInsert; -let _esmInsertWithHooks: typeof esmInsertWithHooks; -let _esmInsertBatch: typeof esmInsertBatch; +let _esmInsertMultiple: typeof esminsertMultiple; +let _esmLoad: typeof esmLoad; let _esmRemove: typeof esmRemove; -let _esmSearch: typeof esmSearch; +let _esmRemoveMultiple: typeof esmRemoveMultiple; let _esmSave: typeof esmSave; -let _esmLoad: typeof esmLoad; -let _esmCount: typeof esmCount; -let _esmGetByID: typeof esmGetByID; +let _esmSearch: typeof esmSearch; +let _esmUpdate: typeof esmUpdate; +let _esmUpdateMultiple: typeof esmUpdateMultiple; + +export async function count(...args: Parameters): ReturnType { + if (!_esmCount) { + const imported = await import("../methods/docs.js"); + _esmCount = imported.count; + } + + return _esmCount(...args); +} export async function create(...args: Parameters): ReturnType { if (!_esmCreate) { @@ -45,6 +52,15 @@ export async function create(...args: Parameters): ReturnType< return _esmCreate(...args); } +export async function getByID(...args: Parameters): ReturnType { + if (!_esmGetByID) { + const imported = await import("../methods/docs.js"); + _esmGetByID = imported.getByID; + } + + return _esmGetByID(...args); +} + export async function insert(...args: Parameters): ReturnType { if (!_esmInsert) { const imported = await import("../methods/insert.js"); @@ -54,24 +70,24 @@ export async function insert(...args: Parameters): ReturnType< return _esmInsert(...args); } -export async function insertWithHooks( - ...args: Parameters -): ReturnType { - if (!_esmInsertWithHooks) { +export async function insertMultiple( + ...args: Parameters +): ReturnType { + if (!_esmInsertMultiple) { const imported = await import("../methods/insert.js"); - _esmInsertWithHooks = imported.insertWithHooks; + _esmInsertMultiple = imported.insertMultiple; } - return _esmInsertWithHooks(...args); + return _esmInsertMultiple(...args); } -export async function insertBatch(...args: Parameters): ReturnType { - if (!_esmInsertBatch) { - const imported = await import("../methods/insert.js"); - _esmInsertBatch = imported.insertBatch; +export async function load(...args: Parameters): ReturnType { + if (!_esmLoad) { + const imported = await import("../methods/serialization.js"); + _esmLoad = imported.load; } - return _esmInsertBatch(...args); + return _esmLoad(...args); } export async function remove(...args: Parameters): ReturnType { @@ -83,49 +99,53 @@ export async function remove(...args: Parameters): ReturnType< return _esmRemove(...args); } -export async function search(...args: Parameters): ReturnType { - if (!_esmSearch) { - const imported = await import("../methods/search.js"); - _esmSearch = imported.search; +export async function removeMultiple( + ...args: Parameters +): ReturnType { + if (!_esmRemoveMultiple) { + const imported = await import("../methods/remove.js"); + _esmRemoveMultiple = imported.removeMultiple; } - return _esmSearch(...args); + return _esmRemoveMultiple(...args); } export async function save(...args: Parameters): ReturnType { if (!_esmSave) { - const imported = await import("../methods/save.js"); + const imported = await import("../methods/serialization.js"); _esmSave = imported.save; } return _esmSave(...args); } -export async function load(...args: Parameters): ReturnType { - if (!_esmLoad) { - const imported = await import("../methods/load.js"); - _esmLoad = imported.load; +export async function search(...args: Parameters): ReturnType { + if (!_esmSearch) { + const imported = await import("../methods/search.js"); + _esmSearch = imported.search; } - return _esmLoad(...args); + return _esmSearch(...args); } -export async function count(...args: Parameters): ReturnType { - if (!_esmCount) { - const imported = await import("../methods/docs.js"); - _esmCount = imported.count; +export async function update(...args: Parameters): ReturnType { + if (!_esmUpdate) { + const imported = await import("../methods/update.js"); + _esmUpdate = imported.update; } - return _esmCount(...args); + return _esmUpdate(...args); } -export async function getByID(...args: Parameters): ReturnType { - if (!_esmGetByID) { - const imported = await import("../methods/docs.js"); - _esmGetByID = imported.getByID; +export async function updateMultiple( + ...args: Parameters +): ReturnType { + if (!_esmUpdateMultiple) { + const imported = await import("../methods/update.js"); + _esmUpdateMultiple = imported.updateMultiple; } - return _esmGetByID(...args); + return _esmUpdateMultiple(...args); } export function requireLyra(callback: RequireCallback): void { diff --git a/src/cjs/internals.cts b/src/cjs/internals.cts index 273a5c0c1..339854207 100644 --- a/src/cjs/internals.cts +++ b/src/cjs/internals.cts @@ -1,4 +1,4 @@ -import type { BoundedMetric } from "../levenshtein.js"; +import type { BoundedMetric } from "../components/levenshtein.js"; export interface LyraInternals { boundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric; diff --git a/src/components/algorithms.ts b/src/components/algorithms.ts new file mode 100644 index 000000000..bfd2cf1c5 --- /dev/null +++ b/src/components/algorithms.ts @@ -0,0 +1,42 @@ +import { createError } from "../errors.js"; +import { TokenScore, BM25Params } from "../types.js"; + +export function prioritizeTokenScores(arrays: TokenScore[][], boost: number): TokenScore[] { + if (boost === 0) { + throw createError("INVALID_BOOST_VALUE"); + } + + const tokenMap: Record = {}; + + const mapsLength = arrays.length; + for (let i = 0; i < mapsLength; i++) { + const arr = arrays[i]; + + const entriesLength = arr.length; + for (let j = 0; j < entriesLength; j++) { + const [token, score] = arr[j]; + const boostScore = score * boost; + + if (token in tokenMap) { + tokenMap[token] *= 1.5 + boostScore; + } else { + tokenMap[token] = boostScore; + } + } + } + + return Object.entries(tokenMap).sort((a, b) => b[1] - a[1]); +} + +export function BM25( + tf: number, + matchingCount: number, + docsCount: number, + fieldLength: number, + averageFieldLength: number, + BM25Params: Required, +) { + const { k, b, d } = BM25Params; + const idf = Math.log(1 + (docsCount - matchingCount + 0.5) / (matchingCount + 0.5)); + return (idf * (d + tf * (k + 1))) / (tf + k * (1 - b + (b * fieldLength) / averageFieldLength)); +} diff --git a/src/components/defaults.ts b/src/components/defaults.ts new file mode 100644 index 000000000..a9f5ac4a5 --- /dev/null +++ b/src/components/defaults.ts @@ -0,0 +1,44 @@ +import { createError } from "../errors.js"; +import { Document, Schema, SimpleComponents } from "../types.js"; +import { getDocumentProperties, uniqueId } from "../utils.js"; + +function validateSchema(doc: Document, schema: S): boolean { + for (const [prop, type] of Object.entries(schema)) { + if (typeof type === "object") { + if (!doc[prop] || (typeof doc[prop] !== "object" && Array.isArray(doc[prop]))) { + return false; + } + + if (!validateSchema(doc[prop] as Document, type)) { + return false; + } + } + + if (typeof doc[prop] !== type) { + return false; + } + } + + return true; +} + +export function getDefaultComponents(): SimpleComponents { + return { + validateSchema, + getDocumentIndexId(doc: Document): string { + if (doc.id) { + if (typeof doc.id !== "string") { + throw createError("DOCUMENT_ID_MUST_BE_STRING", typeof doc.id); + } + + return doc.id; + } + + return uniqueId(); + }, + getDocumentProperties, + formatElapsedTime(n: bigint): bigint { + return n; + }, + }; +} diff --git a/src/components/documents-store.ts b/src/components/documents-store.ts new file mode 100644 index 000000000..3f783278c --- /dev/null +++ b/src/components/documents-store.ts @@ -0,0 +1,84 @@ +import { Document, IDocumentsStore, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; + +export interface DocumentsStore extends OpaqueDocumentStore { + docs: Record; + count: number; +} + +type DefaultDocumentsStore = IDocumentsStore; + +function create(): DocumentsStore { + return { + docs: {}, + count: 0, + }; +} + +function get(store: DocumentsStore, id: string): Document | undefined { + return store.docs[id]; +} + +function getMultiple(store: DocumentsStore, ids: string[]): (Document | undefined)[] { + const found: (Document | undefined)[] = Array.from({ length: ids.length }); + + for (let i = 0; i < ids.length; i++) { + found[i] = store.docs[ids[i]]; + } + + return found; +} + +function store(store: DocumentsStore, id: string, doc: Document): boolean { + if (typeof store.docs[id] !== "undefined") { + return false; + } + + store.docs[id] = doc; + store.count++; + + return true; +} + +function remove(store: DocumentsStore, id: string): boolean { + if (typeof store.docs[id] === "undefined") { + return false; + } + + store.docs[id] = undefined; + store.count--; + + return true; +} + +function count(store: DocumentsStore): number { + return store.count; +} + +function load(raw: unknown): DocumentsStore { + const rawDocument = raw as DocumentsStore; + + return { + docs: rawDocument.docs, + count: rawDocument.count, + }; +} + +function save(docs: DocumentsStore): unknown { + return { + docs: docs.docs, + count: docs.count, + }; +} + +export function createDocumentsStore(): DefaultDocumentsStore { + return { + create, + get, + getMultiple, + store, + remove, + count, + load, + save, + }; +} diff --git a/src/components/facets.ts b/src/components/facets.ts new file mode 100644 index 000000000..330dce5e2 --- /dev/null +++ b/src/components/facets.ts @@ -0,0 +1,97 @@ +import type { + FacetResult, + FacetSorting, + Lyra, + NumberFacetDefinition, + OpaqueDocumentStore, + OpaqueIndex, + Schema, + SearchParams, + StringFacetDefinition, + TokenScore, +} from "../types.js"; +import { getNested } from "../utils.js"; + +function sortingPredicate(order: FacetSorting = "desc", a: [string, number], b: [string, number]) { + if (order.toLowerCase() === "asc") { + return a[1] - b[1]; + } else { + return b[1] - a[1]; + } +} + +export async function getFacets( + lyra: Lyra, + results: TokenScore[], + facetsConfig: Required["facets"], +): Promise { + const facets: FacetResult = {}; + const allIDs = results.map(([id]) => id); + const allDocs = await lyra.documentsStore.getMultiple(lyra.data.docs, allIDs); + const facetKeys = Object.keys(facetsConfig!); + + const properties = await lyra.index.getSearchablePropertiesWithTypes(lyra.data.index); + + for (const facet of facetKeys) { + let values = {}; + + // Hack to guarantee the same order of ranges as specified by the user + if (properties[facet] === "number") { + const { ranges } = facetsConfig[facet] as NumberFacetDefinition; + const tmp = []; + for (const range of ranges) { + tmp.push([`${range.from}-${range.to}`, 0]); + } + values = Object.fromEntries(tmp); + } + + facets[facet] = { + count: 0, + values, + }; + } + + const allDocsLength = allDocs.length; + for (let i = 0; i < allDocsLength; i++) { + const doc = allDocs[i]; + + for (const facet of facetKeys) { + const facetValue = facet.includes(".") ? getNested(doc!, facet)! : (doc![facet] as number | boolean); + + // Range facets based on numbers + if (properties[facet] === "number") { + for (const range of (facetsConfig[facet] as NumberFacetDefinition).ranges) { + if (facetValue >= range.from && facetValue <= range.to) { + if (facets[facet].values[`${range.from}-${range.to}`] === undefined) { + facets[facet].values[`${range.from}-${range.to}`] = 1; + } else { + facets[facet].values[`${range.from}-${range.to}`]++; + } + } + } + } else { + // String or boolean based facets + const value = facetValue.toString(); + facets[facet].values[value] = (facets[facet].values[value] ?? 0) + 1; + } + } + } + + for (const facet of facetKeys) { + // Count the number of values for each facet + facets[facet].count = Object.keys(facets[facet].values).length; + + // Sort only string-based facets + if (properties[facet] === "string") { + const stringFacetDefinition = facetsConfig as StringFacetDefinition; + + facets[facet].values = Object.fromEntries( + Object.entries(facets[facet].values) + .sort((a, b) => sortingPredicate(stringFacetDefinition.sort, a, b)) + .slice(stringFacetDefinition.offset ?? 0, stringFacetDefinition.limit ?? 10), + ); + } + } + + return facets; +} diff --git a/src/components/filters.ts b/src/components/filters.ts new file mode 100644 index 000000000..2faa8d4a8 --- /dev/null +++ b/src/components/filters.ts @@ -0,0 +1,17 @@ +export function intersectFilteredIDs(filtered: string[], lookedUp: [string, number][]): [string, number][] { + const map = new Map(); + const result: [string, number][] = []; + + for (const id of filtered) { + map.set(id, true); + } + + for (const [id, score] of lookedUp) { + if (map.has(id)) { + result.push([id, score]); + map.delete(id); + } + } + + return result; +} diff --git a/src/components/hooks.ts b/src/components/hooks.ts new file mode 100644 index 000000000..b0d61615d --- /dev/null +++ b/src/components/hooks.ts @@ -0,0 +1,45 @@ +import { + Document, + Lyra, + MultipleCallbackComponent, + OpaqueDocumentStore, + OpaqueIndex, + Schema, + SingleCallbackComponent, +} from "../types.js"; + +export const COMPLEX_COMPONENTS = ["tokenizer", "index", "documentsStore"]; + +export const SIMPLE_COMPONENTS = ["validateSchema", "getDocumentIndexId", "getDocumentProperties", "formatElapsedTime"]; + +export const SIMPLE_OR_ARRAY_COMPONENTS = [ + "beforeInsert", + "afterInsert", + "beforeRemove", + "afterRemove", + "beforeMultipleInsert", + "afterMultipleInsert", + "beforeMultipleRemove", + "afterMultipleRemove", +]; + +export async function runSingleHook( + hooks: SingleCallbackComponent[], + lyra: Lyra, + id: string, + doc?: Document, +): Promise { + for (let i = 0; i < hooks.length; i++) { + await hooks[i](lyra, id, doc); + } +} + +export async function runMultipleHook( + hooks: MultipleCallbackComponent[], + lyra: Lyra, + docsOrIds: Document[] | string[], +): Promise { + for (let i = 0; i < hooks.length; i++) { + await hooks[i](lyra, docsOrIds); + } +} diff --git a/src/components/index.ts b/src/components/index.ts new file mode 100644 index 000000000..9692fc51a --- /dev/null +++ b/src/components/index.ts @@ -0,0 +1,388 @@ +import { createError } from "../errors.js"; +import { + create as avlCreate, + find as avlFind, + greaterThan as avlGreaterThan, + insert as avlInsert, + lessThan as avlLessThan, + Node as AVLNode, + rangeSearch as avlRangeSearch, + removeDocument as avlRemoveDocument, +} from "../trees/avl.js"; +import { + create as radixCreate, + find as radixFind, + insert as radixInsert, + Node as RadixNode, + removeDocumentByWord as radixRemoveDocument, +} from "../trees/radix.js"; +import { + BM25Params, + ComparisonOperator, + IIndex, + Lyra, + OpaqueDocumentStore, + OpaqueIndex, + Schema, + SearchableType, + SearchableValue, + SearchContext, + Tokenizer, + TokenScore, +} from "../types.js"; +import { intersect } from "../utils.js"; +import { BM25 } from "./algorithms.js"; + +type FrequencyMap = { + [property: string]: { + [documentID: string]: + | { + [token: string]: number; + } + | undefined; + }; +}; + +type BooleanIndex = { + true: string[]; + false: string[]; +}; + +export interface Index extends OpaqueIndex { + indexes: Record | BooleanIndex>; + searchableProperties: string[]; + searchablePropertiesWithTypes: Record; + frequencies: FrequencyMap; + tokenOccurrencies: Record>; + avgFieldLength: Record; + fieldLengths: Record>; +} + +type DefaultIndex = IIndex; + +function create( + lyra: Lyra, + schema: Schema, + index?: Index, + prefix = "", +): Index { + if (!index) { + index = { + indexes: {}, + searchableProperties: [], + searchablePropertiesWithTypes: {}, + frequencies: {}, + tokenOccurrencies: {}, + avgFieldLength: {}, + fieldLengths: {}, + }; + } + + for (const [prop, type] of Object.entries(schema)) { + const typeActualType = typeof type; + const path = `${prefix}${prefix ? "." : ""}${prop}`; + + if (typeActualType === "object" && !Array.isArray(type)) { + // Nested + create(lyra, type as Schema, index, path); + continue; + } + + switch (type) { + case "boolean": + index.indexes[path] = { true: [], false: [] }; + break; + case "number": + index.indexes[path] = avlCreate(0, []); + break; + case "string": + index.indexes[path] = radixCreate(); + index.avgFieldLength[path] = 0; + index.frequencies[path] = {}; + index.tokenOccurrencies[path] = {}; + index.fieldLengths[path] = {}; + + break; + default: + throw createError("INVALID_SCHEMA_TYPE", Array.isArray(type) ? "array" : typeActualType); + } + + index.searchableProperties.push(path); + index.searchablePropertiesWithTypes[path] = type; + } + + return index; +} + +function insert( + index: Index, + prop: string, + id: string, + value: SearchableValue, + language: string | undefined, + tokenizer: Tokenizer, + docsCount: number, +): void { + if (typeof value === "number") { + avlInsert(index.indexes[prop] as AVLNode, value as number, [id]); + return; + } else if (typeof value === "boolean") { + (index.indexes[prop] as BooleanIndex)[value ? "true" : "false"].push(id); + return; + } + + const tokens = tokenizer.tokenize(value as string, language); + + if (!(id in index.frequencies[prop])) { + index.frequencies[prop][id] = {}; + } + + index.fieldLengths[prop][id] = tokens.length; + index.avgFieldLength[prop] = ((index.avgFieldLength[prop] ?? 0) * (docsCount - 1) + tokens.length) / docsCount; + + for (const token of tokens) { + let tokenFrequency = 0; + + for (const t of tokens) { + if (t === token) { + tokenFrequency++; + } + } + + const tf = tokenFrequency / tokens.length; + + index.frequencies[prop][id]![token] = tf; + + if (!(token in index.tokenOccurrencies[prop])) { + index.tokenOccurrencies[prop][token] = 0; + } + + // increase a token counter that may not yet exist + index.tokenOccurrencies[prop][token] = (index.tokenOccurrencies[prop][token] ?? 0) + 1; + + radixInsert(index.indexes[prop] as RadixNode, token, id); + } +} + +function remove( + index: Index, + prop: string, + id: string, + value: SearchableValue, + language: string | undefined, + tokenizer: Tokenizer, + docsCount: number, +): void { + if (typeof value === "number") { + avlRemoveDocument(index.indexes[prop] as AVLNode, id, value); + return; + } else if (typeof value === "boolean") { + const booleanKey = value ? "true" : "false"; + const position = (index.indexes[prop] as BooleanIndex)[booleanKey].indexOf(id); + + (index.indexes[prop] as BooleanIndex)[value ? "true" : "false"].splice(position, 1); + return; + } + + const tokens = tokenizer.tokenize(value as string, language); + + index.avgFieldLength[prop] = + (index.avgFieldLength[prop] * docsCount - index.fieldLengths[prop][id]!) / (docsCount - 1); + index.fieldLengths[prop][id] = undefined; + index.frequencies[prop][id] = undefined; + + for (const token of tokens) { + index.tokenOccurrencies[prop][token]--; + radixRemoveDocument(index.indexes[prop] as RadixNode, token, id); + } +} + +function search(index: Index, prop: string, term: string, context: SearchContext): TokenScore[] { + if (!(prop in index.tokenOccurrencies)) { + return []; + } + + // Exact fields for TF-IDF + const avgFieldLength = index.avgFieldLength[prop]; + const fieldLengths = index.fieldLengths[prop]; + const lyraOccurrencies = index.tokenOccurrencies[prop]; + const lyraFrequencies = index.frequencies[prop]; + + // Performa the search + const rootNode = index.indexes[prop] as RadixNode; + const { exact, tolerance } = context.params; + const searchResult = radixFind(rootNode, { term, exact, tolerance }); + + const ids = new Set(); + + for (const key in searchResult) { + for (const id of searchResult[key]) { + ids.add(id); + } + } + + const documentIDs = Array.from(ids); + + // lyraOccurrencies[term] can be undefined, 0, string, or { [k: string]: number } + const termOccurrencies = typeof lyraOccurrencies[term] === "number" ? lyraOccurrencies[term] ?? 0 : 0; + + const scoreList: TokenScore[] = []; + + // Calculate TF-IDF value for each term, in each document, for each index. + const documentIDsLength = documentIDs.length; + for (let k = 0; k < documentIDsLength; k++) { + const id = documentIDs[k]; + const tf = lyraFrequencies?.[id]?.[term] ?? 0; + + const bm25 = BM25( + tf, + termOccurrencies, + context.docsCount, + fieldLengths[id]!, + avgFieldLength, + context.params.relevance! as Required, + ); + + scoreList.push([id, bm25]); + } + + return scoreList; +} + +function searchByWhereClause(index: Index, filters: Record): string[] { + const filterKeys = Object.keys(filters); + + const filtersMap: Record = filterKeys.reduce( + (acc, key) => ({ + [key]: [], + ...acc, + }), + {}, + ); + + for (const param of filterKeys) { + const operation = filters[param]; + + if (typeof operation === "boolean") { + const idx = index.indexes[param] as BooleanIndex; + const filteredIDs = idx[operation.toString() as keyof BooleanIndex]; + filtersMap[param].push(...filteredIDs); + continue; + } + + const operationKeys = Object.keys(operation); + + if (operationKeys.length > 1) { + throw createError("INVALID_FILTER_OPERATION", operationKeys.length); + } + + const operationOpt = operationKeys[0] as ComparisonOperator; + const operationValue = operation[operationOpt as unknown as keyof ComparisonOperator]; + + const AVLNode = index.indexes[param] as AVLNode; + + switch (operationOpt) { + case "gt": { + const filteredIDs = avlGreaterThan(AVLNode, operationValue, false); + filtersMap[param].push(...filteredIDs); + break; + } + case "gte": { + const filteredIDs = avlGreaterThan(AVLNode, operationValue, true); + filtersMap[param].push(...filteredIDs); + break; + } + case "lt": { + const filteredIDs = avlLessThan(AVLNode, operationValue, false); + filtersMap[param].push(...filteredIDs); + break; + } + case "lte": { + const filteredIDs = avlLessThan(AVLNode, operationValue, true); + filtersMap[param].push(...filteredIDs); + break; + } + case "eq": { + const filteredIDs = avlFind(AVLNode, operationValue) ?? []; + filtersMap[param].push(...filteredIDs); + break; + } + case "between": { + const [min, max] = operationValue as number[]; + const filteredIDs = avlRangeSearch(AVLNode, min, max); + filtersMap[param].push(...filteredIDs); + } + } + } + + // AND operation: calculate the intersection between all the IDs in filterMap + const result = intersect(Object.values(filtersMap)) as unknown as string[]; + + return result; +} + +function getSearchableProperties(index: Index): string[] { + return index.searchableProperties; +} + +function getSearchablePropertiesWithTypes(index: Index): Record { + return index.searchablePropertiesWithTypes; +} + +function load(raw: unknown): Index { + const { + indexes, + searchableProperties, + searchablePropertiesWithTypes, + frequencies, + tokenOccurrencies, + avgFieldLength, + fieldLengths, + } = raw as Index; + + return { + indexes, + searchableProperties, + searchablePropertiesWithTypes, + frequencies, + tokenOccurrencies, + avgFieldLength, + fieldLengths, + }; +} + +function save(index: Index): unknown { + const { + indexes, + searchableProperties, + searchablePropertiesWithTypes, + frequencies, + tokenOccurrencies, + avgFieldLength, + fieldLengths, + } = index; + + return { + indexes, + searchableProperties, + searchablePropertiesWithTypes, + frequencies, + tokenOccurrencies, + avgFieldLength, + fieldLengths, + } as unknown; +} + +export function createIndex(): DefaultIndex { + return { + create, + insert, + remove, + search, + searchByWhereClause, + getSearchableProperties, + getSearchablePropertiesWithTypes, + load, + save, + }; +} diff --git a/src/levenshtein.ts b/src/components/levenshtein.ts similarity index 100% rename from src/levenshtein.ts rename to src/components/levenshtein.ts index 81751151e..78d669410 100644 --- a/src/levenshtein.ts +++ b/src/components/levenshtein.ts @@ -3,20 +3,6 @@ export type BoundedMetric = { distance: number; }; -/** - * Computes the Levenshtein distance between two strings (a, b), returning early with -1 if the distance - * is greater than the given tolerance. - * It assumes that: - * - tolerance >= ||a| - |b|| >= 0 - */ -export function boundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric { - const distance = _boundedLevenshtein(a, b, tolerance); - return { - distance, - isBounded: distance >= 0, - }; -} - /** * Inspired by: * https://github.com/Yomguithereal/talisman/blob/86ae55cbd040ff021d05e282e0e6c71f2dde21f8/src/metrics/levenshtein.js#L218-L340 @@ -136,6 +122,20 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number { return current <= tolerance ? current : -1; } +/** + * Computes the Levenshtein distance between two strings (a, b), returning early with -1 if the distance + * is greater than the given tolerance. + * It assumes that: + * - tolerance >= ||a| - |b|| >= 0 + */ +export function boundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric { + const distance = _boundedLevenshtein(a, b, tolerance); + return { + distance, + isBounded: distance >= 0, + }; +} + export function levenshtein(a: string, b: string): number { /* c8 ignore next 3 */ if (!a.length) { diff --git a/src/components/sync-blocking-checker.ts b/src/components/sync-blocking-checker.ts new file mode 100644 index 000000000..9d4bb70ad --- /dev/null +++ b/src/components/sync-blocking-checker.ts @@ -0,0 +1,54 @@ +import { kInsertions, kRemovals, Lyra, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; + +// Web platforms don't have process. React-Native doesn't have process.emitWarning. +const warn = + globalThis.process?.emitWarning ?? + function emitWarning(message: string, options: { code: string }) { + console.warn(`[WARNING] [${options.code}] ${message}`); + }; + +export function trackInsertion( + lyra: Lyra, +): void { + if (typeof lyra[kInsertions] !== "number") { + queueMicrotask(() => { + lyra[kInsertions] = undefined; + }); + + lyra[kInsertions] = 0; + } + + if (lyra[kInsertions]! > 1000) { + warn( + "Lyra's insert operation is synchronous. Please avoid inserting a large number of document in a single operation in order not to block the main thread or, in alternative, please use insertMultiple.", + { code: "LYRA0001" }, + ); + + lyra[kInsertions] = -1; + } else if (lyra[kInsertions] >= 0) { + lyra[kInsertions]++; + } +} + +export function trackRemoval( + lyra: Lyra, +): void { + if (typeof lyra[kRemovals] !== "number") { + queueMicrotask(() => { + lyra[kRemovals] = undefined; + }); + + lyra[kRemovals] = 0; + } + + if (lyra[kRemovals]! > 1000) { + warn( + "Lyra's remove operation is synchronous. Please avoid removing a large number of document in a single operation in order not to block the main thread, in alternative, please use updateMultiple.", + { code: "LYRA0002" }, + ); + + lyra[kRemovals] = -1; + } else if (lyra[kRemovals] >= 0) { + lyra[kRemovals]++; + } +} diff --git a/src/errors.ts b/src/errors.ts index 30f8d4f6b..72e3ec741 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -1,85 +1,91 @@ import { SUPPORTED_LANGUAGES } from "./tokenizer/languages.js"; - -function formatJSON(input: object) { - return JSON.stringify(input, null, 2); -} - -export function INVALID_SCHEMA_TYPE(type: string): string { - return `Invalid schema type. Expected string or object, but got ${type}`; -} - -export function INVALID_DOC_SCHEMA(expected: object, found: object): string { - return `Invalid document structure. \nLyra has been initialized with the following schema: \n\n${formatJSON( - expected, - )}\n\nbut found the following doc:\n\n${formatJSON(found)}`; -} - -export function INVALID_PROPERTY(name: string, expected: string[]): string { - return `Invalid property name. Expected a wildcard string ("*") or array containing one of the following properties: ${expected.join( - ", ", - )}, but got: ${name}`; -} - -export function CANT_DELETE_DOC_NOT_FOUND(id: string): string { - return `Document with ID ${id} does not exist.`; -} - -export function CANT_DELETE_DOCUMENT(docID: string, key: string, token: string): string { - return `Unable to delete document "${docID}" from index "${key}" on word "${token}".`; -} - -export function UNSUPPORTED_NESTED_PROPERTIES(): string { - return `Nested properties are not supported in this Lyra version, but will be in the future.`; -} - -export function DOC_ID_DOES_NOT_EXISTS(id: string): string { - return `Document with ID ${id} does not exists`; -} - -export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string { - return `${method} works on edge only. Use edge: true in Lyra constructor to enable it.`; -} - -export function INVALID_HOOKS_OBJECT(): string { - return "Invalid hooks object"; +import { sprintf } from "./utils.js"; + +export type ErrorCode = + | "NO_DEFAULT_LANGUAGE_WITH_CUSTOM_TOKENIZER" + | "LANGUAGE_NOT_SUPPORTED" + | "INVALID_STEMMER_FUNCTION_TYPE" + | "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY" + | "UNSUPPORTED_COMPONENT" + | "COMPONENT_MUST_BE_FUNCTION" + | "COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS" + | "INVALID_SCHEMA_TYPE" + | "TYPE_ERROR_ID_MUST_BE_STRING" + | "DOCUMENT_ID_MUST_BE_STRING" + | "DOCUMENT_ALREADY_EXISTS" + | "DOCUMENT_DOES_NOT_EXIST" + | "MISSING_DOCUMENT_PROPERTY" + | "INVALID_DOCUMENT_PROPERTY" + | "INVALID_BOOST_VALUE" + | "UNKNOWN_INDEX" + | "INVALID_FILTER_OPERATION"; + +export interface LyraError extends Error { + code: string; +} + +export function createError(code: ErrorCode, ...args: Array): LyraError { + let message = ""; + + switch (code) { + case "NO_DEFAULT_LANGUAGE_WITH_CUSTOM_TOKENIZER": + message = "Do not pass the defaultLanguage option to create when using a custom tokenizer."; + break; + case "LANGUAGE_NOT_SUPPORTED": + message = `Language "%s" is not supported.\nSupported languages are:\n - ${SUPPORTED_LANGUAGES.join("\n - ")}`; + break; + case "INVALID_STEMMER_FUNCTION_TYPE": + message = `config.stemmer property must be a function.`; + break; + case "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY": + message = "Custom stop words array must only contain strings."; + break; + case "UNSUPPORTED_COMPONENT": + message = `Unsupported component "%s".`; + break; + case "COMPONENT_MUST_BE_FUNCTION": + message = `The component "%s" must be a function.`; + break; + case "COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS": + message = `The component "%s" must be a function or an array of functions.`; + break; + case "INVALID_SCHEMA_TYPE": + message = `Unsupported schema type "%s". Expected "string", "boolean" or "number".`; + break; + case "DOCUMENT_ID_MUST_BE_STRING": + message = `Document id must be of type "string". Got "%s" instead.`; + break; + case "DOCUMENT_ALREADY_EXISTS": + message = `A document with id "%s" already exists.`; + break; + case "DOCUMENT_DOES_NOT_EXIST": + message = `A document with id "%s" does not exists.`; + break; + case "MISSING_DOCUMENT_PROPERTY": + message = `Missing searchable property "%s".`; + break; + case "INVALID_DOCUMENT_PROPERTY": + message = `Invalid document property "%s": expected "%s", got "%s"`; + break; + case "UNKNOWN_INDEX": + message = `Invalid property name "%s". Expected a wildcard string ("*") or array containing one of the following properties: %s`; + break; + case "INVALID_BOOST_VALUE": + message = `Boost value must be a number greater than, or less than 0.`; + break; + case "INVALID_FILTER_OPERATION": + message = `You can only use one operation per filter, you requested %d.`; + break; + default: + message = `Unsupported Lyra Error code: ${code}`; + break; + } + + const error = new Error(sprintf(message, ...args)) as LyraError; + error.code = code; + if ("captureStackTrace" in Error.prototype) { + Error.captureStackTrace(error); + } + + return error; } - -export function NON_SUPPORTED_HOOKS(invalidHooks: string[]): string { - return `The following hooks aren't supported. Hooks: ${invalidHooks}`; -} - -export function TYPE_ERROR_ID_MUST_BE_STRING(type: string): string { - return `"id" must be of type "string". Got "${type}" instead.`; -} - -export function ID_ALREADY_EXISTS(id: string): string { - return `Document with ID "${id}" already exists.`; -} - -export function LANGUAGE_NOT_SUPPORTED(lang: string): string { - return `Language "${lang}" is not supported.\nSupported languages are:\n - ${SUPPORTED_LANGUAGES.join("\n - ")}`; -} - -export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string { - return `Custom stop words array must only contain strings.`; -} - -export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string { - return `Custom stop words must be a function or an array of strings.`; -} - -export function INVALID_STEMMER_FUNCTION_TYPE(): string { - return `tokenizer.stemmingFn property must be a function.`; -} - -export function INVALID_TOKENIZER_FUNCTION(): string { - return `tokenizer.tokenizerFn must be a function.`; -} - -export function INVALID_BOOST_VALUE(): string { - return `Boost value must be a number greater than, or less than 0.`; -} - -export function INVALID_FILTER_OPERATION(found: string[]): string { - return `You can only use one operation per filter. Found ${found.length}: ${found.join(", ")}`; -} \ No newline at end of file diff --git a/src/facets.ts b/src/facets.ts deleted file mode 100644 index bfc58db88..000000000 --- a/src/facets.ts +++ /dev/null @@ -1,117 +0,0 @@ -import type { FacetSorting, FacetsSearch, PropertiesSchema, ResolveSchema, TokenScore } from "./types/index.js"; -import { getNested } from './utils.js'; - -export type FacetReturningValue = { - [key: string]: { - count: number; - values: { - [key: string]: number; - } - } -} - -export function getFacets(schema: PropertiesSchema, docs: Record | undefined>, results: TokenScore[], facetsConfig: FacetsSearch): FacetReturningValue { - const facets: FacetReturningValue = {}; - const allIDs = results.map(([id]) => id); - const allDocs = allIDs.map((id) => docs[id]); - const facetKeys = Object.keys(facetsConfig); - - for (const facet of facetKeys) { - const facetType = getFacetType(schema, facet); - let values = {}; - - // Hack to guarantee the same order of ranges as specified by the user - if (facetType === "number") { - const { ranges } = (facetsConfig as any)[facet]; - const tmp = []; - for (const range of ranges) { - tmp.push([`${range.from}-${range.to}`, 0]); - } - values = Object.fromEntries(tmp as any); - } - - facets[facet] = { - count: 0, - values, - }; - } - - const allDocsLength = allDocs.length; - for (let i = 0; i < allDocsLength; i++) { - const doc = allDocs[i]; - - for (const facet of facetKeys) { - const facetValue = facet.includes('.') - ? getNested(doc!, facet)! - : doc![facet] as number | boolean; - - // String based facets - if (typeof facetValue === "string") { - if (facets[facet].values[facetValue] === undefined) { - facets[facet].values[facetValue] = 1; - } else { - facets[facet].values[facetValue]++; - } - - // Boolean facets - } else if (typeof facetValue === "boolean") { - if (facets[facet].values[facetValue.toString()] === undefined) { - facets[facet].values[facetValue.toString()] = 1; - } else { - facets[facet].values[facetValue.toString()]++; - } - } - - // Range facets based on numbers - else if (typeof facetValue === "number") { - for (const range of (facetsConfig as any)[facet].ranges) { - if (facetValue >= range.from && facetValue <= range.to) { - if (facets[facet].values[`${range.from}-${range.to}`] === undefined) { - facets[facet].values[`${range.from}-${range.to}`] = 1; - } else { - facets[facet].values[`${range.from}-${range.to}`]++; - } - } - } - } - } - } - - for (const facet of facetKeys) { - const facetType = getFacetType(schema, facet); - - // Count the number of values for each facet - facets[facet].count = Object.keys(facets[facet].values).length; - - // Sort only string-based facets - if (facetType === "string") { - facets[facet].values = Object.fromEntries( - Object.entries(facets[facet].values) - .sort((a, b) => sortingPredicate((facetsConfig as any)[facet].sort, a, b)) - .slice((facetsConfig as any)[facet].offset ?? 0, (facetsConfig as any)[facet].limit ?? 10), - ) - } - } - - return facets; -} - -const facetTypeCache = new Map(); - -function getFacetType(schema: PropertiesSchema, facet: string) { - if (facetTypeCache.has(facet)) { - return facetTypeCache.get(facet)!; - } - - const facetType = getNested(schema, facet)!; - facetTypeCache.set(facet, facetType); - return facetType; -} - -function sortingPredicate(order: FacetSorting = "desc", a: [string, number], b: [string, number]) { - if (order.toLowerCase() === "asc") { - return a[1] - b[1]; - } else { - return b[1] - a[1]; - } -} \ No newline at end of file diff --git a/src/filters.ts b/src/filters.ts deleted file mode 100644 index 8edf58dfe..000000000 --- a/src/filters.ts +++ /dev/null @@ -1,103 +0,0 @@ -import type { WhereFilter, FilterOperation, PropertiesSchema, Lyra, BooleanIndex } from "./types/index.js"; -import type { AVLNode } from "./trees/avl/node.js"; -import { greaterThan, lessThan, rangeSearch, find } from "./trees/avl/index.js"; -import { intersect } from './utils.js' -import * as ERRORS from "./errors.js"; - -export function getWhereFiltersIDs(filters: WhereFilter, lyra: Lyra): string[] { - const filterKeys = Object.keys(filters); - - const filtersMap: Record = filterKeys.reduce((acc, key) => ({ - [key]: [], - ...acc, - }), {}); - - for (const param of filterKeys) { - const operation = filters[param as keyof WhereFilter] - const operationKeys = Object.keys(operation as unknown as FilterOperation[]) - - if (operationKeys.length > 1) { - throw new Error(ERRORS.INVALID_FILTER_OPERATION(operationKeys)) - } - - if (typeof operation === 'boolean') { - const idx = lyra.index[param] as BooleanIndex; - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = idx[operation.toString() as keyof BooleanIndex]; - filtersMap[param].push(...filteredIDs); - } - - const operationOpt = operationKeys[0] as FilterOperation - const operationValue = operation[operationOpt as keyof typeof operation]; - - const AVLNode = lyra.index[param] as AVLNode; - - switch (operationOpt) { - case "gt": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = greaterThan(AVLNode, operationValue, false); - filtersMap[param].push(...filteredIDs); - break; - } - case "gte": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = greaterThan(AVLNode, operationValue, true); - filtersMap[param].push(...filteredIDs); - break; - } - case "lt": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = lessThan(AVLNode, operationValue, false); - filtersMap[param].push(...filteredIDs); - break; - } - case "lte": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = lessThan(AVLNode, operationValue, true); - filtersMap[param].push(...filteredIDs); - break; - } - case "eq": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = find(AVLNode, operationValue) ?? []; - filtersMap[param].push(...filteredIDs); - break; - } - case "between": { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - this is a bug in the typescript compiler - const filteredIDs = rangeSearch(AVLNode, operationValue[0], operationValue[1]); - filtersMap[param].push(...filteredIDs); - } - } - } - - // AND operation: calculate the intersection between all the IDs in filterMap - const result = intersect(Object.values(filtersMap)) as unknown as string[]; - - return result; -} - -export function intersectFilteredIDs(filtered: string[], lookedUp: [string, number][]): [string, number][] { - const map = new Map(); - const result: [string, number][] = []; - - for (const id of filtered) { - map.set(id, true); - } - - for (const [id, score] of lookedUp) { - if (map.has(id)) { - result.push([id, score]); - map.delete(id); - } - } - - return result; -} \ No newline at end of file diff --git a/src/index.ts b/src/index.ts index 86463a620..ebc431223 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,13 +1,11 @@ export { create } from "./methods/create.js"; -export { insert, insertBatch, insertWithHooks } from "./methods/insert.js"; -export { load } from "./methods/load.js"; -export { remove } from "./methods/remove.js"; -export { save } from "./methods/save.js"; +export { insert, insertMultiple } from "./methods/insert.js"; +export { load, save } from "./methods/serialization.js"; +export { remove, removeMultiple } from "./methods/remove.js"; +export { update, updateMultiple } from "./methods/update.js"; export { search } from "./methods/search.js"; export { getByID, count } from "./methods/docs.js"; -export * from "./types/index.js"; +export * from "./types.js"; export type { Language } from "./tokenizer/languages.js"; -export type { InsertConfig, InsertBatchConfig } from "./methods/insert.js"; -export type { RetrievedDoc, SearchParams, SearchResult } from "./methods/search.js"; -export type { Stemmer, TokenizerConfig, Tokenizer } from "./tokenizer/index.js"; +export type { Stemmer, TokenizerConfig } from "./tokenizer/index.js"; diff --git a/src/insertion-checker.ts b/src/insertion-checker.ts deleted file mode 100644 index f92db569b..000000000 --- a/src/insertion-checker.ts +++ /dev/null @@ -1,31 +0,0 @@ -const kInsertions = Symbol("lyra.insertions"); - -// Web platforms don't have process. React-Native doesn't have process.emitWarning. -const warn = - globalThis.process?.emitWarning ?? - function emitWarning(message: string, options: { code: string }) { - console.warn(`[WARNING] [${options.code}] ${message}`); - }; - -export function trackInsertion(_lyra: unknown) { - const lyra = _lyra as object & { [kInsertions]?: number }; - - if (typeof lyra[kInsertions] !== "number") { - queueMicrotask(() => { - lyra[kInsertions] = undefined; - }); - - lyra[kInsertions] = 0; - } - - if (lyra[kInsertions]! > 1000) { - warn( - "Lyra's insert operation is synchronous. Please avoid inserting a large number of document in a single operation in order not to block the main thread.", - { code: "LYRA0001" }, - ); - - lyra[kInsertions] = -1; - } else if (lyra[kInsertions] >= 0) { - lyra[kInsertions]++; - } -} diff --git a/src/internals.ts b/src/internals.ts index 4de224af4..555e06c1a 100644 --- a/src/internals.ts +++ b/src/internals.ts @@ -1,3 +1,3 @@ -export { boundedLevenshtein } from "./levenshtein.js"; -export { formatNanoseconds, getNanosecondsTime } from "./utils.js"; -export { tokenize, normalizationCache, defaultTokenizerConfig } from "./tokenizer/index.js"; +export { boundedLevenshtein } from "./components/levenshtein.js"; +export { sprintf, formatBytes, formatNanoseconds, getNanosecondsTime, uniqueId } from "./utils.js"; +export { normalizationCache, createTokenizer } from "./tokenizer/index.js"; diff --git a/src/methods/common.ts b/src/methods/common.ts deleted file mode 100644 index 848630566..000000000 --- a/src/methods/common.ts +++ /dev/null @@ -1,56 +0,0 @@ -import * as ERRORS from "../errors.js"; -import type { Lyra, PropertiesSchema, ResolveSchema } from "../types/index.js"; -import type { SearchParams } from "./search.js"; - -export function assertDocSchema(doc: ResolveSchema, lyraSchema: PropertiesSchema) { - if (!recursiveCheckDocSchema(doc, lyraSchema)) { - throw new Error(ERRORS.INVALID_DOC_SCHEMA(lyraSchema, doc)); - } -} - -export function recursiveCheckDocSchema( - newDoc: ResolveSchema, - schema: PropertiesSchema, -): boolean { - for (const key in newDoc) { - if (!(key in schema)) { - continue; - } - - const propType = typeof newDoc[key]; - - if (propType === "object") { - recursiveCheckDocSchema(newDoc[key] as ResolveSchema, schema); - } else if (typeof newDoc[key] !== schema[key]) { - return false; - } - } - - return true; -} - -export function getIndices( - lyra: Lyra, - indices: SearchParams["properties"], -): string[] { - const knownIndices = Object.keys(lyra.index); - - if (!indices) { - return knownIndices; - } - - if (typeof indices === "string") { - if (indices !== "*") { - throw new Error(ERRORS.INVALID_PROPERTY(indices, knownIndices)); - } - return knownIndices; - } - - for (const index of indices as string[]) { - if (!knownIndices.includes(index)) { - throw new Error(ERRORS.INVALID_PROPERTY(index, knownIndices)); - } - } - - return indices as string[]; -} diff --git a/src/methods/create.ts b/src/methods/create.ts index bd42874e3..30036a301 100644 --- a/src/methods/create.ts +++ b/src/methods/create.ts @@ -1,86 +1,146 @@ -import type { Configuration, Lyra, PropertiesSchema } from "../types/index.js"; -import { defaultTokenizerConfig, Language } from "../tokenizer/index.js"; -import * as ERRORS from "../errors.js"; -import { create as createNode } from "../trees/radix/node.js"; -import { create as createAVLNode } from "../trees/avl/index.js"; -import { validateHooks } from "./hooks.js"; -import { intersectTokenScores } from "../algorithms.js"; - -/** - * Creates a new database. - * @param properties Options to initialize the database with. - * @example - * // Create a database that stores documents containing 'author' and 'quote' fields. - * const db = await create({ - * schema: { - * author: 'string', - * quote: 'string' - * }, - * hooks: { - * afterInsert: [afterInsertHook], - * } - * }); - */ -export async function create(properties: Configuration): Promise> { - const defaultLanguage = (properties?.defaultLanguage?.toLowerCase() as Language) ?? "english"; - - const tokenizer = defaultTokenizerConfig(defaultLanguage, properties.components?.tokenizer ?? {}); - tokenizer.assertSupportedLanguage(defaultLanguage); - - validateHooks(properties.hooks); - - const instance: Lyra = { - defaultLanguage, - schema: properties.schema, - docs: {}, - docsCount: 0, - index: {}, - hooks: properties.hooks || {}, - edge: properties.edge ?? false, - frequencies: {}, - tokenOccurrencies: {}, - avgFieldLength: {}, - fieldLengths: {}, - components: { - elapsed: properties.components?.elapsed ?? {}, - tokenizer, - algorithms: { - intersectTokenScores: properties.components?.algorithms?.intersectTokenScores ?? intersectTokenScores, - }, - }, - }; +import { getDefaultComponents } from "../components/defaults.js"; +import { createError } from "../errors.js"; +import { COMPLEX_COMPONENTS, SIMPLE_COMPONENTS, SIMPLE_OR_ARRAY_COMPONENTS } from "../components/hooks.js"; +import { createIndex } from "../components/index.js"; +import { createTokenizer } from "../tokenizer/index.js"; +import { + ArrayCallbackComponents, + Components, + IDocumentsStore, + IIndex, + Lyra, + OpaqueDocumentStore, + OpaqueIndex, + Schema, + SimpleComponents, + SimpleOrArrayCallbackComponents, +} from "../types.js"; +import { createDocumentsStore } from "../components/documents-store.js"; - buildIndex(instance, properties.schema); - return instance; +interface CreateArguments { + schema: Schema; + defaultLanguage?: string; + components?: Components; } -function buildIndex(lyra: Lyra, schema: S, prefix = "") { - for (const prop of Object.keys(schema)) { - const propType = typeof prop; - const isNested = typeof schema[prop] === "object"; - - if (propType !== "string") throw new Error(ERRORS.INVALID_SCHEMA_TYPE(propType)); +function validateComponents( + components: Components, +) { + const defaultComponents = getDefaultComponents(); - const propName = `${prefix}${prop}`; + for (const rawKey of SIMPLE_COMPONENTS) { + const key = rawKey as keyof SimpleComponents; - if (isNested) { - buildIndex(lyra, schema[prop] as S, `${propName}.`); - } else { - if (schema[prop] === "string") { - lyra.index[propName] = createNode(); - lyra.avgFieldLength[propName] = 0; - continue; - } - - if (schema[prop] === "number") { - lyra.index[propName] = createAVLNode(0, []); - continue; + if (components[key]) { + if (typeof components[key] !== "function") { + throw createError("COMPONENT_MUST_BE_FUNCTION", key); } + } else { + // @ts-expect-error TSC is unable to resolve this + components[key] = defaultComponents[key]; + } + } + + for (const rawKey of SIMPLE_OR_ARRAY_COMPONENTS) { + const key = rawKey as keyof ArrayCallbackComponents; - if (schema[prop] === "boolean") { - lyra.index[propName] = { 'true': [], 'false': [] }; - continue; + if (!components[key]) { + components[key] = []; + } else if (!Array.isArray(components[key])) { + // @ts-expect-error TSC is unable to resolve this + components[key] = [components[key]]; + } + + for (const fn of components[key] as unknown as SimpleOrArrayCallbackComponents[]) { + if (typeof fn !== "function") { + throw createError("COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS", key); } } } + + for (const rawKey of Object.keys(components)) { + if ( + !COMPLEX_COMPONENTS.includes(rawKey) && + !SIMPLE_COMPONENTS.includes(rawKey) && + !SIMPLE_OR_ARRAY_COMPONENTS.includes(rawKey) + ) { + throw createError("UNSUPPORTED_COMPONENT", rawKey); + } + } +} + +export async function create({ + schema, + defaultLanguage, + components, +}: CreateArguments): Promise> { + if (!components) { + components = {}; + } + + let tokenizer = components.tokenizer; + let index = components.index; + let documentsStore = components.documentsStore; + + if (!tokenizer) { + // Use the default tokenizer + tokenizer = await createTokenizer(defaultLanguage ?? "english"); + } else if (defaultLanguage) { + // Accept defaultLanguage only if a tokenizer is not provided + throw createError("NO_DEFAULT_LANGUAGE_WITH_CUSTOM_TOKENIZER"); + } + + if (!index) { + index = createIndex() as unknown as IIndex; + } + + if (!documentsStore) { + documentsStore = createDocumentsStore() as unknown as IDocumentsStore; + } + + // Validate all other components + validateComponents(components); + + // Assign only recognized components and hooks + const { + getDocumentProperties, + getDocumentIndexId, + validateSchema, + beforeInsert, + afterInsert, + beforeRemove, + afterRemove, + beforeMultipleInsert, + afterMultipleInsert, + beforeMultipleRemove, + afterMultipleRemove, + formatElapsedTime, + } = components; + + const lyra = { + data: {}, + schema, + tokenizer, + index, + documentsStore, + getDocumentProperties, + getDocumentIndexId, + validateSchema, + beforeInsert, + afterInsert, + beforeRemove, + afterRemove, + beforeMultipleInsert, + afterMultipleInsert, + beforeMultipleRemove, + afterMultipleRemove, + formatElapsedTime, + } as Lyra; + + lyra.data = { + index: await lyra.index.create(lyra, schema), + docs: await lyra.documentsStore.create(lyra), + }; + + return lyra; } diff --git a/src/methods/docs.ts b/src/methods/docs.ts index 8234b62d5..239118455 100644 --- a/src/methods/docs.ts +++ b/src/methods/docs.ts @@ -1,33 +1,14 @@ -import type { PropertiesSchema, Lyra, ResolveSchema } from "../types/index.js"; +import { Document, Lyra, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; -/** - * Gets a document from a Lyra database by its ID. - * @template S - The schema type for the Lyra database. - * @param {Lyra} db - The Lyra database to get the document from. - * @param {string} id - The ID of the document to get. - * @returns {Promise | undefined>} - The document with the given ID, or undefined if it doesn't exist. - * @example - * - * import { getByID } from '@lyrasearch/lyra'; - * - * const doc = await getByID(db, 'doc1'); // { id: 'doc1', title: 'Hello World' } - * const doc = await getByID(db, 'doc4'); // undefined - */ -export async function getByID(db: Lyra, id: string): Promise | undefined> { - return db.docs[id]; +export function getByID( + db: Lyra, + id: string, +): Promise { + return db.documentsStore.get(db.data.docs, id) as Promise; } -/** - * Counts the number of documents in a Lyra database. - * @template S - The schema type for the Lyra database. - * @param {Lyra} db - The Lyra database to count documents in. - * @returns {Promise} - The number of documents in the Lyra database. - * @example - * - * import { count } from '@lyrasearch/lyra'; - * - * const numDocs = await count(db); // 3 -*/ -export async function count(db: Lyra): Promise { - return Object.keys(db.docs).length; +export function count( + db: Lyra, +): Promise { + return db.documentsStore.count(db.data.docs) as Promise; } diff --git a/src/methods/hooks.ts b/src/methods/hooks.ts deleted file mode 100644 index a05f38931..000000000 --- a/src/methods/hooks.ts +++ /dev/null @@ -1,37 +0,0 @@ -import * as ERRORS from "../errors.js"; -import type { Lyra, PropertiesSchema } from "../types/index.js"; - -export interface AfterInsertHook { - (this: Lyra, id: string): Promise | void; -} - -export type Hooks = { - afterInsert?: AfterInsertHook | AfterInsertHook[]; -}; - -const SUPPORTED_HOOKS = ["afterInsert"]; - -export function validateHooks(hooks?: Hooks): void | never { - if (hooks) { - if (typeof hooks !== "object") { - throw new Error(ERRORS.INVALID_HOOKS_OBJECT()); - } - - const invalidHooks = Object.keys(hooks).filter(hook => !SUPPORTED_HOOKS.includes(hook)); - if (invalidHooks.length) { - throw new Error(ERRORS.NON_SUPPORTED_HOOKS(invalidHooks)); - } - } -} - -export async function hookRunner( - this: Lyra, - // eslint-disable-next-line @typescript-eslint/ban-types - funcs: Function | Function[], - ...args: unknown[] -): Promise { - const hooks = Array.isArray(funcs) ? funcs : [funcs]; - for (let i = 0; i < hooks.length; i++) { - await hooks[i].apply(this, args); - } -} diff --git a/src/methods/insert.ts b/src/methods/insert.ts index 28e0c484f..cdccfdb00 100644 --- a/src/methods/insert.ts +++ b/src/methods/insert.ts @@ -1,256 +1,116 @@ -import type { BooleanIndex, Lyra, PropertiesSchema, ResolveSchema } from "../types/index.js"; -import type { Language, TokenizerConfigExec } from "../tokenizer/index.js"; -import type { AVLNode } from "../../src/trees/avl/node.js"; -import type { RadixNode } from "../trees/radix/node.js"; -import { trackInsertion } from "../insertion-checker.js"; -import { insert as radixInsert } from "../trees/radix/index.js"; -import { insert as AVLInsert } from "../trees/avl/index.js"; -import { uniqueId } from "../utils.js"; -import { assertDocSchema } from "./common.js"; -import { hookRunner } from "./hooks.js"; -import * as ERRORS from "../errors.js"; +import { runMultipleHook, runSingleHook } from "../components/hooks.js"; +import { createError } from "../errors.js"; +import { trackInsertion } from "../components/sync-blocking-checker.js"; +import { Document, Schema, OpaqueIndex, OpaqueDocumentStore, Lyra } from "../types.js"; + +export async function insert( + lyra: Lyra, + doc: Document, + language?: string, + skipHooks?: boolean, +): Promise { + await lyra.validateSchema(doc, lyra.schema); + const { index, docs } = lyra.data; -export type InsertConfig = { - language?: Language; - id?: (doc: ResolveSchema) => string | Promise; -}; + const id = await lyra.getDocumentIndexId(doc); -export type InsertBatchConfig = InsertConfig & { - batchSize?: number; -}; + if (typeof id !== "string") { + throw createError("DOCUMENT_ID_MUST_BE_STRING", typeof id); + } -/** - * Inserts a document into a database. - * @param lyra The database to insert document into. - * @param doc The document to insert. - * @param config Optional parameter for overriding default configuration. - * @returns An object containing id of the inserted document. - * @example - * const { id } = await insert(db, { - * quote: 'You miss 100% of the shots you don\'t take', - * author: 'Wayne Gretzky - Michael Scott' - * }); - */ -export async function insert( - lyra: Lyra, - doc: ResolveSchema, - config?: InsertConfig, -): Promise<{ id: string }> { - config = { language: lyra.defaultLanguage, ...config }; + if (!(await lyra.documentsStore.store(docs, id, doc))) { + throw createError("DOCUMENT_ALREADY_EXISTS", id); + } - const id = await getDocumentID(doc, config); + const docsCount = await lyra.documentsStore.count(docs); - // If the ID already exists, we throw an error. - if (lyra.docs[id]) throw new Error(ERRORS.ID_ALREADY_EXISTS(id)); + if (!skipHooks) { + await runSingleHook(lyra.beforeInsert, lyra, id, doc); + } - lyra.components?.tokenizer?.assertSupportedLanguage?.(config.language!); + const indexableProperties = await lyra.index.getSearchableProperties(index); + const indexablePropertiesWithTypes = await lyra.index.getSearchablePropertiesWithTypes(index); + const values = await lyra.getDocumentProperties(doc, indexableProperties); - assertDocSchema(doc, lyra.schema); + for (const [key, value] of Object.entries(values)) { + if (typeof value === "undefined") { + continue; + } - lyra.docs[id] = doc; - lyra.docsCount++; - recursiveradixInsertion(lyra, doc, id, config, undefined, lyra.components?.tokenizer as TokenizerConfigExec); - trackInsertion(lyra); + const actualType = typeof value; + const expectedType = indexablePropertiesWithTypes[key]; - return { id }; -} + if (actualType !== expectedType) { + throw createError("INVALID_DOCUMENT_PROPERTY", key, expectedType, actualType); + } + } -/** - * Inserts a document into a database. - * @param lyra The database to insert document into. - * @param doc The document to insert. - * @param config Optional parameter for overriding default configuration. - * @returns A Promise object containing id of the inserted document. - * @example - * const { id } = await insert(db, { - * quote: 'You miss 100% of the shots you don\'t take', - * author: 'Wayne Gretzky - Michael Scott' - * }); - */ -export async function insertWithHooks( - lyra: Lyra, - doc: ResolveSchema, - config?: InsertConfig, -): Promise<{ id: string }> { - config = { language: lyra.defaultLanguage, ...config }; - const id = await getDocumentID(doc, config); + for (const prop of indexableProperties) { + const value = values[prop]; - lyra.components?.tokenizer?.assertSupportedLanguage?.(config.language!); + if (typeof value === "undefined") { + continue; + } - assertDocSchema(doc, lyra.schema); + await lyra.index.beforeInsert?.(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + await lyra.index.insert(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + await lyra.index.afterInsert?.(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + } - lyra.docs[id] = doc; - lyra.docsCount++; - recursiveradixInsertion(lyra, doc, id, config, undefined, lyra.components?.tokenizer as TokenizerConfigExec); - trackInsertion(lyra); - if (lyra.hooks.afterInsert) { - await hookRunner.call(lyra, lyra.hooks.afterInsert, id); + if (!skipHooks) { + await runSingleHook(lyra.afterInsert, lyra, id, doc); } - return { id }; + trackInsertion(lyra); + + return id; } -/** - * Inserts a large array of documents into a database without blocking the event loop. - * @param lyra The database to insert document into. - * @param docs Array of documents to insert. - * @param config Optional parameter for overriding default configuration. - * @returns Promise. - * @example - * insertBatch(db, [ - * { - * quote: 'You miss 100% of the shots you don\'t take', - * author: 'Wayne Gretzky - Michael Scott' - * }, - * { - * quote: 'What I cannot createm I do not understand', - * author: 'Richard Feynman' - * } - * ]); - */ -export async function insertBatch( - lyra: Lyra, - docs: ResolveSchema[], - config?: InsertBatchConfig, -): Promise { - const batchSize = config?.batchSize ?? 1000; +export async function insertMultiple( + lyra: Lyra, + docs: Document[], + batchSize?: number, + language?: string, + skipHooks?: boolean, +): Promise { + if (!batchSize) { + batchSize = 1000; + } + + if (!skipHooks) { + await runMultipleHook(lyra.beforeMultipleInsert, lyra, docs); + } - return new Promise((resolve, reject) => { + const ids: string[] = []; + + await new Promise((resolve, reject) => { let i = 0; - async function _insertBatch() { - const batch = docs.slice(i * batchSize, (i + 1) * batchSize); + async function _insertMultiple() { + const batch = docs.slice(i * batchSize!, (i + 1) * batchSize!); i++; if (!batch.length) { return resolve(); } - for (const line of batch) { + for (const doc of batch) { try { - await insertWithHooks(lyra, line, config); + const id = await insert(lyra, doc, language, skipHooks); + ids.push(id); } catch (err) { reject(err); } } - setTimeout(_insertBatch, 0); + setTimeout(_insertMultiple, 0); } - setTimeout(_insertBatch, 0); + setTimeout(_insertMultiple, 0); }); -} - -function recursiveradixInsertion( - lyra: Lyra, - doc: ResolveSchema, - id: string, - config: InsertConfig, - prefix = "", - tokenizerConfig: TokenizerConfigExec, - schema: PropertiesSchema = lyra.schema, -) { - config = { language: lyra.defaultLanguage, ...config }; - const { index, frequencies, tokenOccurrencies } = lyra; - - for (const key of Object.keys(doc)) { - const isNested = typeof doc[key] === "object"; - const isSchemaNested = typeof schema[key] == "object"; - const propName = `${prefix}${key}`; - if (isNested && key in schema && isSchemaNested) { - recursiveradixInsertion( - lyra, - doc[key] as ResolveSchema, - id, - config, - propName + ".", - tokenizerConfig, - schema[key] as PropertiesSchema, - ); - } - - if (typeof doc[key] === "number" && key in schema && !isSchemaNested) { - AVLInsert(lyra.index[propName] as AVLNode, doc[key] as number, [id]); - } - - if (typeof doc[key] === "boolean" && key in schema && !isSchemaNested) { - const docKey = doc[key].toString() as "true" | "false"; - (lyra.index[propName] as BooleanIndex)[docKey].push(id); - } - - if (typeof doc[key] === "string" && key in schema && !isSchemaNested) { - // Use propName here because if doc is a nested object - // We will get the wrong index - const requestedTrie = index[propName]; - const tokens = tokenizerConfig.tokenizerFn(doc[key] as string, config.language!, false, tokenizerConfig); - if (!(propName in frequencies)) { - frequencies[propName] = {}; - } - - if (!(propName in tokenOccurrencies)) { - tokenOccurrencies[propName] = {}; - } - - if (!(id in frequencies[propName])) { - frequencies[propName][id] = {}; - } - - if (!(propName in lyra.fieldLengths)) { - lyra.fieldLengths[propName] = {}; - } - - lyra.fieldLengths[propName][id] = tokens.length; - lyra.avgFieldLength[propName] = ((lyra.avgFieldLength[propName] ?? 0) * (lyra.docsCount - 1) + tokens.length) / lyra.docsCount; - - for (const token of tokens) { - let tokenFrequency = 0; - - for (const t of tokens) { - if (t === token) { - tokenFrequency++; - } - } - - const tf = tokenFrequency / tokens.length; - - frequencies[propName][id][token] = tf; - - if (!(token in tokenOccurrencies[propName])) { - tokenOccurrencies[propName][token] = 0; - } - - // increase a token counter that may not yet exist - tokenOccurrencies[propName][token] = (tokenOccurrencies[propName][token] ?? 0) + 1; - - radixInsert(requestedTrie as RadixNode, token, id); - } - } + if (!skipHooks) { + await runMultipleHook(lyra.afterMultipleInsert, lyra, docs); } -} - -async function getDocumentID( - doc: ResolveSchema, - config: InsertConfig, -): Promise { - let id: string; - // If the user passes a custom ID function, we use it to generate the ID. - // This has the maximum priority. - if (config?.id) { - id = await config.id(doc); - - // If the user passes an ID in the document, we use it. - } else if (doc.id && typeof doc.id === "string") { - id = doc.id; - - // If the user passes an ID in the document, but it's not a string, we throw a type error. - } else if (doc.id && typeof doc.id !== "string") { - throw new TypeError(ERRORS.TYPE_ERROR_ID_MUST_BE_STRING(typeof doc.id)); - - // If the user doesn't pass an ID, we generate one. - } else { - id = uniqueId(); - } - - return id; + return ids; } diff --git a/src/methods/load.ts b/src/methods/load.ts deleted file mode 100644 index 166c1f6c6..000000000 --- a/src/methods/load.ts +++ /dev/null @@ -1,21 +0,0 @@ -import * as ERRORS from "../errors.js"; -import type { Data, Lyra, PropertiesSchema } from "../types/index.js"; - -export async function load( - lyra: Lyra, - { index, docs, schema, frequencies, tokenOccurrencies, defaultLanguage, fieldLengths, avgFieldLength }: Data, -): Promise { - if (!lyra.edge) { - throw new Error(ERRORS.GETTER_SETTER_WORKS_ON_EDGE_ONLY("load")); - } - - lyra.index = index; - lyra.docs = docs; - lyra.docsCount = Object.keys(docs).length; - lyra.schema = schema; - lyra.frequencies = frequencies; - lyra.tokenOccurrencies = tokenOccurrencies; - lyra.defaultLanguage = defaultLanguage; - lyra.fieldLengths = fieldLengths; - lyra.avgFieldLength = avgFieldLength; -} diff --git a/src/methods/remove.ts b/src/methods/remove.ts index 299405229..509386254 100644 --- a/src/methods/remove.ts +++ b/src/methods/remove.ts @@ -1,107 +1,84 @@ -import type { RadixNode } from "../trees/radix/node.js"; -import type { Lyra, PropertiesSchema, ResolveSchema, BooleanIndex } from "../types/index.js"; -import { defaultTokenizerConfig } from "../tokenizer/index.js"; -import { removeDocumentByWord } from "../trees/radix/index.js"; -import { flattenObject, getNested } from "../utils.js"; -import { getNodeByKey as getAVLNodeByKey } from "../trees/avl/index.js"; -import * as ERRORS from "../errors.js"; -import { AVLNode } from "../trees/avl/node.js"; - -/** - * Removes a document from a database. - * @param lyra The database to remove the document from. - * @param docID The id of the document to remove. - * @example - * const isDeleted = await remove(db, 'L1tpqQxc0c2djrSN2a6TJ'); - */ -export async function remove(lyra: Lyra, docID: string): Promise { - if (!lyra.components?.tokenizer) { - lyra.components = { - ...(lyra.components ?? {}), - tokenizer: defaultTokenizerConfig(lyra.defaultLanguage), - }; +import { runMultipleHook, runSingleHook } from "../components/hooks.js"; +import { trackRemoval } from "../components/sync-blocking-checker.js"; +import { createError } from "../errors.js"; +import { Lyra, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; + +export async function remove( + lyra: Lyra, + id: string, + language?: string, + skipHooks?: boolean, +): Promise { + const { index, docs } = lyra.data; + + const doc = await lyra.documentsStore.get(docs, id); + if (!doc) { + throw createError("DOCUMENT_DOES_NOT_EXIST", id); } - if (!(docID in lyra.docs)) { - throw new Error(ERRORS.DOC_ID_DOES_NOT_EXISTS(docID)); - } + const docsCount = await lyra.documentsStore.count(docs); - const document = lyra.docs[docID] || ({} as Record>); - const documentKeys = Object.keys(document || {}); - - const documentKeysLength = documentKeys.length; - for (let i = 0; i < documentKeysLength; i++) { - const key = documentKeys[i]; - - const propertyType = lyra.schema[key]; - - if (propertyType === "string") { - const idx = lyra.index[key]; - const tokens: string[] = lyra.components.tokenizer!.tokenizerFn!( - document[key] as string, - lyra.defaultLanguage, - false, - lyra.components.tokenizer!, - )!; - - lyra.avgFieldLength[key] = (lyra.avgFieldLength[key] * lyra.docsCount - lyra.fieldLengths[key][docID]) / (lyra.docsCount - 1); - delete lyra.fieldLengths[key][docID]; - - const tokensLength = tokens.length; - for (let k = 0; k < tokensLength; k++) { - const token = tokens[k]; - delete lyra.frequencies[key][docID]; - lyra.tokenOccurrencies[key][token]--; - if (token && !removeDocumentByWord(idx as RadixNode, token, docID)) { - throw new Error(ERRORS.CANT_DELETE_DOCUMENT(docID, key, token)); - } - } - } + if (!skipHooks) { + await runSingleHook(lyra.beforeRemove, lyra, id); } - removeNumericValue(lyra, docID); - removeBooleanValue(lyra, docID); + const indexableProperties = await lyra.index.getSearchableProperties(index); + const values = await lyra.getDocumentProperties(doc, indexableProperties); + + for (const prop of indexableProperties) { + const value = values[prop]; + await lyra.index.beforeRemove?.(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + await lyra.index.remove(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + await lyra.index.afterRemove?.(lyra.data.index, prop, id, value, language, lyra.tokenizer, docsCount); + } - lyra.docs[docID] = undefined; - lyra.docsCount--; + if (!skipHooks) { + await runSingleHook(lyra.afterRemove, lyra, id); + } - return true; + trackRemoval(lyra); } -function removeNumericValue(lyra: Lyra, docID: string) { - const document = lyra.docs[docID] as Record>; - const flatDocument = flattenObject(document); - const documentNumericOnly = Object.keys(flatDocument).reduce((acc, key) => { - if (getNested(lyra.schema, key) === "number") { - acc[key] = (flatDocument as any)[key]; - } - return acc; - }, {} as Record); - - for (const [property, value] of Object.entries(documentNumericOnly)) { - const idx = lyra.index[property] as AVLNode; - const node = getAVLNodeByKey(idx, value); +export async function removeMultiple( + lyra: Lyra, + ids: string[], + batchSize?: number, + language?: string, + skipHooks?: boolean, +): Promise { + if (!batchSize) { + batchSize = 1000; + } - if (node) { - node.value = node.value.filter((id) => id !== docID); - } + if (!skipHooks) { + await runMultipleHook(lyra.beforeMultipleRemove, lyra, ids); } -} - -function removeBooleanValue(lyra: Lyra, docID: string) { - const document = lyra.docs[docID] as Record>; - const flatDocument = flattenObject(document); - const documentBooleanOnly = Object.keys(flatDocument).reduce((acc, key) => { - if (getNested(lyra.schema, key) === "boolean") { - acc[key] = (flatDocument as any)[key]; + + await new Promise((resolve, reject) => { + let i = 0; + async function _insertMultiple() { + const batch = ids.slice(i * batchSize!, (i + 1) * batchSize!); + i++; + + if (!batch.length) { + return resolve(); + } + + for (const doc of batch) { + try { + await remove(lyra, doc, language, skipHooks); + } catch (err) { + reject(err); + } + } + + setTimeout(_insertMultiple, 0); } - return acc; - }, {} as Record); - for (const [property] of Object.entries(documentBooleanOnly)) { - const idx = lyra.index[property] as BooleanIndex; + setTimeout(_insertMultiple, 0); + }); - idx.true.slice(idx.true.indexOf(docID), 1); - idx.false.slice(idx.false.indexOf(docID), 1); + if (!skipHooks) { + await runMultipleHook(lyra.afterMultipleRemove, lyra, ids); } -} \ No newline at end of file +} diff --git a/src/methods/save.ts b/src/methods/save.ts deleted file mode 100644 index ceb5901e3..000000000 --- a/src/methods/save.ts +++ /dev/null @@ -1,14 +0,0 @@ -import type { Data, Lyra, PropertiesSchema } from "../types/index.js"; - -export async function save(lyra: Lyra): Promise> { - return { - index: lyra.index, - docs: lyra.docs, - schema: lyra.schema, - frequencies: lyra.frequencies, - tokenOccurrencies: lyra.tokenOccurrencies, - defaultLanguage: lyra.defaultLanguage, - avgFieldLength: lyra.avgFieldLength, - fieldLengths: lyra.fieldLengths, - }; -} diff --git a/src/methods/search.ts b/src/methods/search.ts index d6e0b4195..3c06e2a1f 100644 --- a/src/methods/search.ts +++ b/src/methods/search.ts @@ -1,203 +1,41 @@ -import type { RadixNode } from "../trees/radix/node.js"; -import type { Lyra, PropertiesSchema, ResolveSchema, SearchProperties, TokenMap, TokenScore, BM25Params, BM25OptionalParams, PropertiesBoost, FacetsSearch } from "../types/index.js"; -import type { WhereFilter } from "../types/filters.js"; -import { defaultTokenizerConfig, Language } from "../tokenizer/index.js"; -import { find as radixFind } from "../trees/radix/index.js"; -import { formatNanoseconds, getNanosecondsTime, sortTokenScorePredicate } from "../utils.js"; -import { getIndices } from "./common.js"; -import { prioritizeTokenScores, BM25 } from "../algorithms.js"; -import { FacetReturningValue, getFacets } from "../facets.js"; -import { getWhereFiltersIDs, intersectFilteredIDs } from "../filters.js"; - -type IndexMap = Record; - -export type RetrievedDoc = { - /** - * The id of the document. - */ - id: string; - /** - * The score of the document in the search. - */ - score: number; - /** - * The document - */ - document: ResolveSchema; -}; - -export type SearchParams = { - /** - * The word to search. - */ - term: string; - /** - * The properties of the document to search in. - */ - properties?: "*" | SearchProperties[]; - /** - * The number of matched documents to return. - */ - limit?: number; - /** - * The number of matched documents to skip. - */ - offset?: number; - /** - * Whether to match the term exactly. - */ - exact?: boolean; - /** - * The maximum [levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) - * between the term and the searchable property. - */ - tolerance?: number; - /** - * The BM25 parameters to use. - * - * k: Term frequency saturation parameter. - * The higher the value, the more important the term frequency becomes. - * The default value is 1.2. It should be set to a value between 1.2 and 2.0. - * - * b: Document length saturation impact. The higher the value, the more - * important the document length becomes. The default value is 0.75. - * - * d: Frequency normalization lower bound. Default value is 0.5. - * - * @see https://en.wikipedia.org/wiki/Okapi_BM25 - */ - relevance?: BM25OptionalParams; - /** - * The boost to apply to the properties. - * - * The boost is a number that is multiplied to the score of the property. - * It can be used to give more importance to some properties. - * - * @example - * // Give more importance to the 'title' property. - * const result = await search(db, { - * term: 'Michael', - * properties: ['title', 'author'], - * boost: { - * title: 2 - * } - * }); - * - * // In that case, the score of the 'title' property will be multiplied by 2. - */ - boost?: PropertiesBoost; - /** - * Facets configuration - * - * A facet is a feature that allows users to narrow down their search results by specific - * attributes or characteristics, such as category, price, or location. - * This can help users find more relevant and specific results for their search query. - * - * @example - * - * const results = await search(db, { - * term: 'Personal Computer', - * properties: ['title', 'description', 'category.primary', 'category.secondary'], - * facets: { - * 'category.primary': { - * size: 10, - * sort: 'ASC', - * } - * } - * }); - */ - facets?: FacetsSearch; - - /** - * Filter the search results. - * - * @example - * // Search for documents that contain 'Headphones' in the 'description' and 'title' fields and - * // have a price less than 100. - * - * const result = await search(db, { - * term: 'Headphones', - * properties: ['description', 'title'], - * where: { - * price: { - * lt: 100 - * } - * } - * }); - */ - where?: WhereFilter; -}; +import { prioritizeTokenScores } from "../components/algorithms.js"; +import { getFacets } from "../components/facets.js"; +import { intersectFilteredIDs } from "../components/filters.js"; +import { createError } from "../errors.js"; +import { + BM25Params, + IndexMap, + Lyra, + OpaqueDocumentStore, + OpaqueIndex, + Result, + Results, + Schema, + SearchContext, + SearchParams, + TokenMap, +} from "../types.js"; +import { getNanosecondsTime, sortTokenScorePredicate } from "../utils.js"; -export type SearchResult = { - /** - * The number of all the matched documents. - */ - count: number; - /** - * An array of matched documents taking `limit` and `offset` into account. - */ - hits: RetrievedDoc[]; - /** - * The time taken to search. - */ - elapsed: bigint | string; - /** - * The facets results. - */ - facets?: FacetReturningValue; +const defaultBM25Params: BM25Params = { + k: 1.2, + b: 0.75, + d: 0.5, }; -/** - * Searches for documents in a database. - * @param lyra The database to search. - * @param params The search query. - * @param language Optional parameter to override the default language analyzer. - * @example - * // Search for documents that contain 'Michael' in the 'author' field. - * const result = await search(db, { - * term: 'Michael', - * properties: ['author'] - * }); - */ -export async function search( - lyra: Lyra, - params: SearchParams, - language?: Language, -): Promise> { - if (!language) { - language = lyra.defaultLanguage; - } - - if (!lyra.components?.tokenizer) { - lyra.components = { - ...(lyra.components ?? {}), - tokenizer: defaultTokenizerConfig(language), - }; - } - - params.relevance = getBM25Parameters(params.relevance); - - const shouldCalculateFacets = params.facets && Object.keys(params.facets).length > 0; - const { limit = 10, offset = 0, exact = false, term, properties } = params; - const tokens = lyra.components.tokenizer!.tokenizerFn!(term, language, false, lyra.components.tokenizer!); - const indices = getIndices(lyra, properties); - const results: RetrievedDoc[] = Array.from({ - length: limit, - }); - const N = lyra.docsCount; - - const timeStart = getNanosecondsTime(); - +function createSearchContext( + properties: string[], + tokens: string[], + params: SearchParams, + docsCount: number, +): SearchContext { // If filters are enabled, we need to get the IDs of the documents that match the filters. - const hasFilters = Object.keys(params.where ?? {}).length > 0; - let whereFiltersIDs: string[] = []; + // const hasFilters = Object.keys(params.where ?? {}).length > 0; + // let whereFiltersIDs: string[] = []; - if (hasFilters) { - whereFiltersIDs = getWhereFiltersIDs(params.where!, lyra); - } - - // uniqueDocsIDs contains unique document IDs for all the tokens in all the indices. - const uniqueDocsIDs: Record = {}; + // if (hasFilters) { + // whereFiltersIDs = getWhereFiltersIDs(params.where!, lyra); + // } // indexMap is an object containing all the indexes considered for the current search, // and an array of doc IDs for each token in all the indices. @@ -213,6 +51,7 @@ export async function search( // } // } const indexMap: IndexMap = {}; + // After we create the indexMap, we need to calculate the intersection // between all the postings lists for each token. // Given the example above, docsIntersection will look like this: @@ -224,85 +63,103 @@ export async function search( // as doc2 is the only document present in all the postings lists for the "description" index. const docsIntersection: TokenMap = {}; - for (const index of indices) { + for (const prop of properties) { const tokensMap: TokenMap = {}; for (const token of tokens) { tokensMap[token] = []; } - indexMap[index] = tokensMap; - docsIntersection[index] = []; + indexMap[prop] = tokensMap; + docsIntersection[prop] = []; } - // Now it's time to loop over all the indices and get the documents IDs for every single term - const indexesLength = indices.length; - for (let i = 0; i < indexesLength; i++) { - const index = indices[i]; - const avgFieldLength = lyra.avgFieldLength[index]; - const fieldLengths = lyra.fieldLengths[index]; + return { + timeStart: getNanosecondsTime(), + params, + docsCount, + uniqueDocsIDs: {}, + indexMap, + docsIntersection, + }; +} + +export async function search( + lyra: Lyra, + params: SearchParams, + language?: string, +): Promise { + params.relevance = Object.assign(params.relevance ?? {}, defaultBM25Params); + + const shouldCalculateFacets = params.facets && Object.keys(params.facets).length > 0; + const { limit = 10, offset = 0, term, properties } = params; + + const { index, docs } = lyra.data; + const tokens = lyra.tokenizer.tokenize(term, language); + + // Get searchable string properties + let propertiesToSearch = await lyra.index.getSearchableProperties(index); + const propertiesToSearchWithTypes = await lyra.index.getSearchablePropertiesWithTypes(index); + propertiesToSearch = propertiesToSearch.filter((prop: string) => propertiesToSearchWithTypes[prop] === "string"); + + if (properties && properties !== "*") { + for (const prop of properties) { + if (!propertiesToSearch.includes(prop)) { + throw createError("UNKNOWN_INDEX", prop, propertiesToSearch.join(", ")); + } + } - if (!(index in lyra.tokenOccurrencies)) continue; + propertiesToSearch = propertiesToSearch.filter((prop: string) => properties.includes(prop)); + } - const lyraOccurrencies = lyra.tokenOccurrencies[index]; - const lyraFrequencies = lyra.frequencies[index]; + // Create the search context and the results + const context = createSearchContext(propertiesToSearch, tokens, params, await lyra.documentsStore.count(docs)); + const results: Result[] = Array.from({ + length: limit, + }); + + // If filters are enabled, we need to get the IDs of the documents that match the filters. + const hasFilters = Object.keys(params.where ?? {}).length > 0; + let whereFiltersIDs: string[] = []; + + if (hasFilters) { + whereFiltersIDs = lyra.index.searchByWhereClause(index, params.where!); + } + + // Now it's time to loop over all the indices and get the documents IDs for every single term + const indexesLength = propertiesToSearch.length; + for (let i = 0; i < indexesLength; i++) { + const prop = propertiesToSearch[i]; const tokensLength = tokens.length; for (let j = 0; j < tokensLength; j++) { const term = tokens[j]; - // Here we get a TypeScript error: Type instantiation is excessively deep and possibly infinite. - // Type definition is correct, but TypeScript is not able to infer the type recursively. - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - const documentIDs = getDocumentIDsFromSearch(lyra, { ...params, index, term, exact }); - - // lyraOccurrencies[term] can be undefined, 0, string, or { [k: string]: number } - const termOccurrencies = typeof lyraOccurrencies[term] === "number" ? lyraOccurrencies[term] ?? 0 : 0; - - const scoreList: TokenScore[] = []; - - // Calculate TF-IDF value for each term, in each document, for each index. - // Then insert sorted results into orderedTFIDFList. - const documentIDsLength = documentIDs.length; - for (let k = 0; k < documentIDsLength; k++) { - const id = documentIDs[k]; - const tf = lyraFrequencies?.[id]?.[term] ?? 0; - - const bm25 = BM25( - tf, - termOccurrencies, - N, - fieldLengths[id], - avgFieldLength, - params.relevance as BM25Params, - ); - - scoreList.push([id, bm25]); - } + // Lookup + const scoreList = await lyra.index.search(index, prop, term, context); - indexMap[index][term].push(...scoreList); + context.indexMap[prop][term].push(...scoreList); } - const docIds = indexMap[index]; + const docIds = context.indexMap[prop]; const vals = Object.values(docIds); - docsIntersection[index] = prioritizeTokenScores(vals, params?.boost?.[index] ?? 1); - const uniqueDocs = docsIntersection[index]; + context.docsIntersection[prop] = prioritizeTokenScores(vals, params?.boost?.[prop] ?? 1); + const uniqueDocs = context.docsIntersection[prop]; const uniqueDocsLength = uniqueDocs.length; for (let i = 0; i < uniqueDocsLength; i++) { - const [id, tfIdfScore] = uniqueDocs[i]; + const [id, score] = uniqueDocs[i]; - const prevScore = uniqueDocsIDs[id]; + const prevScore = context.uniqueDocsIDs[id]; if (prevScore) { - uniqueDocsIDs[id] = prevScore + tfIdfScore + 0.5; + context.uniqueDocsIDs[id] = prevScore + score + 0.5; } else { - uniqueDocsIDs[id] = tfIdfScore; + context.uniqueDocsIDs[id] = score; } } } // Get unique doc IDs from uniqueDocsIDs map, sorted by value. - let uniqueDocsArray = Object.entries(uniqueDocsIDs).sort(sortTokenScorePredicate); - + let uniqueDocsArray = Object.entries(context.uniqueDocsIDs).sort(sortTokenScorePredicate); + // If filters are enabled, we need to remove the IDs of the documents that don't match the filters. if (hasFilters) { uniqueDocsArray = intersectFilteredIDs(whereFiltersIDs, uniqueDocsArray); @@ -310,7 +167,7 @@ export async function search( const resultIDs: Set = new Set(); // Populate facets if needed - const facets = shouldCalculateFacets ? getFacets(lyra.schema, lyra.docs, uniqueDocsArray, params.facets!) : {}; + const facets = shouldCalculateFacets ? await getFacets(lyra, uniqueDocsArray, params.facets!) : {}; // We already have the list of ALL the document IDs containing the search terms. // We loop over them starting from a positional value "offset" and ending at "offset + limit" @@ -328,20 +185,14 @@ export async function search( if (!resultIDs.has(id)) { // We retrieve the full document only AFTER making sure that we really want it. // We never retrieve the full document preventively. - const fullDoc = lyra.docs[id]!; - results[i] = { id, score, document: fullDoc }; + const fullDoc = await lyra.documentsStore.get(docs, id); + results[i] = { id, score, document: fullDoc! }; resultIDs.add(id); } } - let elapsed: bigint | string = getNanosecondsTime() - timeStart; - - if (lyra.components.elapsed?.format === "human") { - elapsed = formatNanoseconds(elapsed); - } - - const searchResult: SearchResult = { - elapsed, + const searchResult: Results = { + elapsed: await lyra.formatElapsedTime(getNanosecondsTime() - context.timeStart), hits: results.filter(Boolean), count: uniqueDocsArray.length, }; @@ -352,35 +203,3 @@ export async function search( return searchResult; } - -function getDocumentIDsFromSearch( - lyra: Lyra, - params: SearchParams & { index: string }, -): string[] { - const idx = lyra.index[params.index]; - const searchResult = radixFind(idx as RadixNode, { - term: params.term, - exact: params.exact, - tolerance: params.tolerance, - }); - - const ids = new Set(); - - for (const key in searchResult) { - for (const id of searchResult[key]) { - ids.add(id); - } - } - - return Array.from(ids); -} - -const defaultBM25Params: BM25Params = { - k: 1.2, - b: 0.75, - d: 0.5 -} - -function getBM25Parameters(params: BM25OptionalParams = defaultBM25Params): BM25Params { - return Object.assign({}, defaultBM25Params, params); -} diff --git a/src/methods/serialization.ts b/src/methods/serialization.ts new file mode 100644 index 000000000..52c4ce285 --- /dev/null +++ b/src/methods/serialization.ts @@ -0,0 +1,23 @@ +import { Lyra, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; + +export interface RawData { + index: unknown; + docs: unknown; +} + +export async function load( + lyra: Lyra, + raw: RawData, +): Promise { + lyra.data.index = await lyra.index.load(raw.index); + lyra.data.docs = await lyra.documentsStore.load(raw.docs); +} + +export async function save( + lyra: Lyra, +): Promise { + return { + index: await lyra.index.save(lyra.data.index), + docs: await lyra.documentsStore.save(lyra.data.docs), + }; +} diff --git a/src/methods/update.ts b/src/methods/update.ts new file mode 100644 index 000000000..e27933be6 --- /dev/null +++ b/src/methods/update.ts @@ -0,0 +1,30 @@ +import { Document, Schema, OpaqueIndex, OpaqueDocumentStore, Lyra } from "../types.js"; +import { insert, insertMultiple } from "./insert.js"; +import { remove, removeMultiple } from "./remove.js"; + +export async function update( + lyra: Lyra, + id: string, + doc: Document, + language?: string, + skipHooks?: boolean, +): Promise { + await remove(lyra, id, language, skipHooks); + return insert(lyra, doc, language, skipHooks); +} + +export async function updateMultiple( + lyra: Lyra, + ids: string[], + docs: Document[], + batchSize?: number, + language?: string, + skipHooks?: boolean, +): Promise { + if (!batchSize) { + batchSize = 1000; + } + + await removeMultiple(lyra, ids, batchSize, language, skipHooks); + return insertMultiple(lyra, docs, batchSize, language, skipHooks); +} diff --git a/src/tokenizer/diacritics.ts b/src/tokenizer/diacritics.ts index 33a64ccee..d69e7b74f 100644 --- a/src/tokenizer/diacritics.ts +++ b/src/tokenizer/diacritics.ts @@ -193,7 +193,7 @@ const CHARCODE_REPLACE_MAPPING = [ 122, 90, 122, - 115 + 115, ]; function replaceChar(charCode: number): number { diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts index b8001934b..8d58b51b3 100644 --- a/src/tokenizer/index.ts +++ b/src/tokenizer/index.ts @@ -1,93 +1,44 @@ -import { stemmer } from "@stemmer/en.js"; +import { createError } from "../errors.js"; +import { Tokenizer } from "../types.js"; import { replaceDiacritics } from "./diacritics.js"; -import * as ERRORS from "../errors.js"; -import { Language, SUPPORTED_LANGUAGES } from "./languages.js"; -import { availableStopWords, stopWords } from "./stop-words/index.js"; - -export * from "./languages.js"; +import { Language, SPLITTERS, STEMMERS, SUPPORTED_LANGUAGES } from "./languages.js"; +import { stopWords as defaultStopWords } from "./stop-words/index.js"; export type Stemmer = (word: string) => string; export type TokenizerConfig = { - enableStemming?: boolean; - enableStopWords?: boolean; - customStopWords?: ((stopWords: string[]) => string[]) | string[]; - stemmingFn?: Stemmer; - tokenizerFn?: Tokenizer; - assertSupportedLanguage?: (language: string) => void; -}; - -export type TokenizerConfigExec = { - enableStemming: boolean; - enableStopWords: boolean; - customStopWords: string[]; - stemmingFn?: Stemmer; - tokenizerFn: Tokenizer; - assertSupportedLanguage: (language: string) => void; + stemming?: boolean; + stemmer?: Stemmer; + stopWords?: boolean | string[] | ((stopWords: string[]) => string[] | Promise); + allowDuplicates?: boolean; }; -export type Tokenizer = ( - text: string, - language: Language, - allowDuplicates: boolean, - tokenizerConfig: TokenizerConfig, - frequency?: boolean, -) => string[]; - -const splitRegex: Record = { - dutch: /[^A-Za-zàèéìòóù0-9_'-]+/gim, - english: /[^A-Za-zàèéìòóù0-9_'-]+/gim, - french: /[^a-z0-9äâàéèëêïîöôùüûœç-]+/gim, - italian: /[^A-Za-zàèéìòóù0-9_'-]+/gim, - norwegian: /[^a-z0-9_æøåÆØÅäÄöÖüÜ]+/gim, - portuguese: /[^a-z0-9à-úÀ-Ú]/gim, - russian: /[^a-z0-9а-яА-ЯёЁ]+/gim, - spanish: /[^a-z0-9A-Zá-úÁ-ÚñÑüÜ]+/gim, - swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim, - german: /[^a-z0-9A-ZäöüÄÖÜß]+/gim, - finnish: /[^a-z0-9äöÄÖ]+/gim, - danish: /[^a-z0-9æøåÆØÅ]+/gim, - hungarian: /[^a-z0-9áéíóöőúüűÁÉÍÓÖŐÚÜŰ]+/gim, - romanian: /[^a-z0-9ăâîșțĂÂÎȘȚ]+/gim, - serbian: /[^a-z0-9čćžšđČĆŽŠĐ]+/gim, - turkish: /[^a-z0-9çÇğĞıİöÖşŞüÜ]+/gim, - lithuanian: /[^a-z0-9ąčęėįšųūžĄČĘĖĮŠŲŪŽ]+/gim, - arabic: /[^a-z0-9أ-ي]+/gim, - nepali: /[^a-z0-9अ-ह]+/gim, - irish: /[^a-z0-9áéíóúÁÉÍÓÚ]+/gim, - indian: /[^a-z0-9अ-ह]+/gim, - armenian: /[^a-z0-9ա-ֆ]+/gim, - greek: /[^a-z0-9α-ωά-ώ]+/gim, - indonesian: /[^a-z0-9]+/gim, - ukrainian: /[^a-z0-9а-яА-ЯіїєІЇЄ]+/gim, - slovenian: /[^a-z0-9螚ȎŠ]+/gim, - bulgarian: /[^a-z0-9а-яА-Я]+/gim, -}; +interface DefaultTokenizer extends Tokenizer { + language: string; + stemmer?: Stemmer; + stopWords?: string[]; + allowDuplicates: boolean; + normalizeToken(this: DefaultTokenizer, token: string): string; +} export const normalizationCache = new Map(); -function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string { - const key = `${language}:${token}`; +function normalizeToken(this: DefaultTokenizer, token: string): string { + const key = `${this.language}:${token}`; if (normalizationCache.has(key)) { return normalizationCache.get(key)!; } - // Check if stop-words removal is enabled - if (tokenizerConfig?.enableStopWords) { - // Remove stop-words - if ((tokenizerConfig.customStopWords as string[]).includes(token)) { - const token = ""; - normalizationCache.set(key, token); - return token; - } + + // Remove stopwords if enabled + if (this.stopWords?.includes(token)) { + normalizationCache.set(key, ""); + return ""; } - // Check if stemming is enabled - if (tokenizerConfig?.enableStemming) { - // Stem token when a stemming function is available - if (typeof tokenizerConfig?.stemmingFn === "function") { - token = tokenizerConfig?.stemmingFn(token); - } + // Apply stemming if enabled + if (this.stemmer) { + token = this.stemmer(token); } token = replaceDiacritics(token); @@ -106,107 +57,94 @@ function trim(text: string[]): string[] { return text; } -function assertSupportedLanguage(language: string) { - if (!SUPPORTED_LANGUAGES.includes(language)) { - throw new Error(ERRORS.LANGUAGE_NOT_SUPPORTED(language)); +function tokenize(this: DefaultTokenizer, input: string, language?: string): string[] { + if (language && language !== this.language) { + throw createError("LANGUAGE_NOT_SUPPORTED", language); } -} -export function tokenize( - input: string, - language: Language = "english", - allowDuplicates = false, - tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language), -) { /* c8 ignore next 3 */ if (typeof input !== "string") { return [input]; } - const splitRule = splitRegex[language]; - const tokens = input - .toLowerCase() - .split(splitRule) - .map(token => normalizeToken(token, language, tokenizerConfig!)) - .filter(Boolean); + const splitRule = SPLITTERS[this.language]; + const tokens = input.toLowerCase().split(splitRule).map(this.normalizeToken).filter(Boolean); const trimTokens = trim(tokens); - if (!allowDuplicates) { + if (!this.allowDuplicates) { return Array.from(new Set(trimTokens)); } return trimTokens; } -export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec { - let defaultStopWords: string[] = []; - let customStopWords: string[] = []; - let defaultStemmingFn: Stemmer | undefined; - let defaultTokenizerFn: Tokenizer = tokenize; +export async function createTokenizer(language: Language, config: TokenizerConfig = {}): Promise { + if (!SUPPORTED_LANGUAGES.includes(language)) { + throw createError("LANGUAGE_NOT_SUPPORTED", language); + } + + // Handle stemming + let stemmer: Stemmer | undefined; - // Enable custom tokenizer function - if (tokenizerConfig?.tokenizerFn) { - if (typeof tokenizerConfig.tokenizerFn !== "function") { - throw Error(ERRORS.INVALID_TOKENIZER_FUNCTION()); + if (config.stemming !== false) { + if (config.stemmer && typeof config.stemmer !== "function") { + throw createError("INVALID_STEMMER_FUNCTION_TYPE"); } - /* c8 ignore next 4 */ - defaultTokenizerFn = tokenizerConfig.tokenizerFn; - - // If there's no custom tokenizer, we can proceed setting custom - // stemming functions and stop-words. - } else { - // Enable custom stemming function - if (tokenizerConfig?.stemmingFn) { - if (typeof tokenizerConfig.stemmingFn !== "function") { - throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE()); - } - defaultStemmingFn = tokenizerConfig.stemmingFn; + + if (config.stemmer) { + stemmer = config.stemmer; } else { - defaultStemmingFn = stemmer; + // Check if we are in a TypeScript or Javascript scenario and determine the stemmers path + // Note that the initial .. is purposely left inside the import in order to be compatible + // with vite. + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore This fails when verifying CJS but it's actually correct + const stemmersPath = import.meta.url.endsWith("ts") ? "../stemmer/lib" : "stemmer"; + const stemmerImport = await import(`../${stemmersPath}/${STEMMERS[language]}.js`); + stemmer = stemmerImport.stemmer; } + } - // Enable default stop-words + // Handle stopwords + let stopWords: string[] | undefined; - if (availableStopWords.includes(language)) { - /* c8 ignore next */ - defaultStopWords = stopWords[language] ?? []; + if (config.stopWords !== false) { + stopWords = defaultStopWords[language] ?? []; + + if (Array.isArray(config.stopWords)) { + stopWords = config.stopWords; + } else if (typeof config.stopWords === "function") { + stopWords = await config.stopWords(stopWords); + } else if (config.stopWords) { + throw createError("CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY"); + } + + // Make sure stopWords is just an array of strings + if (!Array.isArray(stopWords)) { + throw createError("CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY"); } - if (tokenizerConfig?.customStopWords) { - switch (typeof tokenizerConfig.customStopWords) { - // Execute the custom step-words function. - // This will pass the default step-words for a given language as a first parameter. - case "function": - customStopWords = tokenizerConfig.customStopWords(defaultStopWords); - break; - - // Check if the custom step-words is an array. - // If it's an object, throw an exception. If the array contains any non-string value, throw an exception. - case "object": - if (!Array.isArray(tokenizerConfig.customStopWords)) { - throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY()); - } - customStopWords = tokenizerConfig.customStopWords as string[]; - if (customStopWords.some(x => typeof x !== "string")) { - throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY()); - } - break; - - // By default, throw an exception, as this is a misconfiguration. - default: - throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY()); + for (const s of stopWords) { + if (typeof s !== "string") { + throw createError("CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY"); } } } - return { - /* c8 ignore next 5 */ - enableStopWords: tokenizerConfig?.enableStopWords ?? true, - enableStemming: tokenizerConfig?.enableStemming ?? true, - stemmingFn: defaultStemmingFn, - customStopWords: customStopWords ?? defaultStopWords, - tokenizerFn: defaultTokenizerFn, - assertSupportedLanguage: tokenizerConfig.assertSupportedLanguage ?? assertSupportedLanguage, + // Create the tokenizer + const tokenizer: DefaultTokenizer = { + tokenize, + language, + stemmer, + stopWords, + allowDuplicates: Boolean(config.allowDuplicates), + normalizeToken, }; + + tokenizer.tokenize = tokenize.bind(tokenizer); + tokenizer.normalizeToken = normalizeToken.bind(tokenizer); + + return tokenizer; } diff --git a/src/tokenizer/languages.ts b/src/tokenizer/languages.ts index 7b73ccf09..0b8470c47 100644 --- a/src/tokenizer/languages.ts +++ b/src/tokenizer/languages.ts @@ -1,31 +1,63 @@ -export type Language = typeof SUPPORTED_LANGUAGES[number]; +export const STEMMERS: Record = { + arabic: "ar", + armenian: "am", + bulgarian: "bg", + danish: "dk", + dutch: "nl", + english: "en", + finnish: "fi", + french: "fr", + german: "de", + greek: "gr", + hungarian: "hu", + indian: "in", + indonesian: "id", + irish: "ie", + italian: "it", + lithuanian: "lt", + nepali: "np", + norwegian: "no", + portuguese: "pt", + romanian: "ro", + russian: "ru", + serbian: "rs", + slovenian: "ru", + spanish: "es", + swedish: "se", + turkish: "tr", + ukrainian: "uk", +}; + +export const SPLITTERS: Record = { + dutch: /[^A-Za-zàèéìòóù0-9_'-]+/gim, + english: /[^A-Za-zàèéìòóù0-9_'-]+/gim, + french: /[^a-z0-9äâàéèëêïîöôùüûœç-]+/gim, + italian: /[^A-Za-zàèéìòóù0-9_'-]+/gim, + norwegian: /[^a-z0-9_æøåÆØÅäÄöÖüÜ]+/gim, + portuguese: /[^a-z0-9à-úÀ-Ú]/gim, + russian: /[^a-z0-9а-яА-ЯёЁ]+/gim, + spanish: /[^a-z0-9A-Zá-úÁ-ÚñÑüÜ]+/gim, + swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim, + german: /[^a-z0-9A-ZäöüÄÖÜß]+/gim, + finnish: /[^a-z0-9äöÄÖ]+/gim, + danish: /[^a-z0-9æøåÆØÅ]+/gim, + hungarian: /[^a-z0-9áéíóöőúüűÁÉÍÓÖŐÚÜŰ]+/gim, + romanian: /[^a-z0-9ăâîșțĂÂÎȘȚ]+/gim, + serbian: /[^a-z0-9čćžšđČĆŽŠĐ]+/gim, + turkish: /[^a-z0-9çÇğĞıİöÖşŞüÜ]+/gim, + lithuanian: /[^a-z0-9ąčęėįšųūžĄČĘĖĮŠŲŪŽ]+/gim, + arabic: /[^a-z0-9أ-ي]+/gim, + nepali: /[^a-z0-9अ-ह]+/gim, + irish: /[^a-z0-9áéíóúÁÉÍÓÚ]+/gim, + indian: /[^a-z0-9अ-ह]+/gim, + armenian: /[^a-z0-9ա-ֆ]+/gim, + greek: /[^a-z0-9α-ωά-ώ]+/gim, + indonesian: /[^a-z0-9]+/gim, + ukrainian: /[^a-z0-9а-яА-ЯіїєІЇЄ]+/gim, + slovenian: /[^a-z0-9螚ȎŠ]+/gim, + bulgarian: /[^a-z0-9а-яА-Я]+/gim, +}; -export const SUPPORTED_LANGUAGES = [ - "arabic", - "armenian", - "bulgarian", - "danish", - "dutch", - "english", - "finnish", - "french", - "german", - "greek", - "hungarian", - "indian", - "indonesian", - "irish", - "italian", - "lithuanian", - "nepali", - "norwegian", - "portuguese", - "romanian", - "russian", - "serbian", - "slovenian", - "spanish", - "swedish", - "turkish", - "ukrainian" -]; +export const SUPPORTED_LANGUAGES = Object.keys(STEMMERS); + +export type Language = typeof SUPPORTED_LANGUAGES[number]; diff --git a/src/trees/avl.ts b/src/trees/avl.ts new file mode 100644 index 000000000..d93622a9f --- /dev/null +++ b/src/trees/avl.ts @@ -0,0 +1,326 @@ +export type Node = { + key: K; + value: V; + left: Node | null; + right: Node | null; + height: number; +}; + +const BALANCE_STATE = { + UNBALANCED_RIGHT: 1, + SLIGHTLY_UNBALANCED_RIGHT: 2, + BALANCED: 3, + SLIGHTLY_UNBALANCED_LEFT: 4, + UNBALANCED_LEFT: 5, +}; + +function getBalanceFactor(node: Node): number { + const heightDifference = getHeight(node.left) - getHeight(node.right); + + switch (heightDifference) { + case -2: + return BALANCE_STATE.UNBALANCED_RIGHT; + case -1: + return BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT; + case 1: + return BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT; + case 2: + return BALANCE_STATE.UNBALANCED_LEFT; + default: + return BALANCE_STATE.BALANCED; + } +} + +function getHeight(node: Node | null): number { + return node ? node.height : -1; +} + +function rotateLeft(node: Node): Node { + const right = node.right as Node; + node.right = right.left; + right.left = node; + node.height = Math.max(getHeight(node.left), getHeight(node.right)) + 1; + right.height = Math.max(getHeight(right.left), getHeight(right.right)) + 1; + return right; +} + +function rotateRight(node: Node): Node { + const left = node.left as Node; + node.left = left.right; + left.right = node; + node.height = Math.max(getHeight(node.left), getHeight(node.right)) + 1; + left.height = Math.max(getHeight(left.left), getHeight(left.right)) + 1; + return left; +} + +function findMin(node: Node): Node { + return node.left ? findMin(node.left) : node; +} + +export function contains(node: Node, key: K): boolean { + return !!find(node, key); +} + +export function getSize(node: Node | null): number { + if (!node) { + return 0; + } + + return 1 + getSize(node.left) + getSize(node.right); +} + +export function isBalanced(node: Node | null): boolean { + if (!node) { + return true; + } + + const heightDiff = Math.abs(getHeight(node.left) - getHeight(node.right)); + return heightDiff <= 1 && isBalanced(node.left) && isBalanced(node.right); +} + +export function rangeSearch(node: Node, min: K, max: K): V { + if (!node) { + return [] as unknown as V; + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + const result: V = []; + + function traverse(node: Node) { + if (!node) { + return; + } + + if (node.key > min) { + traverse(node.left as Node); + } + + if (node.key >= min && node.key <= max) { + (result as V[]).push(...(node.value as V[])); + } + + if (node.key < max) { + traverse(node.right as Node); + } + } + + traverse(node); + + return result; +} + +export function greaterThan(node: Node, key: K, inclusive = false): V { + if (!node) { + return [] as unknown as V; + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + const result: V = []; + + function traverse(node: Node) { + if (!node) { + return; + } + + if (inclusive && node.key >= key) { + (result as V[]).push(...(node.value as V[])); + } + + if (!inclusive && node.key > key) { + (result as V[]).push(...(node.value as V[])); + } + + traverse(node.left as Node); + traverse(node.right as Node); + } + + traverse(node); + + return result; +} + +export function lessThan(node: Node, key: K, inclusive = false): V { + if (!node) { + return [] as unknown as V; + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + const result: V = []; + + function traverse(node: Node) { + if (!node) { + return; + } + + if (inclusive && node.key <= key) { + (result as V[]).push(...(node.value as V[])); + } + + if (!inclusive && node.key < key) { + (result as V[]).push(...(node.value as V[])); + } + + traverse(node.left as Node); + traverse(node.right as Node); + } + + traverse(node); + + return result; +} + +function getNodeByKey(node: Node, key: K): Node | null { + if (!node) { + return null; + } + + if (node.key === key) { + return node; + } + + if (key < node.key) { + return node.left ? getNodeByKey(node.left, key) : null; + } + + return node.right ? getNodeByKey(node.right, key) : null; +} + +export function create(key: K, value: V): Node { + return { + key, + value, + left: null, + right: null, + height: 0, + }; +} + +export function insert(node: Node, key: K, value: V): Node { + if (!node) { + return create(key, value); + } + + if (key < node.key) { + node.left = insert(node.left as Node, key, value); + } else if (key > node.key) { + node.right = insert(node.right as Node, key, value); + } else { + (node.value as string[]) = (node.value as string[]).concat(value as string); + return node; + } + + const balanceFactor = getBalanceFactor(node); + + if (balanceFactor === BALANCE_STATE.UNBALANCED_LEFT) { + if (key < (node.left as Node).key) { + node = rotateRight(node); + } else { + node.left = rotateLeft(node.left as Node); + node = rotateRight(node); + } + } + + if (balanceFactor === BALANCE_STATE.UNBALANCED_RIGHT) { + if (key > (node.right as Node).key) { + node = rotateLeft(node); + } else { + node.right = rotateRight(node.right as Node); + node = rotateLeft(node); + } + } + + return node; +} + +export function find(node: Node, key: K): V | null { + if (!node) { + return null; + } + + if (node.key === key) { + return node.value; + } + + if (key < node.key) { + return node.left ? find(node.left, key) : null; + } + + return node.right ? find(node.right, key) : null; +} + +export function remove(node: Node, key: K): Node | null { + if (!node) { + return null; + } + + if (key < node.key) { + node.left = remove(node.left as Node, key); + } else if (key > node.key) { + node.right = remove(node.right as Node, key); + } else { + if (!node.left && !node.right) { + return null; + } + + if (!node.left) { + return node.right as Node; + } + + if (!node.right) { + return node.left as Node; + } + + const temp = findMin(node.right as Node); + node.key = temp.key; + node.right = remove(node.right as Node, temp.key); + } + + const balanceFactor = getBalanceFactor(node); + + const leftNode = node.left as Node; + const rightNode = node.right as Node; + + if (balanceFactor === BALANCE_STATE.UNBALANCED_LEFT) { + if ( + getBalanceFactor(leftNode) === BALANCE_STATE.BALANCED || + getBalanceFactor(leftNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT + ) { + return rotateRight(node); + } + + if (getBalanceFactor(leftNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT) { + node.left = rotateLeft(leftNode); + return rotateRight(node); + } + } + + if (balanceFactor === BALANCE_STATE.UNBALANCED_RIGHT) { + if ( + getBalanceFactor(rightNode) === BALANCE_STATE.BALANCED || + getBalanceFactor(rightNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT + ) { + return rotateLeft(node); + } + + if (getBalanceFactor(rightNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT) { + node.right = rotateRight(rightNode); + return rotateLeft(node); + } + } + + return node; +} + +export function removeDocument(root: Node, id: string, key: K): void { + const node = getNodeByKey(root, key)!; + + if (node.value.length === 1) { + remove(root, key); + return; + } + + node.value.splice(node.value.indexOf(id), 1); +} diff --git a/src/trees/avl/index.ts b/src/trees/avl/index.ts deleted file mode 100644 index 157669c16..000000000 --- a/src/trees/avl/index.ts +++ /dev/null @@ -1,248 +0,0 @@ -import type { AVLNode } from "./node.js"; -import { createAVLNode } from "./node.js"; -import { BALANCE_STATE, getBalanceFactor, rotateLeft, rotateRight, findMin, getHeight } from "./utils.js"; - -export function create(key: K, value: V): AVLNode { - return createAVLNode(key, value); -} - -export function insert(node: AVLNode, key: K, value: V): AVLNode { - if (!node) { - return create(key, value); - } - - if (key < node.key) { - node.left = insert(node.left as AVLNode, key, value); - } else if (key > node.key) { - node.right = insert(node.right as AVLNode, key, value); - } else { - (node.value as string[]) = node.value.concat(value); - return node; - } - - const balanceFactor = getBalanceFactor(node); - - if (balanceFactor === BALANCE_STATE.UNBALANCED_LEFT) { - if (key < (node.left as AVLNode).key) { - node = rotateRight(node); - } else { - node.left = rotateLeft(node.left as AVLNode); - node = rotateRight(node); - } - } - - if (balanceFactor === BALANCE_STATE.UNBALANCED_RIGHT) { - if (key > (node.right as AVLNode).key) { - node = rotateLeft(node); - } else { - node.right = rotateRight(node.right as AVLNode); - node = rotateLeft(node); - } - } - - return node; -} - -export function find(node: AVLNode, key: K): V | null { - if (!node) { - return null; - } - - if (node.key === key) { - return node.value; - } - - if (key < node.key) { - return node.left ? find(node.left, key) : null; - } - - return node.right ? find(node.right, key) : null; -} - -export function getNodeByKey(node: AVLNode, key: K): AVLNode | null { - if (!node) { - return null; - } - - if (node.key === key) { - return node; - } - - if (key < node.key) { - return node.left ? getNodeByKey(node.left, key) : null; - } - - return node.right ? getNodeByKey(node.right, key) : null; -} - -export function remove(node: AVLNode, key: K): AVLNode | null { - if (!node) { - return null; - } - - if (key < node.key) { - node.left = remove(node.left as AVLNode, key); - } else if (key > node.key) { - node.right = remove(node.right as AVLNode, key); - } else { - if (!node.left && !node.right) { - return null; - } - - if (!node.left) { - return node.right as AVLNode; - } - - if (!node.right) { - return node.left as AVLNode; - } - - const temp = findMin(node.right as AVLNode); - node.key = temp.key; - node.right = remove(node.right as AVLNode, temp.key); - } - - const balanceFactor = getBalanceFactor(node); - - const leftNode = node.left as AVLNode - const rightNode = node.right as AVLNode - - if (balanceFactor === BALANCE_STATE.UNBALANCED_LEFT) { - if (getBalanceFactor(leftNode) === BALANCE_STATE.BALANCED || getBalanceFactor(leftNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT) { - return rotateRight(node); - } - - if (getBalanceFactor(leftNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT) { - node.left = rotateLeft(leftNode); - return rotateRight(node); - } - } - - if (balanceFactor === BALANCE_STATE.UNBALANCED_RIGHT) { - if (getBalanceFactor(rightNode) === BALANCE_STATE.BALANCED || getBalanceFactor(rightNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT) { - return rotateLeft(node); - } - - if (getBalanceFactor(rightNode) === BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT) { - node.right = rotateRight(rightNode); - return rotateLeft(node); - } - } - - return node; -} - -export function contains(node: AVLNode, key: K): boolean { - return !!find(node, key); -} - -export function getSize(node: AVLNode | null): number { - if (!node) { - return 0; - } - - return 1 + getSize(node.left) + getSize(node.right); -} - -export function isBalanced(node: AVLNode | null): boolean { - if (!node) { - return true; - } - - const heightDiff = Math.abs(getHeight(node.left) - getHeight(node.right)); - return heightDiff <= 1 && isBalanced(node.left) && isBalanced(node.right); -} - -export function rangeSearch(node: AVLNode, min: K, max: K): V { - if (!node) { - return [] as unknown as V; - } - - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - const result: V = []; - - function traverse(node: AVLNode) { - if (!node) { - return; - } - - if (node.key > min) { - traverse(node.left as AVLNode); - } - - if (node.key >= min && node.key <= max) { - result.push(...node.value as V[]); - } - - if (node.key < max) { - traverse(node.right as AVLNode); - } - } - - traverse(node); - - return result; -} - -export function greaterThan(node: AVLNode, key: K, inclusive = false): V { - if (!node) { - return [] as unknown as V; - } - - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - const result: V = []; - - function traverse(node: AVLNode) { - if (!node) { - return; - } - - if (inclusive && node.key >= key) { - result.push(...node.value); - } - - if (!inclusive && node.key > key) { - result.push(...node.value as V[]); - } - - traverse(node.left as AVLNode); - traverse(node.right as AVLNode); - } - - traverse(node); - - return result; -} - -export function lessThan(node: AVLNode, key: K, inclusive = false): V { - if (!node) { - return [] as unknown as V; - } - - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - const result: V = []; - - function traverse(node: AVLNode) { - if (!node) { - return; - } - - if (inclusive && node.key <= key) { - result.push(...node.value as V[]); - } - - if (!inclusive && node.key < key) { - result.push(...node.value as V[]); - } - - traverse(node.left as AVLNode); - traverse(node.right as AVLNode); - } - - traverse(node); - - return result; -} \ No newline at end of file diff --git a/src/trees/avl/node.ts b/src/trees/avl/node.ts deleted file mode 100644 index 7e12b0662..000000000 --- a/src/trees/avl/node.ts +++ /dev/null @@ -1,17 +0,0 @@ -export type AVLNode = { - key: K, - value: V; - left: AVLNode | null; - right: AVLNode | null; - height: number; -}; - -export function createAVLNode(key: K, value: V): AVLNode { - return { - key, - value, - left: null, - right: null, - height: 0, - }; -} diff --git a/src/trees/avl/utils.ts b/src/trees/avl/utils.ts deleted file mode 100644 index e245d0556..000000000 --- a/src/trees/avl/utils.ts +++ /dev/null @@ -1,52 +0,0 @@ -import type { AVLNode } from './node.js'; - -export const BALANCE_STATE = { - UNBALANCED_RIGHT: 1, - SLIGHTLY_UNBALANCED_RIGHT: 2, - BALANCED: 3, - SLIGHTLY_UNBALANCED_LEFT: 4, - UNBALANCED_LEFT: 5, -}; - -export function getBalanceFactor(node: AVLNode): number { - const heightDifference = getHeight(node.left) - getHeight(node.right); - - switch (heightDifference) { - case -2: - return BALANCE_STATE.UNBALANCED_RIGHT; - case -1: - return BALANCE_STATE.SLIGHTLY_UNBALANCED_RIGHT; - case 1: - return BALANCE_STATE.SLIGHTLY_UNBALANCED_LEFT; - case 2: - return BALANCE_STATE.UNBALANCED_LEFT; - default: - return BALANCE_STATE.BALANCED; - } -} - -export function getHeight(node: AVLNode | null): number { - return node ? node.height : -1; -} - -export function rotateLeft(node: AVLNode): AVLNode { - const right = node.right as AVLNode; - node.right = right.left; - right.left = node; - node.height = Math.max(getHeight(node.left), getHeight(node.right)) + 1; - right.height = Math.max(getHeight(right.left), getHeight(right.right)) + 1; - return right; -} - -export function rotateRight(node: AVLNode): AVLNode { - const left = node.left as AVLNode; - node.left = left.right; - left.right = node; - node.height = Math.max(getHeight(node.left), getHeight(node.right)) + 1; - left.height = Math.max(getHeight(left.left), getHeight(left.right)) + 1; - return left; -} - -export function findMin(node: AVLNode): AVLNode { - return node.left ? findMin(node.left) : node; -} \ No newline at end of file diff --git a/src/trees/radix/index.ts b/src/trees/radix.ts similarity index 77% rename from src/trees/radix/index.ts rename to src/trees/radix.ts index 69a82ce5d..bfb413f70 100644 --- a/src/trees/radix/index.ts +++ b/src/trees/radix.ts @@ -1,16 +1,131 @@ -import { boundedLevenshtein } from "../../levenshtein.js"; -import { getOwnProperty } from "../../utils.js"; -import { addDocument, create as createNode, RadixNode, removeDocument, updateParent } from "./node.js"; +import { boundedLevenshtein } from "../components/levenshtein.js"; +import { Nullable } from "../types.js"; +import { getOwnProperty, uniqueId } from "../utils.js"; + +export interface Node { + id: string; + key: string; + subWord: string; + parent: Nullable; + children: Record; + docs: string[]; + end: boolean; + word: string; +} -export type FindParams = { +type FindParams = { term: string; exact?: boolean; tolerance?: number; }; -export type FindResult = Record; +type FindResult = Record; + +/* c8 ignore next 5 */ +function serialize(this: Node): object { + const { word, subWord, children, docs, end } = this; + + return { word, subWord, children, docs, end }; +} + +function updateParent(node: Node, parent: Node): void { + node.parent = parent.id; + node.word = parent.word + node.subWord; +} + +function addDocument(node: Node, docID: string): void { + node.docs.push(docID); +} + +function removeDocument(node: Node, docID: string): boolean { + const index = node.docs.indexOf(docID); + + /* c8 ignore next 3 */ + if (index === -1) { + return false; + } + + node.docs.splice(index, 1); + + return true; +} + +function findAllWords(node: Node, output: FindResult, term: string, exact?: boolean, tolerance?: number) { + if (node.end) { + const { word, docs: docIDs } = node; + + if (exact && word !== term) { + return {}; + } + + // always check in own property to prevent access to inherited properties + // fix https://github.com/LyraSearch/lyra/issues/137 + if (!getOwnProperty(output, word)) { + if (tolerance) { + // computing the absolute difference of letters between the term and the word + const difference = Math.abs(term.length - word.length); + + // if the tolerance is set, check whether the edit distance is within tolerance. + // In that case, we don't need to add the word to the output + if (difference <= tolerance && boundedLevenshtein(term, word, tolerance).isBounded) { + output[word] = []; + } + } else { + // prevent default tolerance not set + output[word] = []; + } + } + + // check if _output[word] exists and then add the doc to it + // always check in own property to prevent access to inherited properties + // fix https://github.com/LyraSearch/lyra/issues/137 + if (getOwnProperty(output, word) && docIDs.length) { + const docs = new Set(output[word]); + + const docIDsLength = docIDs.length; + for (let i = 0; i < docIDsLength; i++) { + docs.add(docIDs[i]); + } + output[word] = Array.from(docs); + } + } + + // recursively search the children + for (const character of Object.keys(node.children)) { + findAllWords(node.children[character], output, term, exact, tolerance); + } + return output; +} + +function getCommonPrefix(a: string, b: string) { + let commonPrefix = ""; + const len = Math.min(a.length, b.length); + for (let i = 0; i < len; i++) { + if (a[i] !== b[i]) { + return commonPrefix; + } + commonPrefix += a[i]; + } + return commonPrefix; +} + +export function create(end = false, subWord = "", key = ""): Node { + const node = { + id: uniqueId(), + key, + subWord, + parent: null, + children: {}, + docs: [], + end, + word: "", + }; + + Object.defineProperty(node, "toJSON", { value: serialize }); + return node; +} -export function insert(root: RadixNode, word: string, docId: string) { +export function insert(root: Node, word: string, docId: string) { for (let i = 0; i < word.length; i++) { const currentCharacter = word[i]; const wordAtIndex = word.substring(i); @@ -33,7 +148,7 @@ export function insert(root: RadixNode, word: string, docId: string) { // the wordAtIndex is completely contained in the child node subword if (commonPrefixLength < edgeLabelLength && commonPrefixLength === wordAtIndex.length) { - const newNode = createNode(true, wordAtIndex, currentCharacter); + const newNode = create(true, wordAtIndex, currentCharacter); newNode.children[edgeLabelAtCommonPrefix] = rootChildCurrentChar; const newNodeChild = newNode.children[edgeLabelAtCommonPrefix]; @@ -49,7 +164,7 @@ export function insert(root: RadixNode, word: string, docId: string) { // the wordAtIndex is partially contained in the child node subword if (commonPrefixLength < edgeLabelLength && commonPrefixLength < wordAtIndex.length) { - const inbetweenNode = createNode(false, commonPrefix, currentCharacter); + const inbetweenNode = create(false, commonPrefix, currentCharacter); inbetweenNode.children[edgeLabelAtCommonPrefix] = rootChildCurrentChar; root.children[currentCharacter] = inbetweenNode; @@ -58,7 +173,7 @@ export function insert(root: RadixNode, word: string, docId: string) { inbetweenNodeChild.key = edgeLabelAtCommonPrefix; const wordAtCommonPrefix = wordAtIndex[commonPrefixLength]; - const newNode = createNode(true, word.substring(i + commonPrefixLength), wordAtCommonPrefix); + const newNode = create(true, word.substring(i + commonPrefixLength), wordAtCommonPrefix); addDocument(newNode, docId); inbetweenNode.children[wordAtCommonPrefix] = newNode; @@ -75,7 +190,7 @@ export function insert(root: RadixNode, word: string, docId: string) { root = rootChildCurrentChar; } else { // if the node for the current character doesn't exist create new node - const newNode = createNode(true, wordAtIndex, currentCharacter); + const newNode = create(true, wordAtIndex, currentCharacter); addDocument(newNode, docId); root.children[currentCharacter] = newNode; @@ -85,7 +200,7 @@ export function insert(root: RadixNode, word: string, docId: string) { } } -export function find(root: RadixNode, { term, exact, tolerance }: FindParams) { +export function find(root: Node, { term, exact, tolerance }: FindParams): FindResult { // find the closest node to the term for (let i = 0; i < term.length; i++) { const character = term[i]; @@ -121,66 +236,7 @@ export function find(root: RadixNode, { term, exact, tolerance }: FindParams) { return output; } -function findAllWords(node: RadixNode, output: FindResult, term: string, exact?: boolean, tolerance?: number) { - if (node.end) { - const { word, docs: docIDs } = node; - - if (exact && word !== term) { - return {}; - } - - // always check in own property to prevent access to inherited properties - // fix https://github.com/LyraSearch/lyra/issues/137 - if (!getOwnProperty(output, word)) { - if (tolerance) { - // computing the absolute difference of letters between the term and the word - const difference = Math.abs(term.length - word.length); - - // if the tolerance is set, check whether the edit distance is within tolerance. - // In that case, we don't need to add the word to the output - if (difference <= tolerance && boundedLevenshtein(term, word, tolerance).isBounded) { - output[word] = []; - } - } else { - // prevent default tolerance not set - output[word] = []; - } - } - - // check if _output[word] exists and then add the doc to it - // always check in own property to prevent access to inherited properties - // fix https://github.com/LyraSearch/lyra/issues/137 - if (getOwnProperty(output, word) && docIDs.length) { - const docs = new Set(output[word]); - - const docIDsLength = docIDs.length; - for (let i = 0; i < docIDsLength; i++) { - docs.add(docIDs[i]); - } - output[word] = Array.from(docs); - } - } - - // recursively search the children - for (const character of Object.keys(node.children)) { - findAllWords(node.children[character], output, term, exact, tolerance); - } - return output; -} - -function getCommonPrefix(a: string, b: string) { - let commonPrefix = ""; - const len = Math.min(a.length, b.length); - for (let i = 0; i < len; i++) { - if (a[i] !== b[i]) { - return commonPrefix; - } - commonPrefix += a[i]; - } - return commonPrefix; -} - -export function contains(root: RadixNode, term: string): boolean { +export function contains(root: Node, term: string): boolean { for (let i = 0; i < term.length; i++) { const character = term[i]; @@ -203,8 +259,7 @@ export function contains(root: RadixNode, term: string): boolean { return true; } -// unused -export function removeWord(root: RadixNode, term: string): boolean { +export function removeWord(root: Node, term: string): boolean { if (!term) { return false; } @@ -228,7 +283,7 @@ export function removeWord(root: RadixNode, term: string): boolean { return false; } -export function removeDocumentByWord(root: RadixNode, term: string, docID: string, exact = true): boolean { +export function removeDocumentByWord(root: Node, term: string, docID: string, exact = true): boolean { if (!term) { return true; } diff --git a/src/trees/radix/node.ts b/src/trees/radix/node.ts deleted file mode 100644 index 435b02c71..000000000 --- a/src/trees/radix/node.ts +++ /dev/null @@ -1,58 +0,0 @@ -import type { Nullable } from "../../types/index.js"; -import { uniqueId } from "../../utils.js"; - -export interface RadixNode { - id: string; - key: string; - subWord: string; - parent: Nullable; - children: Record; - docs: string[]; - end: boolean; - word: string; -} - -export function create(end = false, subWord = "", key = ""): RadixNode { - const node = { - id: uniqueId(), - key, - subWord, - parent: null, - children: {}, - docs: [], - end, - word: "", - }; - - Object.defineProperty(node, "toJSON", { value: serialize }); - return node; -} - -export function updateParent(node: RadixNode, parent: RadixNode): void { - node.parent = parent.id; - node.word = parent.word + node.subWord; -} - -export function addDocument(node: RadixNode, docID: string): void { - node.docs.push(docID); -} - -export function removeDocument(node: RadixNode, docID: string): boolean { - const index = node.docs.indexOf(docID); - - /* c8 ignore next 3 */ - if (index === -1) { - return false; - } - - node.docs.splice(index, 1); - - return true; -} - -/* c8 ignore next 5 */ -function serialize(this: RadixNode): object { - const { word, subWord, children, docs, end } = this; - - return { word, subWord, children, docs, end }; -} diff --git a/src/types.ts b/src/types.ts new file mode 100644 index 000000000..bafb08c80 --- /dev/null +++ b/src/types.ts @@ -0,0 +1,335 @@ +export type Nullable = T | null; + +export type CallbackComponentReturnValue = T | Promise; + +// eslint-disable-next-line @typescript-eslint/no-empty-interface +export interface OpaqueIndex {} + +// eslint-disable-next-line @typescript-eslint/no-empty-interface +export interface OpaqueDocumentStore {} + +// eslint-disable-next-line @typescript-eslint/no-empty-interface +export interface Schema extends Record {} + +// eslint-disable-next-line @typescript-eslint/no-empty-interface +export interface Document extends Record {} + +export type SearchableType = "string" | "number" | "boolean"; + +export type SearchableValue = string | number | boolean; + +export type BM25Params = { + k?: number; + b?: number; + d?: number; +}; + +export type FacetSorting = "asc" | "desc" | "ASC" | "DESC"; + +export interface StringFacetDefinition { + limit?: number; + offset?: number; + sort?: FacetSorting; +} + +export interface NumberFacetDefinition { + ranges: { from: number; to: number }[]; +} + +export interface BooleanFacetDefinition { + true?: boolean; + false?: boolean; +} + +export type FacetDefinition = StringFacetDefinition | NumberFacetDefinition | BooleanFacetDefinition; + +export type ComparisonOperator = { + gt?: number; + gte?: number; + lt?: number; + lte?: number; + eq?: number; + between?: [number, number]; +}; + +export type SearchParams = { + /** + * The word to search. + */ + term: string; + /** + * The properties of the document to search in. + */ + properties?: "*" | string[]; + /** + * The number of matched documents to return. + */ + limit?: number; + /** + * The number of matched documents to skip. + */ + offset?: number; + /** + * Whether to match the term exactly. + */ + exact?: boolean; + /** + * The maximum [levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) + * between the term and the searchable property. + */ + tolerance?: number; + /** + * The BM25 parameters to use. + * + * k: Term frequency saturation parameter. + * The higher the value, the more important the term frequency becomes. + * The default value is 1.2. It should be set to a value between 1.2 and 2.0. + * + * b: Document length saturation impact. The higher the value, the more + * important the document length becomes. The default value is 0.75. + * + * d: Frequency normalization lower bound. Default value is 0.5. + * + * @see https://en.wikipedia.org/wiki/Okapi_BM25 + */ + relevance?: BM25Params; + /** + * The boost to apply to the properties. + * + * The boost is a number that is multiplied to the score of the property. + * It can be used to give more importance to some properties. + * + * @example + * // Give more importance to the 'title' property. + * const result = await search(db, { + * term: 'Michael', + * properties: ['title', 'author'], + * boost: { + * title: 2 + * } + * }); + * + * // In that case, the score of the 'title' property will be multiplied by 2. + */ + boost?: Record; + /** + * Facets configuration + * + * A facet is a feature that allows users to narrow down their search results by specific + * attributes or characteristics, such as category, price, or location. + * This can help users find more relevant and specific results for their search query. + * + * @example + * + * const results = await search(db, { + * term: 'Personal Computer', + * properties: ['title', 'description', 'category.primary', 'category.secondary'], + * facets: { + * 'category.primary': { + * size: 10, + * sort: 'ASC', + * } + * } + * }); + */ + facets?: Record; + + /** + * Filter the search results. + * + * @example + * // Search for documents that contain 'Headphones' in the 'description' and 'title' fields and + * // have a price less than 100. + * + * const result = await search(db, { + * term: 'Headphones', + * properties: ['description', 'title'], + * where: { + * price: { + * lt: 100 + * } + * } + * }); + */ + where?: Record; +}; + +export type Result = { + /** + * The id of the document. + */ + id: string; + /** + * The score of the document in the search. + */ + score: number; + /** + * The document + */ + document: Document; +}; + +export type FacetResult = Record< + string, + { + count: number; + values: Record; + } +>; + +export type TokenScore = [string, number]; + +export type TokenMap = Record; + +export type IndexMap = Record; + +export type SearchContext = { + timeStart: bigint; + params: SearchParams; + docsCount: number; + uniqueDocsIDs: Record; + indexMap: IndexMap; + docsIntersection: TokenMap; +}; + +export type Results = { + /** + * The number of all the matched documents. + */ + count: number; + /** + * An array of matched documents taking `limit` and `offset` into account. + */ + hits: Result[]; + /** + * The time taken to search. + */ + elapsed: bigint | string; + /** + * The facets results. + */ + facets?: FacetResult; +}; + +export type SingleCallbackComponent = ( + lyra: Lyra, + id: string, + doc?: Document, +) => CallbackComponentReturnValue; + +export type MultipleCallbackComponent = ( + lyra: Lyra, + doc: Document[] | string[], +) => CallbackComponentReturnValue; + +export type IIndexInsertOrRemoveFunction = ( + index: I, + id: string, + prop: string, + value: SearchableValue, + language: string | undefined, + tokenizer: Tokenizer, + docsCount: number, +) => CallbackComponentReturnValue; + +export type IIndexRemoveFunction = (index: I, id: string, prop: string) => CallbackComponentReturnValue; + +export interface IIndex { + create: (lyra: Lyra, schema: Schema) => I; + + beforeInsert?: IIndexInsertOrRemoveFunction; + insert: IIndexInsertOrRemoveFunction; + afterInsert?: IIndexInsertOrRemoveFunction; + + beforeRemove?: IIndexInsertOrRemoveFunction; + remove: IIndexInsertOrRemoveFunction; + afterRemove?: IIndexInsertOrRemoveFunction; + + search(index: I, prop: string, terms: string, context: SearchContext): CallbackComponentReturnValue; + searchByWhereClause(index: I, filters: Record): string[]; + + getSearchableProperties(index: I): CallbackComponentReturnValue; + getSearchablePropertiesWithTypes(index: I): CallbackComponentReturnValue>; + + load(raw: unknown): I | Promise; + save(index: I): unknown | Promise; +} + +export interface IDocumentsStore { + create: (lyra: Lyra) => D; + get(store: D, id: string): CallbackComponentReturnValue; + getMultiple(store: D, ids: string[]): CallbackComponentReturnValue<(Document | undefined)[]>; + store(store: D, id: string, doc: Document): CallbackComponentReturnValue; + remove(store: D, id: string): CallbackComponentReturnValue; + count(store: D): CallbackComponentReturnValue; + + load(raw: unknown): D | Promise; + save(store: D): unknown | Promise; +} + +export interface Tokenizer { + tokenize: (raw: string, language?: string) => string[]; +} + +export interface ComplexComponent { + tokenizer: Tokenizer; + index: IIndex; + documentsStore: IDocumentsStore; +} + +export interface SimpleComponents { + validateSchema(doc: Document, schema: Schema): CallbackComponentReturnValue; + getDocumentIndexId(doc: Document): CallbackComponentReturnValue; + getDocumentProperties( + doc: Document, + paths: string[], + ): CallbackComponentReturnValue>; + formatElapsedTime(number: bigint): CallbackComponentReturnValue | CallbackComponentReturnValue; +} + +export interface SimpleOrArrayCallbackComponents< + S extends Schema, + I extends OpaqueIndex, + D extends OpaqueDocumentStore, +> { + beforeInsert: SingleCallbackComponent | SingleCallbackComponent[]; + afterInsert: SingleCallbackComponent | SingleCallbackComponent[]; + beforeRemove: SingleCallbackComponent | SingleCallbackComponent[]; + afterRemove: SingleCallbackComponent | SingleCallbackComponent[]; + beforeMultipleInsert: MultipleCallbackComponent | MultipleCallbackComponent[]; + afterMultipleInsert: MultipleCallbackComponent | MultipleCallbackComponent[]; + beforeMultipleRemove: MultipleCallbackComponent | MultipleCallbackComponent[]; + afterMultipleRemove: MultipleCallbackComponent | MultipleCallbackComponent[]; +} + +export interface ArrayCallbackComponents { + beforeInsert: SingleCallbackComponent[]; + afterInsert: SingleCallbackComponent[]; + beforeRemove: SingleCallbackComponent[]; + afterRemove: SingleCallbackComponent[]; + beforeMultipleInsert: MultipleCallbackComponent[]; + afterMultipleInsert: MultipleCallbackComponent[]; + beforeMultipleRemove: MultipleCallbackComponent[]; + afterMultipleRemove: MultipleCallbackComponent[]; +} + +export type Components = Partial< + ComplexComponent & SimpleComponents & SimpleOrArrayCallbackComponents +>; + +export const kInsertions = Symbol("lyra.insertions"); +export const kRemovals = Symbol("lyra.removals"); + +export type Lyra = SimpleComponents & + ArrayCallbackComponents & { + schema: S; + tokenizer: Tokenizer; + index: IIndex; + documentsStore: IDocumentsStore; + data: { + index: I; + docs: D; + }; + [kInsertions]: number | undefined; + [kRemovals]: number | undefined; + }; diff --git a/src/types/facets.ts b/src/types/facets.ts deleted file mode 100644 index 2bc320bea..000000000 --- a/src/types/facets.ts +++ /dev/null @@ -1,26 +0,0 @@ -import { PropertiesSchema, PropertyType } from "./index.js"; - -export type FacetSorting = "asc" | "desc" | "ASC" | "DESC"; - -export type FacetsSearch = K extends string - ? S[K] extends PropertiesSchema - ? FacetsSearch - : S[K] extends PropertyType - ? { [key in `${P}${K}`]?: FacetTypeInterfaces[S[K]] } - : never - : never; - -type FacetTypeInterfaces = { - string: { - limit?: number; - offset?: number; - sort?: FacetSorting; - }; - number: { - ranges: {from: number, to: number}[] - }; - boolean: { - true?: boolean; - false?: boolean; - }; -} \ No newline at end of file diff --git a/src/types/filters.ts b/src/types/filters.ts deleted file mode 100644 index cd29dd82d..000000000 --- a/src/types/filters.ts +++ /dev/null @@ -1,33 +0,0 @@ -import type { PropertiesSchema } from "./index.js"; - -export type FilterOperation = - | "gt" - | "gte" - | "lt" - | "lte" - | "eq" - | "between"; - -type ComparisonOperator = { - gt?: number; - gte?: number; - lt?: number; - lte?: number; - eq?: number; - between?: [number, number]; -} - -type PickOne = { [P in keyof T]: Record & Partial, undefined>> }[keyof T] - -export type WhereFilter< - S extends PropertiesSchema, - P extends string = "", - K extends keyof S = keyof S> = K extends string - ? S[K] extends PropertiesSchema - ? WhereFilter - : S[K] extends "number" - ? { [key in `${P}${K}`]?: PickOne } - : S[K] extends "boolean" - ? { [key in `${P}${K}`]?: boolean } - : never - : never; \ No newline at end of file diff --git a/src/types/index.ts b/src/types/index.ts deleted file mode 100644 index e0ed06015..000000000 --- a/src/types/index.ts +++ /dev/null @@ -1,132 +0,0 @@ -import type { Language, TokenizerConfig } from "../tokenizer/index.js"; -import type { Hooks } from "../methods/hooks.js"; -import type { RadixNode } from "../trees/radix/node.js"; -import type { AVLNode } from "../trees/avl/node.js"; - -export * from "./filters.js"; -export * from "./facets.js"; - -export type TokenScore = [string, number]; -export type Nullable = T | null; - -export type IIntersectTokenScores = (arrays: TokenScore[][]) => TokenScore[]; - -export type ResolveSchema = { - [P in keyof T]: ResolveTypes; -}; - -export type SearchProperties< - TSchema extends PropertiesSchema, - TKey extends keyof TSchema = keyof TSchema, -> = TKey extends string - ? TSchema[TKey] extends PropertiesSchema - ? `${TKey}.${SearchProperties}` - : TKey - : never; - - -export type PropertyType = "string" | "number" | "boolean"; - -export type PropertiesSchema = { - [key: string]: PropertyType | PropertiesSchema; -}; - -export type AlgorithmsConfig = { - intersectTokenScores: IIntersectTokenScores; -}; - -export type PropertiesBoost = { - [P in keyof S]?: number; -}; - -export type ElaspedConfig = { - format?: "human" | "raw", -} - -export type Configuration = { - /** - * The structure of the document to be inserted into the database. - */ - schema: S; - /** - * The default language analyzer to use. - */ - defaultLanguage?: Language; - edge?: boolean; - hooks?: Hooks; - components?: Components; -}; - -export type Data = { - docs: Record | undefined>; - defaultLanguage: Language; - index: Index; - schema: S; - frequencies: FrequencyMap; - tokenOccurrencies: TokenOccurrency; - avgFieldLength: Record; - fieldLengths: Record>; -}; - -export type Components = { - elapsed?: ElaspedConfig; - tokenizer?: TokenizerConfig; - algorithms?: AlgorithmsConfig; -}; - -export interface Lyra extends Data { - defaultLanguage: Language; - schema: S; - edge: boolean; - hooks: Hooks; - components?: Components; - frequencies: FrequencyMap; - docsCount: number; - avgFieldLength: Record; - fieldLengths: Record>; -} - -export type BM25OptionalParams = { - k?: number; - b?: number; - d?: number; -}; - -export type BM25Params = { - k: number; - b: number; - d: number; -}; - -export type TokenMap = Record; - -export type BooleanIndex = { - 'true': string[]; - 'false': string[]; -} - -type ResolveTypes = TType extends "string" - ? string - : TType extends "boolean" - ? boolean - : TType extends "number" - ? number - : TType extends PropertiesSchema - ? { [P in keyof TType]: ResolveTypes } - : never; - -type Index = Record | BooleanIndex>; - -type FrequencyMap = { - [property: string]: { - [documentID: string]: { - [token: string]: number; - }; - }; -}; - -type TokenOccurrency = { - [property: string]: { - [token: string]: number; - }; -}; diff --git a/src/utils.ts b/src/utils.ts index a3bce3181..5b85d726b 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,8 +1,11 @@ -import type { TokenScore } from "./types/index.js"; +import type { Document, SearchableValue, TokenScore } from "./types.js"; const baseId = Date.now().toString().slice(5); let lastId = 0; +// Checks if `hasOwn` method is defined avoiding errors with older Node.js versions +const hasOwn = Object.hasOwn ?? Object.prototype.hasOwnProperty.call; + const k = 1024; const nano = BigInt(1e3); const milli = BigInt(1e6); @@ -10,6 +13,40 @@ const second = BigInt(1e9); export const isServer = typeof window === "undefined"; +export function sprintf(template: string, ...args: (string | number)[]): string { + return template.replaceAll( + /%(?:(?\d+)\$)?(?-?\d*\.?\d*)(?[dfs])/g, + function (...replaceArgs: Array>): string { + const { width: rawWidth, type, position } = replaceArgs.at(-1) as Record; + + const replacement = position ? args[Number.parseInt(position) - 1]! : args.shift()!; + const width = rawWidth === "" ? 0 : Number.parseInt(rawWidth); + + switch (type) { + case "d": + return replacement.toString().padStart(width, "0"); + case "f": { + let value = replacement; + const [padding, precision] = rawWidth.split(".").map(w => Number.parseFloat(w)); + + if (typeof precision === "number" && precision >= 0) { + value = (value as number).toFixed(precision); + } + + return typeof padding === "number" && padding >= 0 ? value.toString().padStart(width, "0") : value.toString(); + } + case "s": + return width < 0 + ? (replacement as string).toString().padEnd(-width, " ") + : (replacement as string).toString().padStart(width, " "); + + default: + return replacement as string; + } + }, + ); +} + export function formatBytes(bytes: number, decimals = 2): string { if (bytes === 0) { return "0 Bytes"; @@ -54,12 +91,7 @@ export function uniqueId(): string { } export function getOwnProperty(object: Record, property: string): T | undefined { - // Checks if `hasOwn` method is defined avoiding errors with older Node.js versions - if (Object.hasOwn === undefined) { - return Object.prototype.hasOwnProperty.call(object, property) ? object[property] : undefined; - } - - return Object.hasOwn(object, property) ? object[property] : undefined; + return hasOwn(object, property) ? object[property] : undefined; } export function getTokenFrequency(token: string, tokens: string[]): number { @@ -104,10 +136,14 @@ export function sortTokenScorePredicate(a: TokenScore, b: TokenScore): number { // Intersection function taken from https://github.com/lovasoa/fast_array_intersect. // MIT Licensed at the time of writing. export function intersect(arrays: ReadonlyArray[]): T[] { - if (arrays.length === 0) return []; + if (arrays.length === 0) { + return []; + } else if (arrays.length === 1) { + return arrays[0] as T[]; + } - for (let i=1; i(arrays: ReadonlyArray[]): T[] { } const set = new Map(); - for(const elem of arrays[0]) { + for (const elem of arrays[0]) { set.set(elem, 1); } - for (let i=1; i { const count = set.get(e); if (count !== undefined) set.set(e, 0); - return count === arrays.length + return count === arrays.length; }); } -/** - * Retrieve a deeply nested value from an object using a dot-separated string path. - * - * @template T - The expected type of the nested value. - * @param {Record} obj - The object to retrieve the value from. - * @param {string} path - The dot-separated string path to the nested value. - * @returns {(T | undefined)} - The nested value, or undefined if the path is invalid. - */ - -export function getNested( - obj: Record, - path: string -): T | undefined { - return path.split(".").reduce((o, p) => o && typeof o === "object" ? o[p] : undefined, obj) as T | undefined; +export function getDocumentProperties(doc: Document, paths: string[]): Record { + const properties: Record = {}; + + const pathsLength = paths.length; + for (let i = 0; i < pathsLength; i++) { + const path = paths[i]; + const pathTokens = path.split("."); + + let current: SearchableValue | Document | undefined = doc; + const pathTokensLength = pathTokens.length; + for (let j = 0; j < pathTokensLength; j++) { + current = (current as Document)[pathTokens[j]!]; + + // We found an object but we were supposed to be done + if (typeof current === "object" && current !== null && j === pathTokensLength - 1) { + current = undefined; + break; + } else if ((current === null || typeof current !== "object") && j < pathTokensLength - 1) { + // We can't recurse anymore but we were supposed to + current = undefined; + break; + } + } + + if (typeof current !== "undefined") { + properties[path] = current as SearchableValue; + } + } + + return properties; +} + +export function getNested(obj: object, path: string): T | undefined { + const props = getDocumentProperties(obj as Document, [path]); + + return props[path] as T | undefined; } -/** - * Flattens an object with deeply nested properties, such that (for example), this: - * `{ foo: { bar: { baz: 10 } } }` becomes: `{ 'foo.bar.baz': 10 }` - * - * @param {object} obj - The object to flatten. - * @param {string} [prefix=''] - The prefix to use for each key in the flattened object. - * @returns {object} - The flattened object. - */ - -export function flattenObject(obj: object, prefix = ''): object { - const result: { [key: string]: any } = {}; +export function flattenObject(obj: object, prefix = ""): Document { + const result: Document = {}; + for (const key in obj) { - const objKey = (obj as any)[key]; - if (typeof objKey === 'object' && objKey !== null) { - Object.assign(result, flattenObject(objKey, prefix + key + '.')); + const prop = `${prefix}${key}`; + const objKey = (obj as Document)[key]; + + if (typeof objKey === "object" && objKey !== null) { + Object.assign(result, flattenObject(objKey, `${prop}.`)); } else { - result[prefix + key] = objKey; + result[prop] = objKey; } } return result; diff --git a/tests/algorithms.test.ts b/tests/algorithms.test.ts deleted file mode 100644 index 464dea354..000000000 --- a/tests/algorithms.test.ts +++ /dev/null @@ -1,53 +0,0 @@ -import t from "tap"; -import { intersectTokenScores } from "../src/algorithms.js"; - -t.test("utils", t => { - t.plan(1); - - t.test("should correctly intersect 2 or more arrays", async t => { - t.plan(3); - - t.same( - await intersectTokenScores([ - [ - ["foo", 1], - ["bar", 1], - ["baz", 2], - ], - [ - ["foo", 4], - ["quick", 10], - ["brown", 3], - ["bar", 2], - ], - [ - ["fox", 12], - ["foo", 4], - ["jumps", 3], - ["bar", 6], - ], - ]), - [ - ["foo", 9], - ["bar", 9], - ], - ); - - t.same( - await intersectTokenScores([ - [ - ["foo", 1], - ["bar", 1], - ["baz", 2], - ], - [ - ["quick", 10], - ["brown", 3], - ], - ]), - [], - ); - - t.same(await intersectTokenScores([]), []); - }); -}); diff --git a/tests/boosting.test.ts b/tests/boosting.test.ts index 33a821e4c..2cb7cd173 100644 --- a/tests/boosting.test.ts +++ b/tests/boosting.test.ts @@ -1,4 +1,4 @@ -import t from "tap" +import t from "tap"; import { create, insert, search } from "../src/index.js"; t.test("boosting", t => { @@ -11,20 +11,21 @@ t.test("boosting", t => { schema: { id: "string", title: "string", - description: "string" - } + description: "string", + }, }); await insert(db, { id: "1", title: "Powerful computer with 16GB RAM", - description: "A powerful computer with 16GB RAM and a 1TB SSD, perfect for gaming and video editing." + description: "A powerful computer with 16GB RAM and a 1TB SSD, perfect for gaming and video editing.", }); await insert(db, { id: "2", title: "PC with 8GB RAM. Good for gaming and browsing the web.", - description: "A personal computer with 8GB RAM and a 500GB SSD, perfect for browsing the web and watching movies. This computer is also great for kids." + description: + "A personal computer with 8GB RAM and a 500GB SSD, perfect for browsing the web and watching movies. This computer is also great for kids.", }); const { hits: hits1 } = await search(db, { @@ -34,16 +35,16 @@ t.test("boosting", t => { const { hits: hits2 } = await search(db, { term: "computer for browsing and movies", boost: { - title: 2.5 - } + title: 2.5, + }, }); try { await search(db, { term: "computer for browsing and movies", boost: { - title: 0 - } + title: 0, + }, }); } catch (err) { t.same(err.message, `Boost value must be a number greater than, or less than 0.`); @@ -51,4 +52,4 @@ t.test("boosting", t => { t.equal(hits1[0].score < hits2[0].score, true); }); -}); \ No newline at end of file +}); diff --git a/tests/ci/playwright/browsers.spec.ts b/tests/ci/playwright/browsers.spec.ts index 6f840ddff..35082f48b 100644 --- a/tests/ci/playwright/browsers.spec.ts +++ b/tests/ci/playwright/browsers.spec.ts @@ -3,6 +3,7 @@ import { expect, test } from "@playwright/test"; test("works correctly", async ({ page }) => { await page.goto("/"); + await page.waitForSelector("#searchResult"); const result = await page.evaluate(() => JSON.parse(document.getElementById("searchResult")!.innerHTML)); expect(result).toMatchObject({ diff --git a/tests/ci/playwright/index.html b/tests/ci/playwright/index.html index bb7869b90..195994e6c 100644 --- a/tests/ci/playwright/index.html +++ b/tests/ci/playwright/index.html @@ -1,66 +1,66 @@ - - + Lyra e2e - + - -
+ - - - \ No newline at end of file + + diff --git a/tests/ci/playwright/vite.config.js b/tests/ci/playwright/vite.config.js index 2bba4b1c5..734d1cb30 100644 --- a/tests/ci/playwright/vite.config.js +++ b/tests/ci/playwright/vite.config.js @@ -1,13 +1,8 @@ -import path from 'path' +import path from "path"; export default { - root: path.resolve(__dirname), - server: { - port: 3000, - }, - resolve: { - alias: { - '@stemmer': path.resolve(__dirname, '../../../stemmer/lib') - }, - } -} \ No newline at end of file + root: path.resolve(__dirname), + server: { + port: 3000, + }, +}; diff --git a/tests/lyra.dataset.test.ts b/tests/dataset.test.ts similarity index 89% rename from tests/lyra.dataset.test.ts rename to tests/dataset.test.ts index 85e72c7f1..bad2d7b90 100644 --- a/tests/lyra.dataset.test.ts +++ b/tests/dataset.test.ts @@ -1,7 +1,6 @@ import t from "tap"; -import { create, insertBatch, remove, search } from "../src/index.js"; -import type { SearchResult } from "../src/methods/search.js"; -import type { PropertiesSchema } from "../src/types.js"; +import { DocumentsStore } from "../src/components/documents-store.js"; +import { create, insertMultiple, remove, Results, search } from "../src/index.js"; import dataset from "./datasets/events.json" assert { type: "json" }; import snapshots from "./snapshots/events.json" assert { type: "json" }; @@ -17,7 +16,7 @@ type EventJson = { }; }; -function removeVariadicData(res: SearchResult): Omit, "elapsed"> { +function removeVariadicData(res: Results): Omit { const hits = res.hits.map(h => { h.id = ""; return h; @@ -53,7 +52,7 @@ t.test("lyra.dataset", async t => { }, })); - await insertBatch(db, events); + await insertMultiple(db, events); t.test("should correctly populate the database with a large dataset", async t => { t.plan(4); @@ -82,7 +81,7 @@ t.test("lyra.dataset", async t => { offset: 0, }); - t.equal(Object.keys(db.docs).length, (dataset as EventJson).result.events.length); + t.equal(Object.keys((db.data.docs as DocumentsStore).docs).length, (dataset as EventJson).result.events.length); t.equal(s1.count, 1117); t.equal(s2.count, 7314); t.equal(s3.count, 7314); @@ -107,9 +106,9 @@ t.test("lyra.dataset", async t => { properties: ["description"], }); - t.equal(s1.count, 31294); - t.equal(s2.count, 28747); - t.equal(s3.count, 33644); + t.equal(s1.count, 14931); + t.equal(s2.count, 2922); + t.equal(s3.count, 3331); }); t.test("should perform paginate search", async t => { @@ -205,7 +204,7 @@ t.test("lyra.dataset", async t => { }); for (const doc of documentsToDelete.hits) { - remove(db, doc.id); + await remove(db, doc.id); } const newSearch = await search(db, { diff --git a/tests/docs.test.ts b/tests/docs.test.ts index 686aa2043..005cdca8d 100644 --- a/tests/docs.test.ts +++ b/tests/docs.test.ts @@ -1,20 +1,20 @@ -import * as t from 'tap'; -import { count, getByID, create, insert } from '../src/index.js'; +import * as t from "tap"; +import { count, getByID, create, insert } from "../src/index.js"; -t.test('count', async t => { +t.test("count", async t => { t.plan(2); const db = await create({ schema: { - id: 'string', - title: 'string' - } + id: "string", + title: "string", + }, }); - await insert(db, { id: 'doc1', title: 'Hello World 1' }); - await insert(db, { id: 'doc2', title: 'Hello World 2' }); - await insert(db, { id: 'doc3', title: 'Hello World 3' }); + await insert(db, { id: "doc1", title: "Hello World 1" }); + await insert(db, { id: "doc2", title: "Hello World 2" }); + await insert(db, { id: "doc3", title: "Hello World 3" }); - t.equal(await count(db), 3, 'count'); - t.equal((await getByID(db, 'doc1'))?.title, 'Hello World 1', 'getByID'); -}); \ No newline at end of file + t.equal(await count(db), 3, "count"); + t.equal((await getByID(db, "doc1"))?.title, "Hello World 1", "getByID"); +}); diff --git a/tests/elapsed.test.ts b/tests/elapsed.test.ts index 15f373024..d57744a04 100644 --- a/tests/elapsed.test.ts +++ b/tests/elapsed.test.ts @@ -1,5 +1,6 @@ import t from "tap"; import { create, insert, search } from "../src/index.js"; +import { formatNanoseconds } from "../src/utils.js"; t.test("elapsed", t => { t.plan(2); @@ -12,10 +13,8 @@ t.test("elapsed", t => { body: "string", }, components: { - elapsed: { - format: "human" - } - } + formatElapsedTime: formatNanoseconds, + }, }); await insert(db, { @@ -24,24 +23,19 @@ t.test("elapsed", t => { }); const results = await search(db, { - term: "test" + term: "test", }); t.same(typeof results.elapsed, "string"); }); - t.test("should correctly set elapsed time to a raw, bigInt", async t => { + t.test("should correctly set elapsed time to a bigint by default", async t => { t.plan(1); const db = await create({ schema: { title: "string", body: "string", }, - components: { - elapsed: { - format: "raw" - } - } }); await insert(db, { @@ -50,9 +44,9 @@ t.test("elapsed", t => { }); const results = await search(db, { - term: "test" + term: "test", }); t.same(typeof results.elapsed, "bigint"); }); -}); \ No newline at end of file +}); diff --git a/tests/facets.test.ts b/tests/facets.test.ts index c5a6b40cf..5de02b524 100644 --- a/tests/facets.test.ts +++ b/tests/facets.test.ts @@ -1,7 +1,7 @@ import t from "tap"; import { create, insert, search } from "../src/index.js"; -t.test("facets", (t) => { +t.test("facets", t => { t.plan(2); t.test("should generate correct facets", async t => { @@ -14,8 +14,8 @@ t.test("facets", (t) => { meta: { tag: "string", isFavorite: "boolean", - } - } + }, + }, }); await insert(db, { @@ -23,8 +23,8 @@ t.test("facets", (t) => { quote: "Be the change you wish to see in the world", meta: { tag: "inspirational", - isFavorite: true - } + isFavorite: true, + }, }); await insert(db, { @@ -32,8 +32,8 @@ t.test("facets", (t) => { quote: "I have not failed. I've just found 10,000 ways that won't work.", meta: { tag: "inspirational", - isFavorite: true - } + isFavorite: true, + }, }); await insert(db, { @@ -41,17 +41,18 @@ t.test("facets", (t) => { quote: "It does not matter how slowly you go as long as you do not stop.", meta: { tag: "inspirational", - isFavorite: false - } + isFavorite: false, + }, }); await insert(db, { author: "Helen Keller", - quote: "The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart.", + quote: + "The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart.", meta: { tag: "love", - isFavorite: true - } + isFavorite: true, + }, }); await insert(db, { @@ -59,8 +60,8 @@ t.test("facets", (t) => { quote: "Your time is limited, so don't waste it living someone else's life.", meta: { tag: "inspirational", - isFavorite: false - } + isFavorite: false, + }, }); await insert(db, { @@ -68,8 +69,8 @@ t.test("facets", (t) => { quote: "The only way to do great work is to love what you do.", meta: { tag: "inspirational", - isFavorite: false - } + isFavorite: false, + }, }); const results = await search(db, { @@ -80,15 +81,16 @@ t.test("facets", (t) => { false: false, }, "meta.tag": {}, - "author": {} - }}); - - t.same(results.facets?.['meta.isFavorite'].count, 2) - t.same(results.facets?.['meta.isFavorite'].values, { true: 1, false: 2 }); - t.same(results.facets?.['meta.tag'].count, 1); - t.same(results.facets?.['meta.tag'].values, { inspirational: 3 }); - t.same(results.facets?.author.count, 2); - t.same(results.facets?.author.values, { "Steve Jobs": 2, "Thomas A. Edison": 1 }); + author: {}, + }, + }); + + t.same(results.facets?.["meta.isFavorite"].count, 2); + t.same(results.facets?.["meta.isFavorite"].values, { true: 1, false: 2 }); + t.same(results.facets?.["meta.tag"].count, 1); + t.same(results.facets?.["meta.tag"].values, { inspirational: 3 }); + t.same(results.facets?.author.count, 2); + t.same(results.facets?.author.values, { "Steve Jobs": 2, "Thomas A. Edison": 1 }); }); t.test("should correctly handle range facets", async t => { @@ -98,49 +100,49 @@ t.test("facets", (t) => { schema: { name: "string", price: "number", - category: "string" - } + category: "string", + }, }); await insert(db, { name: "Chocolate", price: 1.99, - category: "groceries" - }) + category: "groceries", + }); await insert(db, { name: "Milk", price: 2.99, - category: "groceries" - }) + category: "groceries", + }); await insert(db, { name: "Bread", price: 3.99, - category: "groceries" - }) + category: "groceries", + }); await insert(db, { name: "Eggs", price: 4.99, - category: "groceries" - }) + category: "groceries", + }); await insert(db, { name: "Cheese", price: 5.99, - category: "groceries" - }) + category: "groceries", + }); await insert(db, { name: "Butter", price: 6.99, - category: "groceries" - }) + category: "groceries", + }); const results = await search(db, { term: "groceries", - properties: ['category'], + properties: ["category"], facets: { price: { ranges: [ @@ -148,16 +150,15 @@ t.test("facets", (t) => { { from: 2, to: 4 }, { from: 4, to: 6 }, { from: 6, to: 8 }, - ] - } - } + ], + }, + }, }); t.same(results.facets?.price.count, 4); - t.same(results.facets?.price.values['0-2'], 1); - t.same(results.facets?.price.values['2-4'], 2); - t.same(results.facets?.price.values['4-6'], 2); - t.same(results.facets?.price.values['6-8'], 1); - + t.same(results.facets?.price.values["0-2"], 1); + t.same(results.facets?.price.values["2-4"], 2); + t.same(results.facets?.price.values["4-6"], 2); + t.same(results.facets?.price.values["6-8"], 1); }); -}); \ No newline at end of file +}); diff --git a/tests/filters.test.ts b/tests/filters.test.ts index 6999498e3..770fe784e 100644 --- a/tests/filters.test.ts +++ b/tests/filters.test.ts @@ -2,50 +2,52 @@ import t from "tap"; import { create, insert, search, remove } from "../src/index.js"; async function createSimpleDB() { + let i = 0; const db = await create({ schema: { - id: 'string', - name: 'string', - rating: 'number', - price: 'number', + name: "string", + rating: "number", + price: "number", meta: { - sales: 'number', - } - } + sales: "number", + }, + }, + components: { + getDocumentIndexId(): string { + return `__${++i}`; + }, + }, }); await insert(db, { - id: '__1', - name: 'washing machine', + name: "washing machine", rating: 5, price: 900, meta: { sales: 100, - } + }, }); await insert(db, { - id: '__2', - name: 'coffee maker', + name: "coffee maker", rating: 3, price: 30, meta: { sales: 25, - } + }, }); await insert(db, { - id: '__3', - name: 'coffee maker deluxe', + name: "coffee maker deluxe", rating: 5, price: 45, meta: { sales: 25, - } + }, }); return db; -} +} t.test("filters", t => { t.plan(8); @@ -56,16 +58,16 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_gt = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { gt: 4, - } - } + }, + }, }); t.equal(r1_gt.count, 1); - t.equal(r1_gt.hits[0].id, '__3'); + t.equal(r1_gt.hits[0].id, "__3"); }); t.test("greater than or equal to", async t => { @@ -74,17 +76,17 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_gte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { gte: 3, - } - } + }, + }, }); t.equal(r1_gte.count, 2); - t.equal(r1_gte.hits[0].id, '__2'); - t.equal(r1_gte.hits[1].id, '__3'); + t.equal(r1_gte.hits[0].id, "__2"); + t.equal(r1_gte.hits[1].id, "__3"); }); t.test("less than", async t => { @@ -93,16 +95,16 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lt = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { lt: 5, - } - } + }, + }, }); t.equal(r1_lt.count, 1); - t.equal(r1_lt.hits[0].id, '__2'); + t.equal(r1_lt.hits[0].id, "__2"); }); t.test("less than or equal to", async t => { @@ -111,16 +113,16 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { lte: 3, - } - } + }, + }, }); t.equal(r1_lte.count, 1); - t.equal(r1_lte.hits[0].id, '__2'); + t.equal(r1_lte.hits[0].id, "__2"); }); t.test("equal", async t => { @@ -129,16 +131,16 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { eq: 3, - } - } + }, + }, }); t.equal(r1_lte.count, 1); - t.equal(r1_lte.hits[0].id, '__2'); + t.equal(r1_lte.hits[0].id, "__2"); }); t.test("between", async t => { @@ -147,16 +149,16 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { between: [1, 4], - } - } + }, + }, }); t.equal(r1_lte.count, 1); - t.equal(r1_lte.hits[0].id, '__2'); + t.equal(r1_lte.hits[0].id, "__2"); }); t.test("multiple filters", async t => { @@ -165,19 +167,19 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { between: [1, 4], }, price: { lte: 40, - } - } + }, + }, }); t.equal(r1_lte.count, 1); - t.equal(r1_lte.hits[0].id, '__2'); + t.equal(r1_lte.hits[0].id, "__2"); }); t.test("multiple filters, and operation", async t => { @@ -186,7 +188,7 @@ t.test("filters", t => { const db = await createSimpleDB(); const r1_lte = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { between: [1, 4], @@ -194,14 +196,14 @@ t.test("filters", t => { price: { lte: 40, }, - 'meta.sales': { - eq: 25 - } + "meta.sales": { + eq: 25, + }, }, }); t.equal(r1_lte.count, 1); - t.equal(r1_lte.hits[0].id, '__2'); + t.equal(r1_lte.hits[0].id, "__2"); }); }); @@ -210,66 +212,66 @@ t.test("filters after removing docs", t => { t.test("remove doc with simple schema", async t => { t.plan(3); - + const db = await createSimpleDB(); - + const r1_gt = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { gt: 4, - } - } + }, + }, }); - + t.equal(r1_gt.count, 1); - t.equal(r1_gt.hits[0].id, '__3'); - - await remove(db, '__3'); - + t.equal(r1_gt.hits[0].id, "__3"); + + await remove(db, "__3"); + const r2_gt = await search(db, { - term: 'coffee', + term: "coffee", where: { rating: { gt: 4, - } - } + }, + }, }); - + t.equal(r2_gt.count, 0); }); t.test("remove doc on nested schema", async t => { t.plan(5); - + const db = await createSimpleDB(); - + const r1_gt = await search(db, { - term: 'coffee', + term: "coffee", where: { - 'meta.sales': { - eq: 25 - } - } + "meta.sales": { + eq: 25, + }, + }, }); t.equal(r1_gt.count, 2); - t.equal(r1_gt.hits[0].id, '__2'); - t.equal(r1_gt.hits[1].id, '__3'); - - await remove(db, '__3'); - + t.equal(r1_gt.hits[0].id, "__2"); + t.equal(r1_gt.hits[1].id, "__3"); + + await remove(db, "__3"); + const r2_gt = await search(db, { - term: 'coffee', + term: "coffee", where: { - 'meta.sales': { - eq: 25 - } - } + "meta.sales": { + eq: 25, + }, + }, }); - + t.equal(r2_gt.count, 1); - t.equal(r2_gt.hits[0].id, '__2'); + t.equal(r2_gt.hits[0].id, "__2"); }); }); @@ -278,21 +280,19 @@ t.test("should throw when using multiple operators", async t => { const db = await createSimpleDB(); - try { - await search(db, { - term: 'coffee', - where: { - rating: { - // @ts-expect-error error case - gt: 4, - // @ts-expect-error error case - lte: 10 - } - } - }); - } catch (error) { - t.equal(error.message, 'You can only use one operation per filter. Found 2: gt, lte'); - } + await t.rejects( + () => + search(db, { + term: "coffee", + where: { + rating: { + gt: 4, + lte: 10, + }, + }, + }), + { code: "INVALID_FILTER_OPERATION" }, + ); }); t.test("boolean filters", async t => { @@ -300,61 +300,60 @@ t.test("boolean filters", async t => { const db = await create({ schema: { - id: 'string', - isAvailable: 'boolean', - name: 'string' - } + id: "string", + isAvailable: "boolean", + name: "string", + }, }); await insert(db, { - id: '1', + id: "1", isAvailable: true, - name: 'coffee' + name: "coffee", }); await insert(db, { - id: '2', + id: "2", isAvailable: true, - name: 'coffee machine' + name: "coffee machine", }); await insert(db, { - id: '3', + id: "3", isAvailable: false, - name: 'coffee maker' + name: "coffee maker", }); const r1 = await search(db, { - term: 'coffee', + term: "coffee", where: { isAvailable: true, - } + }, }); t.equal(r1.count, 2); - t.equal(r1.hits[0].id, '1'); - t.equal(r1.hits[1].id, '2'); + t.equal(r1.hits[0].id, "1"); + t.equal(r1.hits[1].id, "2"); const r2 = await search(db, { - term: 'coffee', + term: "coffee", where: { isAvailable: false, - } + }, }); t.equal(r2.count, 1); - t.equal(r2.hits[0].id, '3'); + t.equal(r2.hits[0].id, "3"); - await remove(db, '2'); + await remove(db, "2"); const r3 = await search(db, { - term: 'coffee', + term: "coffee", where: { isAvailable: true, - } + }, }); t.equal(r3.count, 1); - t.equal(r3.hits[0].id, '1'); - -}); \ No newline at end of file + t.equal(r3.hits[0].id, "1"); +}); diff --git a/tests/insert.test.ts b/tests/insert.test.ts index 1580306f1..6f0bb77b7 100644 --- a/tests/insert.test.ts +++ b/tests/insert.test.ts @@ -1,6 +1,6 @@ import t from "tap"; import { create } from "../src/methods/create.js"; -import { insert, insertBatch } from "../src/methods/insert.js"; +import { insert, insertMultiple } from "../src/methods/insert.js"; t.test("insert", t => { t.plan(6); @@ -25,8 +25,8 @@ t.test("insert", t => { name: "Doe", }); - t.equal(i1.id, "john-01"); - t.equal(i2.id, "doe-02"); + t.equal(i1, "john-01"); + t.equal(i2, "doe-02"); }); t.test("should use the custom 'id' function passed in the configuration object", async t => { @@ -37,26 +37,25 @@ t.test("insert", t => { id: "string", name: "string", }, + components: { + getDocumentIndexId(doc: { name: string }): string { + return `${doc.name.toLowerCase()}-foo-bar-baz`; + }, + }, }); - const i1 = await insert( - db, - { - id: "john-01", - name: "John", - }, - { - id: doc => `${doc.name.toLowerCase()}-foo-bar-baz`, - }, - ); + const i1 = await insert(db, { + id: "john-01", + name: "John", + }); const i2 = await insert(db, { id: "doe-02", name: "Doe", }); - t.equal(i1.id, "john-foo-bar-baz"); - t.equal(i2.id, "doe-02"); + t.equal(i1, "john-foo-bar-baz"); + t.equal(i2, "doe-foo-bar-baz"); }); t.test("should throw an error if the 'id' field is not a string", async t => { @@ -72,11 +71,10 @@ t.test("insert", t => { await t.rejects( () => insert(db, { - // @ts-expect-error error case id: 123, name: "John", }), - { message: '"id" must be of type "string". Got "number" instead.' }, + { code: "DOCUMENT_ID_MUST_BE_STRING" }, ); }); @@ -101,7 +99,7 @@ t.test("insert", t => { id: "john-01", name: "John", }), - { message: 'Document with ID "john-01" already exists.' }, + { code: "DOCUMENT_ALREADY_EXISTS" }, ); }); @@ -115,15 +113,14 @@ t.test("insert", t => { }); const i1 = await insert(db, { - // @ts-expect-error error case id: "john-01", name: "John", }); - t.equal(i1.id, "john-01"); + t.equal(i1, "john-01"); }); - t.test("custom ID should work with insertBatch as well", async t => { + t.test("custom ID should work with insertMultiple as well", async t => { t.plan(1); const db = await create({ @@ -131,25 +128,24 @@ t.test("insert", t => { id: "string", name: "string", }, + components: { + getDocumentIndexId(doc: { id: string; name: string }): string { + return `${doc.name.toLowerCase()}-${doc.id}`; + }, + }, }); - await insertBatch( - db, - [ - { - id: "01", - name: "John", - }, - { - id: "02", - name: "Doe", - }, - ], + const ids = await insertMultiple(db, [ { - id: doc => `${doc.name.toLowerCase()}-${doc.id}`, + id: "01", + name: "John", }, - ); + { + id: "02", + name: "Doe", + }, + ]); - t.same(Object.keys(db.docs), ["john-01", "doe-02"]); + t.strictSame(ids, ["john-01", "doe-02"]); }); }); diff --git a/tests/levenshtein.test.ts b/tests/levenshtein.test.ts index 2bb922e8e..800914d10 100644 --- a/tests/levenshtein.test.ts +++ b/tests/levenshtein.test.ts @@ -1,5 +1,5 @@ import t from "tap"; -import { boundedLevenshtein, levenshtein } from "../src/levenshtein.js"; +import { boundedLevenshtein, levenshtein } from "../src/components/levenshtein.js"; t.test("levenshtein", t => { t.plan(3); diff --git a/tests/lyra.test.ts b/tests/main.test.ts similarity index 87% rename from tests/lyra.test.ts rename to tests/main.test.ts index ae3f31bfe..c3acab6a1 100644 --- a/tests/lyra.test.ts +++ b/tests/main.test.ts @@ -1,10 +1,13 @@ import t from "tap"; -import { create, insert, insertBatch, insertWithHooks, remove, search } from "../src/index.js"; +import type { Document } from "../src/types"; +import { DocumentsStore } from "../src/components/documents-store.js"; +import { Index } from "../src/components/index.js"; +import { create, insert, insertMultiple, remove, search } from "../src/index.js"; +import { createTokenizer } from "../src/tokenizer/index.js"; import { SUPPORTED_LANGUAGES } from "../src/tokenizer/languages.js"; -import { INVALID_DOC_SCHEMA, LANGUAGE_NOT_SUPPORTED } from "../src/errors.js"; import dataset from "./datasets/events.json" assert { type: "json" }; -interface BaseDataEvent { +interface BaseDataEvent extends Document { description: string; lang: string; category1: string; @@ -36,7 +39,7 @@ t.test("defaultLanguage", t => { schema: {}, defaultLanguage: "latin", }), - { message: LANGUAGE_NOT_SUPPORTED("latin") }, + { code: "LANGUAGE_NOT_SUPPORTED" }, ); }); @@ -54,9 +57,9 @@ t.test("defaultLanguage", t => { { foo: "bar", }, - { language: "latin" }, + "latin", ), - { message: LANGUAGE_NOT_SUPPORTED("latin") }, + { code: "LANGUAGE_NOT_SUPPORTED" }, ); }); @@ -106,7 +109,7 @@ t.test("defaultLanguage", t => { }); }); -t.test("checkInsertDocSchema", t => { +t.test("document validation", t => { t.plan(3); t.test("should compare the inserted doc with the schema definition", async t => { @@ -119,16 +122,16 @@ t.test("checkInsertDocSchema", t => { }, }); - t.ok((await insert(db, { quote: "hello, world!", author: "me" })).id); + t.ok(await insert(db, { quote: "hello, world!", author: "me" })); - // @ts-expect-error test error case await t.rejects(() => insert(db, { quote: "hello, world!", author: true }), { - message: INVALID_DOC_SCHEMA({ quote: "string", author: "string" }, { quote: "hello, world!", author: true }), + code: "INVALID_DOCUMENT_PROPERTY", }); }); t.test("should allow doc with missing schema keys to be inserted without indexing those keys", async t => { t.plan(6); + const db = await create({ schema: { quote: "string", @@ -137,25 +140,21 @@ t.test("checkInsertDocSchema", t => { }); await insert(db, { quote: "hello, world!", - // @ts-expect-error test error case authors: "author should be singular", }); - t.ok(Object.keys(db.docs).length === 1); + t.equal(Object.keys((db.data.docs as DocumentsStore).docs).length, 1); const docWithExtraKey = { quote: "hello, world!", foo: { bar: 10 } }; - // @ts-expect-error test error case + const insertedInfo = await insert(db, docWithExtraKey); - t.ok(insertedInfo.id); - t.equal(Object.keys(db.docs).length, 2); - t.ok( - insertedInfo.id in db.docs && - // @ts-expect-error test error case - "foo" in db.docs[insertedInfo.id], - ); - // @ts-expect-error test error case - t.same(docWithExtraKey.foo, db.docs[insertedInfo.id].foo); - t.notOk(db.index.foo); + + t.ok(insertedInfo); + t.equal(Object.keys((db.data.docs as DocumentsStore).docs).length, 2); + + t.ok("foo" in (db.data.docs as DocumentsStore).docs[insertedInfo]!); + t.same(docWithExtraKey.foo, (db.data.docs as DocumentsStore).docs[insertedInfo]!.foo); + t.notOk("foo" in (db.data.index as Index).indexes); }); t.test( @@ -194,18 +193,26 @@ t.test("checkInsertDocSchema", t => { }; const insertedInfo = await insert(db, nestedExtraKeyDoc); - t.ok(insertedInfo.id); - t.equal(Object.keys(db.docs).length, 1); + t.ok(insertedInfo); + t.equal(Object.keys((db.data.docs as DocumentsStore).docs).length, 1); + + t.same( + nestedExtraKeyDoc.unexpectedProperty, + (db.data.docs as DocumentsStore).docs[insertedInfo]!.unexpectedProperty, + ); - // @ts-expect-error test error case - t.same(nestedExtraKeyDoc.unexpectedProperty, db.docs[insertedInfo.id].unexpectedProperty); - // @ts-expect-error test error case - t.same(nestedExtraKeyDoc.tag.unexpectedNestedProperty, db.docs[insertedInfo.id].tag.unexpectedNestedProperty); - t.notOk(db.index.unexpectedProperty); - t.notOk(db.index["tag.unexpectedProperty"]); + t.same( + nestedExtraKeyDoc.tag.unexpectedNestedProperty, + ((db.data.docs as DocumentsStore).docs[insertedInfo]!.tag as unknown as Record) + .unexpectedNestedProperty, + ); + + t.notOk("unexpectedProperty" in (db.data.index as Index).indexes); + t.notOk("tag.unexpectedProperty" in (db.data.index as Index).indexes); }, ); }); + t.test("lyra", t => { t.plan(19); @@ -249,8 +256,8 @@ t.test("lyra", t => { const result7 = await search(db, { term: "They are the best" }); const result8 = await search(db, { term: "Foxes are nice animals" }); - t.equal(result7.count, 4); - t.equal(result8.count, 4); + t.equal(result7.count, 2); + t.equal(result8.count, 2); }); t.test("should correctly search for data returning doc including with unindexed keys", async t => { @@ -261,6 +268,9 @@ t.test("lyra", t => { quote: "string", author: "string", }, + components: { + tokenizer: await createTokenizer("english", { stemming: false, stopWords: false }), + }, }); const documentWithUnindexedField = { @@ -280,8 +290,8 @@ t.test("lyra", t => { const result1 = await search(db, { term: "They are the best" }); const result2 = await search(db, { term: "Foxes are nice animals" }); - t.equal(result1.count, 2); - t.equal(result2.count, 2); + t.equal(result1.count, 1); + t.equal(result2.count, 1); t.same(result1.hits[0].document, documentWithUnindexedField); t.same(result2.hits[0].document, documentWithNestedUnindexedField); }); @@ -299,14 +309,12 @@ t.test("lyra", t => { await insert(db, { quote: "I like dogs. They are the best.", author: "Jane Doe", - //@ts-expect-error test error case nested: { unindexedNestedField: "unindexedNestedValue" }, }); await insert(db, { quote: "I like cats. They are the best.", author: "Jane Doe", - //@ts-expect-error test error case unindexedField: "unindexedValue", }); @@ -332,7 +340,7 @@ t.test("lyra", t => { properties: ["example"], }); - t.ok(ex1Insert.id); + t.ok(ex1Insert); t.equal(ex1Search.count, 1); t.type(ex1Search.elapsed, "bigint"); t.equal(ex1Search.hits[0].document.example, "The quick, brown, fox"); @@ -382,18 +390,16 @@ t.test("lyra", t => { () => search(db, { term: "foo", - //@ts-expect-error test error case properties: ["bar"], }), { - message: - 'Invalid property name. Expected a wildcard string ("*") or array containing one of the following properties: foo, baz, but got: bar', + code: "UNKNOWN_INDEX", }, ); }); t.test("Should correctly remove a document after its insertion", async t => { - t.plan(5); + t.plan(4); const db = await create({ schema: { @@ -402,12 +408,12 @@ t.test("lyra", t => { }, }); - const { id: id1 } = await insert(db, { + const id1 = await insert(db, { quote: "Be yourself; everyone else is already taken.", author: "Oscar Wilde", }); - const { id: id2 } = await insert(db, { + const id2 = await insert(db, { quote: "To live is the rarest thing in the world. Most people exist, that is all.", author: "Oscar Wilde", }); @@ -417,14 +423,13 @@ t.test("lyra", t => { author: "Frank Zappa", }); - const res = remove(db, id1); + await remove(db, id1); const searchResult = await search(db, { term: "Oscar", properties: ["author"], }); - t.ok(res); t.equal(searchResult.count, 1); t.equal(searchResult.hits[0].document.author, "Oscar Wilde"); t.equal( @@ -444,11 +449,11 @@ t.test("lyra", t => { }, }); - const { id: halo } = await insert(lyra, { word: "Halo" }); + const halo = await insert(lyra, { word: "Halo" }); await insert(lyra, { word: "Halloween" }); await insert(lyra, { word: "Hal" }); - remove(lyra, halo); + await remove(lyra, halo); const searchResult = await search(lyra, { term: "Hal", @@ -470,7 +475,7 @@ t.test("lyra", t => { }, }); - const { id: harryPotter } = await insert(movieDB, { + const harryPotter = await insert(movieDB, { title: "Harry Potter and the Philosopher's Stone", director: "Chris Columbus", plot: "Harry Potter, an eleven-year-old orphan, discovers that he is a wizard and is invited to study at Hogwarts. Even as he escapes a dreary life and enters a world of magic, he finds trouble awaiting him.", @@ -483,7 +488,7 @@ t.test("lyra", t => { properties: ["title", "director", "plot"], }); - remove(movieDB, harryPotter); + await remove(movieDB, harryPotter); const testSearch2 = await search(movieDB, { term: "Harry Potter", @@ -514,7 +519,7 @@ t.test("lyra", t => { const { id } = searchResult.hits[0]; - remove(db, id); + await remove(db, id); const searchResult2 = await search(db, { term: "stelle", exact: true }); @@ -537,7 +542,7 @@ t.test("lyra", t => { const searchResult = await search(db, { term: "abc", exact: true }); const id = searchResult.hits[0].id; - remove(db, id); + await remove(db, id); const searchResult2 = await search(db, { term: "abc", exact: true }); @@ -546,7 +551,7 @@ t.test("lyra", t => { }); t.test("Should preserve identical docs after deletion", async t => { - t.plan(9); + t.plan(8); const db = await create({ schema: { @@ -555,12 +560,12 @@ t.test("lyra", t => { }, }); - const { id: id1 } = await insert(db, { + const id1 = await insert(db, { quote: "Be yourself; everyone else is already taken.", author: "Oscar Wilde", }); - const { id: id2 } = await insert(db, { + const id2 = await insert(db, { quote: "Be yourself; everyone else is already taken.", author: "Oscar Wilde", }); @@ -570,7 +575,7 @@ t.test("lyra", t => { author: "Frank Zappa", }); - const res = remove(db, id1); + await remove(db, id1); const searchResult = await search(db, { term: "Oscar", @@ -582,7 +587,6 @@ t.test("lyra", t => { properties: ["quote"], }); - t.ok(res); t.equal(searchResult.count, 1); t.equal(searchResult.hits[0].document.author, "Oscar Wilde"); t.equal(searchResult.hits[0].document.quote, "Be yourself; everyone else is already taken."); @@ -864,13 +868,13 @@ t.test("lyra", t => { const wrongSchemaDocs: WrongDataEvent[] = docs.map(doc => ({ ...doc, date: +new Date() })); try { - await insertBatch(db, docs); - t.equal(Object.keys(db.docs).length, 4000); + await insertMultiple(db, docs); + t.equal(Object.keys((db.data.docs as DocumentsStore).docs).length, 4000); // eslint-disable-next-line no-empty } catch (_e) {} - await t.rejects(() => insertBatch(db, wrongSchemaDocs as unknown as DataEvent[])); + await t.rejects(() => insertMultiple(db, wrongSchemaDocs as unknown as DataEvent[])); }); }); @@ -883,13 +887,13 @@ t.test("lyra - hooks", t => { () => create({ schema: { date: "string" }, - hooks: { + components: { ["anotherHookName" as string]: () => { t.fail("it shouldn't be called"); }, }, }), - { message: "The following hooks aren't supported. Hooks: anotherHookName" }, + { code: "UNSUPPORTED_COMPONENT" }, ); }); @@ -903,14 +907,13 @@ t.test("lyra - hooks", t => { surname: "string", }, }, - hooks: { - // eslint-disable-next-line @typescript-eslint/no-unused-vars - afterInsert: function (_id: string): void { + components: { + afterInsert(): void { t.same(++callOrder, 1); }, }, }); - await insertWithHooks(db, { + await insert(db, { quote: "Harry Potter, the boy who lived, come to die. Avada kedavra.", author: { name: "Tom", @@ -932,7 +935,9 @@ t.test("custom tokenizer configuration", t => { }, components: { tokenizer: { - tokenizerFn: text => text.split(","), + tokenize(text: string) { + return text.split(","); + }, }, }, }); @@ -978,12 +983,14 @@ t.test("should access own properties exclusively", async t => { }); t.test("should search numbers in supported languages", async t => { - for (const supportedLanguage of SUPPORTED_LANGUAGES) { + for (const language of SUPPORTED_LANGUAGES) { const db = await create({ schema: { number: "string", }, - defaultLanguage: supportedLanguage, + components: { + tokenizer: await createTokenizer(language, { stemming: false }), + }, }); await insert(db, { @@ -994,7 +1001,7 @@ t.test("should search numbers in supported languages", async t => { term: "123", }); - t.same(searchResult.count, 1, `Language: ${supportedLanguage}`); + t.same(searchResult.count, 1, `Language: ${language}`); } t.end(); @@ -1006,7 +1013,9 @@ t.test("should correctly search accented words in Italian", async t => { schema: { description: "string", }, - defaultLanguage: "italian", + components: { + tokenizer: await createTokenizer("italian", { stemming: false }), + }, }); await insert(db, { @@ -1025,7 +1034,9 @@ t.test("should correctly search accented words in English", async t => { schema: { description: "string", }, - defaultLanguage: "english", + components: { + tokenizer: await createTokenizer("english", { stemming: false }), + }, }); await insert(db, { @@ -1044,7 +1055,9 @@ t.test("should correctly search accented words in Dutch", async t => { schema: { description: "string", }, - defaultLanguage: "dutch", + components: { + tokenizer: await createTokenizer("dutch", { stemming: false }), + }, }); await insert(db, { @@ -1062,7 +1075,9 @@ t.test("should correctly search accented words in Slovenian", async t => { schema: { description: "string", }, - defaultLanguage: "slovenian", + components: { + tokenizer: await createTokenizer("slovenian", { stemming: false }), + }, }); await insert(db, { @@ -1088,7 +1103,9 @@ t.test("should correctly search words in Bulgarian", async t => { schema: { description: "string", }, - defaultLanguage: "bulgarian", + components: { + tokenizer: await createTokenizer("bulgarian", { stemming: false }), + }, }); await insert(db, { diff --git a/tests/remove.test.ts b/tests/remove.test.ts index 32f1bc0cf..9c517128d 100644 --- a/tests/remove.test.ts +++ b/tests/remove.test.ts @@ -2,10 +2,11 @@ import t from "tap"; import { remove } from "../src/methods/remove.js"; import { create } from "../src/methods/create.js"; import { insert } from "../src/methods/insert.js"; +import { Index } from "../src/components/index.js"; t.test("remove method", t => { t.plan(1); - + t.test("should remove a document and update field length", async t => { t.plan(2); @@ -15,8 +16,8 @@ t.test("remove method", t => { author: "string", meta: { tags: "string", - } - } + }, + }, }); await insert(db, { @@ -24,7 +25,7 @@ t.test("remove method", t => { author: "John Lennon", meta: { tags: "music, life, music", - } + }, }); await insert(db, { @@ -32,7 +33,7 @@ t.test("remove method", t => { author: "Richard Feynman", meta: { tags: "physics, science, philosophy", - } + }, }); await insert(db, { @@ -40,23 +41,23 @@ t.test("remove method", t => { author: "Henry Thoreau", meta: { tags: "life, philosophy, dreams, imagination", - } + }, }); - const fieldLengths = {...db.fieldLengths}; - const avgFieldLength = {...db.avgFieldLength}; + const fieldLengths = { ...(db.data.index as Index).fieldLengths }; + const avgFieldLength = { ...(db.data.index as Index).avgFieldLength }; const d1 = await insert(db, { quote: "It is during our darkest moments that we must focus to see the light.", author: "Aristotle", meta: { tags: "philosophy, life, light", - } + }, }); - await remove(db, d1.id); + await remove(db, d1); - t.same(db.fieldLengths, fieldLengths); - t.same(db.avgFieldLength, avgFieldLength); + t.same((db.data.index as Index).fieldLengths, fieldLengths); + t.same((db.data.index as Index).avgFieldLength, avgFieldLength); }); -}); \ No newline at end of file +}); diff --git a/tests/lyra.edge.test.ts b/tests/serialization.test.ts similarity index 63% rename from tests/lyra.edge.test.ts rename to tests/serialization.test.ts index c3704365a..94c0f5020 100644 --- a/tests/lyra.edge.test.ts +++ b/tests/serialization.test.ts @@ -1,26 +1,26 @@ -import type { PropertiesSchema, ResolveSchema } from "../src/types/index.js"; -import type { RadixNode } from "../src/trees/radix/node.js"; -import type { RetrievedDoc } from "../src/methods/search.js"; import t from "tap"; -import { create, insert, load, save, search } from "../src/index.js"; -import { contains as trieContains } from "../src/trees/radix/index.js"; - -function extractOriginalDoc(result: RetrievedDoc[]): ResolveSchema[] { - return result.map(({ document }: RetrievedDoc) => document); +import type { Document } from "../src/types.js"; +import { Node as RadixNode } from "../src/trees/radix.js"; +import { create, insert, load, Result, save, search } from "../src/index.js"; +import { contains as trieContains } from "../src/trees/radix.js"; +import { Index } from "../src/components/index.js"; +import { DocumentsStore } from "../src/components/documents-store.js"; + +function extractOriginalDoc(result: Result[]): Document[] { + return result.map(({ document }: Result) => document); } t.test("Edge getters", t => { - t.plan(5); + t.plan(4); t.test("should correctly enable edge index getter", async t => { - t.plan(3); + t.plan(2); const db = await create({ schema: { name: "string", age: "number", }, - edge: true, }); await insert(db, { @@ -33,13 +33,12 @@ t.test("Edge getters", t => { age: 25, }); - const { index, defaultLanguage } = await save(db); - const nameIndex = index["name"]; + const { index } = await save(db); + const nameIndex = (index as Index).indexes["name"]; // Remember that tokenizers an stemmers sets content to lowercase t.ok(trieContains(nameIndex as RadixNode, "john")); t.ok(trieContains(nameIndex as RadixNode, "jane")); - t.same(defaultLanguage, "english"); }); t.test("should correctly enable edge docs getter", async t => { @@ -50,7 +49,6 @@ t.test("Edge getters", t => { name: "string", age: "number", }, - edge: true, }); const doc1 = await insert(db, { @@ -65,8 +63,8 @@ t.test("Edge getters", t => { const { docs } = await save(db); - t.strictSame(docs[doc1.id], { name: "John", age: 30 }); - t.strictSame(docs[doc2.id], { name: "Jane", age: 25 }); + t.strictSame((docs as DocumentsStore).docs[doc1], { name: "John", age: 30 }); + t.strictSame((docs as DocumentsStore).docs[doc2], { name: "Jane", age: 25 }); }); t.test("should correctly enable index setter", async t => { @@ -77,7 +75,6 @@ t.test("Edge getters", t => { name: "string", age: "number", }, - edge: true, }); const jonh = { @@ -108,7 +105,6 @@ t.test("Edge getters", t => { name: "string", age: "number", }, - edge: true, }); await insert(db2, michele); @@ -131,7 +127,7 @@ t.test("Edge getters", t => { t.strictSame(extractOriginalDoc(search4.hits), [michele]); }); - t.test("It should correctly save and load data", async t => { + t.test("should correctly save and load data", async t => { t.plan(2); const originalDB = await create({ @@ -158,10 +154,9 @@ t.test("Edge getters", t => { name: "string", age: "number", }, - edge: true, }); - load(newDB, DBData); + await load(newDB, DBData); const search1 = await search(originalDB, { term: "Michele" }); const search2 = await search(newDB, { term: "Michele" }); @@ -172,42 +167,4 @@ t.test("Edge getters", t => { t.strictSame(search1.hits, search2.hits); t.strictSame(search3.hits, search4.hits); }); - - t.test("It should correctly save and load the defaultLanguage option", async t => { - t.plan(2); - const db = await create({ - schema: { - name: "string", - age: "number", - }, - edge: true, - defaultLanguage: "italian", - }); - - const db2 = await create({ - schema: { - name: "string", - age: "number", - }, - edge: true, - }); - - await insert(db, { - name: "Michele", - age: 27, - }); - - await insert(db, { - name: "John", - age: 25, - }); - - const originalInstance = await save(db); - load(db2, originalInstance); - - const { defaultLanguage } = originalInstance; - - t.same(originalInstance.defaultLanguage, "italian"); - t.same(defaultLanguage, "italian"); - }); }); diff --git a/tests/snapshots/events.json b/tests/snapshots/events.json index ac3226284..cbb3d135e 100644 --- a/tests/snapshots/events.json +++ b/tests/snapshots/events.json @@ -4,7 +4,7 @@ "hits": [ { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-89", "description": "Social War:", @@ -17,7 +17,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-57", "description": "Gallic Wars:", @@ -30,7 +30,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-55", "description": "Gallic War", @@ -43,7 +43,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-54", "description": "Gallic Wars", @@ -56,7 +56,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-53", "description": "Parthian war:", @@ -69,7 +69,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-53", "description": "Gallic War:", @@ -82,7 +82,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-48", "description": "Civil War:", @@ -95,7 +95,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-47", "description": "Civil War:", @@ -108,7 +108,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "-46", "description": "Civil War:", @@ -121,7 +121,7 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "1809/03/13", "description": "Peninsular War", @@ -139,7 +139,7 @@ "hits": [ { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { "date": "1867/12/02", "description": "Paraguayan War.", @@ -152,117 +152,117 @@ }, { "id": "", - "score": 4.921722598465424, + "score": 4.744426329679039, "document": { - "date": "1950/02/14", - "description": " Cold War:", + "date": "1914/12/24", + "description": " World War I:", "granularity": "year", "categories": { - "first": "February", + "first": "December", "second": "" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.744426329679039, "document": { - "date": "-86", - "description": "First Mithridatic War", + "date": "1950/02/14", + "description": " Cold War:", "granularity": "year", "categories": { - "first": "By place", - "second": "Roman Republic" + "first": "February", + "second": "" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.087300377720357, "document": { - "date": "632/01/27", - "description": "Ridda Wars begins", + "date": "-113", + "description": "War between the Celtiberians and the Romans.", "granularity": "year", "categories": { "first": "By place", - "second": "Asia" + "second": "Roman Republic" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.087300377720357, "document": { - "date": "988", - "description": "Rus'–Byzantine War", + "date": "-86", + "description": "First Mithridatic War", "granularity": "year", "categories": { "first": "By place", - "second": "Europe" + "second": "Roman Republic" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.087300377720357, "document": { - "date": "1043/10/31", - "description": "Rus'-Byzantine War (1043).", + "date": "630", + "description": "The Byzantine-Arab Wars begin.", "granularity": "year", "categories": { - "first": "", - "second": "" + "first": "By place", + "second": "Byzantine Empire" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.087300377720357, "document": { - "date": "1861/04/27", - "description": " American Civil War:", + "date": "632/01/27", + "description": "Ridda Wars begins", "granularity": "year", "categories": { - "first": "April/June", - "second": "" + "first": "By place", + "second": "Asia" } } }, { "id": "", - "score": 4.349997355979656, + "score": 4.087300377720357, "document": { - "date": "1914/12/24", - "description": " World War I:", + "date": "941", + "description": "The Rus'-Byzantine War is fought.", "granularity": "year", "categories": { - "first": "December", - "second": "" + "first": "By place", + "second": "Asia" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "630", - "description": "The Byzantine-Arab Wars begin.", + "date": "988", + "description": "Rus'–Byzantine War", "granularity": "year", "categories": { "first": "By place", - "second": "Byzantine Empire" + "second": "Europe" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1522/12/20", - "description": "The Habsburg-Valois Wars begin.", + "date": "1043/10/31", + "description": "Rus'-Byzantine War (1043).", "granularity": "year", "categories": { - "first": "Date unknown", + "first": "", "second": "" } } @@ -274,23 +274,23 @@ "hits": [ { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1728/10/20", - "description": "The Meerkat–Mongoose war.", + "date": "1447/07/15", + "description": "The Albanian-Venetian War of 1447-1448.", "granularity": "year", "categories": { - "first": "In fiction", + "first": "Date unknown", "second": "" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1830/12/20", - "description": "The Java War ends.", + "date": "1470/10/30", + "description": "Start of the Anglo-Hanseatic War.", "granularity": "year", "categories": { "first": "Date unknown", @@ -300,105 +300,105 @@ }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1862/09/17", - "description": " American Civil War ampndash", + "date": "1522/12/20", + "description": "The Habsburg-Valois Wars begin.", "granularity": "year", "categories": { - "first": "July/September", + "first": "Date unknown", "second": "" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1879/01/11", - "description": " The Anglo-Zulu War begins.", + "date": "1558/01/22", + "description": " Beginning of the Livonian War.", "granularity": "year", "categories": { - "first": "January/March", + "first": "January/June", "second": "" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1919/02/14", - "description": " The Polish-Soviet War begins.", + "date": "1728/10/20", + "description": "The Meerkat–Mongoose war.", "granularity": "year", "categories": { - "first": "February", + "first": "In fiction", "second": "" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "1939/03/23", - "description": " The Slovak-Hungarian War begins.", + "date": "1830/12/20", + "description": "The Java War ends.", "granularity": "year", "categories": { - "first": "March", + "first": "Date unknown", "second": "" } } }, { "id": "", - "score": 3.860623950530306, + "score": 4.087300377720357, "document": { - "date": "2003/02/05", - "description": "War in Darfur begins.", + "date": "1857/04/04", + "description": " End of the Anglo-Persian War.", "granularity": "year", "categories": { - "first": "February", + "first": "April/June", "second": "" } } }, { "id": "", - "score": 3.4503660467084463, + "score": 4.087300377720357, "document": { - "date": "-156", - "description": "The first Dalmatian war begins.", + "date": "1861/04/27", + "description": " American Civil War:", "granularity": "year", "categories": { - "first": "By place", - "second": "Roman Republic" + "first": "April/June", + "second": "" } } }, { "id": "", - "score": 3.4503660467084463, + "score": 4.087300377720357, "document": { - "date": "-119", - "description": "The second Dalmatian war begins.", + "date": "1879/01/11", + "description": " The Anglo-Zulu War begins.", "granularity": "year", "categories": { - "first": "By place", - "second": "Roman Republic" + "first": "January/March", + "second": "" } } }, { "id": "", - "score": 3.4503660467084463, + "score": 4.087300377720357, "document": { - "date": "-78", - "description": "The Third Dalmatian war begins.", + "date": "1919/02/14", + "description": " The Polish-Soviet War begins.", "granularity": "year", "categories": { - "first": "By place", - "second": "Roman Republic" + "first": "February", + "second": "" } } } diff --git a/tests/tokenizer.test.ts b/tests/tokenizer.test.ts index 1bd9726c6..05587e15d 100644 --- a/tests/tokenizer.test.ts +++ b/tests/tokenizer.test.ts @@ -1,598 +1,372 @@ import t from "tap"; -import { LANGUAGE_NOT_SUPPORTED } from "../src/errors.js"; -import { defaultTokenizerConfig, normalizationCache, tokenize } from "../src/tokenizer/index.js"; -import { stopWords } from "../src/tokenizer/stop-words/index.js"; -import { stemmer as ENStemmer } from "../stemmer/lib/en.js"; + +import { createTokenizer, normalizationCache } from "../src/tokenizer/index.js"; +import { stemmer as BGStemmer } from "../stemmer/lib/bg.js"; +import { stemmer as DEStemmer } from "../stemmer/lib/de.js"; +import { stemmer as DKStemmer } from "../stemmer/lib/dk.js"; +import { stemmer as ESStemmer } from "../stemmer/lib/es.js"; +import { stemmer as FIStemmer } from "../stemmer/lib/fi.js"; import { stemmer as FRStemmer } from "../stemmer/lib/fr.js"; import { stemmer as ITStemmer } from "../stemmer/lib/it.js"; +import { stemmer as NLStemmer } from "../stemmer/lib/nl.js"; import { stemmer as NOStemmer } from "../stemmer/lib/no.js"; import { stemmer as PTStemmer } from "../stemmer/lib/pt.js"; import { stemmer as RUStemmer } from "../stemmer/lib/ru.js"; import { stemmer as SEStemmer } from "../stemmer/lib/se.js"; -import { stemmer as ESStemmer } from "../stemmer/lib/es.js"; -import { stemmer as NLStemmer } from "../stemmer/lib/nl.js"; -import { stemmer as DEStemmer } from "../stemmer/lib/de.js"; -import { stemmer as FIStemmer } from "../stemmer/lib/fi.js"; -import { stemmer as DKStemmer } from "../stemmer/lib/dk.js"; import { stemmer as UKStemmer } from "../stemmer/lib/uk.js"; -import { stemmer as BGStemmer } from "../stemmer/lib/bg.js"; -t.test("Tokenizer", t => { +t.test("Tokenizer", async t => { t.plan(15); - t.test("Should tokenize and stem correctly in english", t => { + t.test("Should tokenize and stem correctly in english", async t => { t.plan(2); + const tokenizer = await createTokenizer("english", { stopWords: false }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize(I1, "english"); - const O2 = tokenize(I2, "english"); + const O1 = tokenizer.tokenize(I1, "english"); + const O2 = tokenizer.tokenize(I2, "english"); t.strictSame(O1, ["the", "quick", "brown", "fox", "jump", "over", "lazi", "dog"]); t.strictSame(O2, ["i", "bake", "some", "cake"]); }); - t.test("Should tokenize and stem correctly in english and allow duplicates", t => { + t.test("Should tokenize and stem correctly in english and allow duplicates", async t => { t.plan(2); + const tokenizer = await createTokenizer("english", { allowDuplicates: true, stopWords: false }); + const I1 = "this is a test with test duplicates"; const I2 = "it's alive! it's alive!"; - const O1 = tokenize(I1, "english", true); - const O2 = tokenize(I2, "english", true); + const O1 = tokenizer.tokenize(I1, "english"); + const O2 = tokenizer.tokenize(I2, "english"); t.strictSame(O1, ["thi", "is", "a", "test", "with", "test", "duplic"]); t.strictSame(O2, ["it'", "aliv", "it'", "aliv"]); }); - t.test("Should tokenize and stem correctly in french", t => { + t.test("Should tokenize and stem correctly in french", async t => { t.plan(2); + const tokenizer = await createTokenizer("french", { stemmer: FRStemmer }); + const I1 = "voyons quel temps il fait dehors"; const I2 = "j'ai fait des gâteaux"; - const O1 = tokenize( - I1, - "french", - false, - defaultTokenizerConfig("french", { - stemmingFn: FRStemmer, - customStopWords: stopWords.french, - }), - ); - const O2 = tokenize( - I2, - "french", - false, - defaultTokenizerConfig("french", { - stemmingFn: FRStemmer, - customStopWords: stopWords.french, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["voyon", "temp", "fait", "dehor"]); t.strictSame(O2, ["fait", "gateau"]); }); - t.test("Should tokenize and stem correctly in italian", t => { + t.test("Should tokenize and stem correctly in italian", async t => { t.plan(2); + const tokenizer = await createTokenizer("italian", { stemmer: ITStemmer }); + const I1 = "ho cucinato delle torte"; const I2 = "dormire è una cosa difficile quando i test non passano"; - const O1 = tokenize( - I1, - "italian", - false, - defaultTokenizerConfig("italian", { - stemmingFn: ITStemmer, - customStopWords: stopWords.italian, - }), - ); - const O2 = tokenize( - I2, - "italian", - false, - defaultTokenizerConfig("italian", { - stemmingFn: ITStemmer, - customStopWords: stopWords.italian, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["cucin", "tort"]); t.strictSame(O2, ["dorm", "cos", "difficil", "quand", "test", "pass"]); }); - t.test("Should tokenize and stem correctly in norwegian", t => { + t.test("Should tokenize and stem correctly in norwegian", async t => { t.plan(2); + const tokenizer = await createTokenizer("norwegian", { stemmer: NOStemmer }); const I1 = "Jeg kokte noen kaker"; const I2 = "å sove er en vanskelig ting når testene mislykkes"; - const O1 = tokenize( - I1, - "norwegian", - false, - defaultTokenizerConfig("norwegian", { - stemmingFn: NOStemmer, - customStopWords: stopWords.norwegian, - }), - ); - const O2 = tokenize( - I2, - "norwegian", - false, - defaultTokenizerConfig("norwegian", { - stemmingFn: NOStemmer, - customStopWords: stopWords.norwegian, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["kokt", "kak"]); t.strictSame(O2, ["sov", "vansk", "ting", "test", "mislykk"]); }); - t.test("Should tokenize and stem correctly in portuguese", t => { + t.test("Should tokenize and stem correctly in portuguese", async t => { t.plan(2); + const tokenizer = await createTokenizer("portuguese", { stemmer: PTStemmer }); + const I1 = "Eu cozinhei alguns bolos"; const I2 = "dormir é uma coisa difícil quando os testes falham"; - const O1 = tokenize( - I1, - "portuguese", - false, - defaultTokenizerConfig("portuguese", { - stemmingFn: PTStemmer, - customStopWords: stopWords.portuguese, - }), - ); - const O2 = tokenize( - I2, - "portuguese", - false, - defaultTokenizerConfig("portuguese", { - stemmingFn: PTStemmer, - customStopWords: stopWords.portuguese, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["cozinh", "alguns", "bol"]); t.strictSame(O2, ["dorm", "e", "cois", "dificil", "test", "falh"]); }); - t.test("Should tokenize and stem correctly in russian", t => { + t.test("Should tokenize and stem correctly in russian", async t => { t.plan(2); + const tokenizer = await createTokenizer("russian", { stemmer: RUStemmer }); + const I1 = "я приготовила пирожные"; const I2 = "спать трудно, когда тесты не срабатывают"; - const O1 = tokenize( - I1, - "russian", - false, - defaultTokenizerConfig("russian", { - stemmingFn: RUStemmer, - customStopWords: stopWords.russian, - }), - ); - const O2 = tokenize( - I2, - "russian", - false, - defaultTokenizerConfig("russian", { - stemmingFn: RUStemmer, - customStopWords: stopWords.russian, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["приготов", "пирожн"]); t.strictSame(O2, ["спат", "трудн", "тест", "срабатыва"]); }); - t.test("Should tokenize and stem correctly in swedish", t => { + t.test("Should tokenize and stem correctly in swedish", async t => { t.plan(2); + const tokenizer = await createTokenizer("swedish", { stemmer: SEStemmer }); const I1 = "Jag lagade några kakor"; const I2 = "att sova är en svår sak när testerna misslyckas"; - const O1 = tokenize( - I1, - "swedish", - false, - defaultTokenizerConfig("swedish", { - stemmingFn: SEStemmer, - customStopWords: stopWords.swedish, - }), - ); - const O2 = tokenize( - I2, - "swedish", - false, - defaultTokenizerConfig("swedish", { - stemmingFn: SEStemmer, - customStopWords: stopWords.swedish, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["lag", "kak"]); t.strictSame(O2, ["sov", "svar", "sak", "test", "misslyck"]); }); - t.test("Should tokenize and stem correctly in spanish", t => { + t.test("Should tokenize and stem correctly in spanish", async t => { t.plan(2); + const tokenizer = await createTokenizer("spanish", { stemmer: ESStemmer }); + const I1 = "cociné unos pasteles"; const I2 = "dormir es algo dificil cuando las pruebas fallan"; - const O1 = tokenize( - I1, - "spanish", - false, - defaultTokenizerConfig("spanish", { - stemmingFn: ESStemmer, - customStopWords: stopWords.spanish, - }), - ); - const O2 = tokenize( - I2, - "spanish", - false, - defaultTokenizerConfig("spanish", { - stemmingFn: ESStemmer, - customStopWords: stopWords.spanish, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["cocin", "pastel"]); t.strictSame(O2, ["dorm", "dificil", "prueb", "fall"]); }); - t.test("Should tokenize and stem correctly in dutch", t => { + t.test("Should tokenize and stem correctly in dutch", async t => { t.plan(2); + const tokenizer = await createTokenizer("dutch", { stemmer: NLStemmer }); const I1 = "de kleine koeien"; const I2 = "Ik heb wat taarten gemaakt"; - const O2 = tokenize( - I2, - "dutch", - false, - defaultTokenizerConfig("dutch", { - stemmingFn: NLStemmer, - customStopWords: stopWords.dutch, - }), - ); - const O1 = tokenize( - I1, - "dutch", - false, - defaultTokenizerConfig("dutch", { - stemmingFn: NLStemmer, - customStopWords: stopWords.dutch, - }), - ); + const O2 = tokenizer.tokenize(I2); + const O1 = tokenizer.tokenize(I1); t.strictSame(O1, ["klein", "koei"]); t.strictSame(O2, ["taart", "gemaakt"]); }); - t.test("Should tokenize and stem correctly in german", t => { + t.test("Should tokenize and stem correctly in german", async t => { t.plan(2); + const tokenizer = await createTokenizer("german", { stemmer: DEStemmer }); + const I1 = "Schlaf ist eine harte Sache, wenn Tests fehlschlagen"; const I2 = "Ich habe ein paar Kekse gebacken"; - const O1 = tokenize( - I1, - "german", - false, - defaultTokenizerConfig("german", { - stemmingFn: DEStemmer, - customStopWords: stopWords.german, - }), - ); - const O2 = tokenize( - I2, - "german", - false, - defaultTokenizerConfig("german", { - stemmingFn: DEStemmer, - customStopWords: stopWords.german, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["schlaf", "hart", "sach", "test", "fehlschlag"]); t.strictSame(O2, ["paar", "keks", "geback"]); }); - t.test("Should tokenize and stem correctly in finnish", t => { + t.test("Should tokenize and stem correctly in finnish", async t => { t.plan(2); + const tokenizer = await createTokenizer("finnish", { stemmer: FIStemmer }); + const I1 = "Uni on vaikea asia, kun testit epäonnistuvat"; const I2 = "Leivoin keksejä"; - const O1 = tokenize( - I1, - "finnish", - false, - defaultTokenizerConfig("finnish", { - stemmingFn: FIStemmer, - customStopWords: stopWords.finnish, - }), - ); - const O2 = tokenize( - I2, - "finnish", - false, - defaultTokenizerConfig("finnish", { - stemmingFn: FIStemmer, - customStopWords: stopWords.finnish, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["uni", "vaike", "as", "test", "epaonnistuv"]); t.strictSame(O2, ["leivo", "keksej"]); }); - t.test("Should tokenize and stem correctly in danish", t => { + t.test("Should tokenize and stem correctly in danish", async t => { t.plan(2); + const tokenizer = await createTokenizer("danish", { stemmer: DKStemmer }); + const I1 = "Søvn er en svær ting, når prøver mislykkes"; const I2 = "Jeg bagte småkager"; - const O1 = tokenize( - I1, - "danish", - false, - defaultTokenizerConfig("danish", { - stemmingFn: DKStemmer, - customStopWords: stopWords.danish, - }), - ); - const O2 = tokenize( - I2, - "danish", - false, - defaultTokenizerConfig("danish", { - stemmingFn: DKStemmer, - customStopWords: stopWords.danish, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.strictSame(O1, ["sovn", "svar", "ting", "prov", "mislyk"]); t.strictSame(O2, ["bagt", "smakag"]); }); - t.test("Should tokenize and stem correctly in ukrainian", t => { + t.test("Should tokenize and stem correctly in ukrainian", async t => { t.plan(2); + const tokenizer = await createTokenizer("ukrainian", { stemmer: UKStemmer }); + const I1 = "Коли тести не проходять, спати важко"; const I2 = "я приготувала тістечка"; - const O1 = tokenize( - I1, - "ukrainian", - false, - defaultTokenizerConfig("ukrainian", { - stemmingFn: UKStemmer, - customStopWords: [], - }), - ); - const O2 = tokenize( - I2, - "ukrainian", - false, - defaultTokenizerConfig("ukrainian", { - stemmingFn: UKStemmer, - customStopWords: [], - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); + t.strictSame(O1, ["кол", "тест", "не", "проход", "спат", "важк"]); t.strictSame(O2, ["я", "приготувал", "тістечк"]); }); - t.test("Should tokenize and stem correctly in bulgarian", t => { + t.test("Should tokenize and stem correctly in bulgarian", async t => { t.plan(2); + const tokenizer = await createTokenizer("bulgarian", { stemmer: BGStemmer, stopWords: [] }); + const I1 = "Кокошката е малка крава която не може да се събере с теста"; const I2 = "Има първа вероятност да се случи нещо неочаквано докато се изпълняват тестовете"; - const O1 = tokenize( - I1, - "bulgarian", - false, - defaultTokenizerConfig("bulgarian", { - stemmingFn: BGStemmer, - customStopWords: [], - }), - ); - const O2 = tokenize( - I2, - "bulgarian", - true, - defaultTokenizerConfig("bulgarian", { - stemmingFn: BGStemmer, - customStopWords: [], - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); + t.strictSame(O1, ["кокошк", "е", "малк", "крав", "коят", "не", "мож", "да", "се", "събер", "с", "тест"]); - t.strictSame(O2, ["има", "първ", "вероятност", "да", "се", "случ", "нещ", "неочакван", "док", "се", "изпълняват", "тест"]); + t.strictSame(O2, [ + "има", + "първ", + "вероятност", + "да", + "се", + "случ", + "нещ", + "неочакван", + "док", + "изпълняват", + "тест", + ]); }); }); -t.test("Custom stop-words rules", t => { +t.test("Custom stop-words rules", async t => { t.plan(6); - t.test("custom array of stop-words", t => { + t.test("custom array of stop-words", async t => { t.plan(2); normalizationCache.clear(); + const tokenizer = await createTokenizer("english", { stopWords: ["quick", "brown", "fox", "dog"] }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize( - I1, - "english", - false, - defaultTokenizerConfig("english", { - stemmingFn: ENStemmer, - customStopWords: ["quick", "brown", "fox", "dog"], - }), - ); - - const O2 = tokenize( - I2, - "english", - false, - defaultTokenizerConfig("english", { - stemmingFn: ENStemmer, - customStopWords: ["quick", "brown", "fox", "dog"], - }), - ); + const O1 = tokenizer.tokenize(I1); + + const O2 = tokenizer.tokenize(I2); t.same(O1, ["the", "jump", "over", "lazi"]); t.same(O2, ["i", "bake", "some", "cake"]); }); - t.test("custom stop-words function", t => { + t.test("custom stop-words function", async t => { t.plan(2); normalizationCache.clear(); + const tokenizer = await createTokenizer("english", { + stopWords(words: string[]): string[] { + return [...words, "quick", "brown", "fox", "dog"]; + }, + }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize( - I1, - "english", - false, - defaultTokenizerConfig("english", { - customStopWords(words: string[]): string[] { - return [...words, "quick", "brown", "fox", "dog"]; - }, - }), - ); - const O2 = tokenize( - I2, - "english", - false, - defaultTokenizerConfig("english", { - customStopWords(words: string[]): string[] { - return [...words, "quick", "brown", "fox", "dog"]; - }, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.same(O1, ["jump", "lazi"]); t.same(O2, ["bake", "cake"]); }); - t.test("disable stop-words", t => { + t.test("disable stop-words", async t => { t.plan(2); normalizationCache.clear(); + const tokenizer = await createTokenizer("english", { stopWords: false }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize(I1, "english", false, defaultTokenizerConfig("english", { enableStopWords: false })); - const O2 = tokenize(I2, "english", false, defaultTokenizerConfig("english", { enableStopWords: false })); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.same(O1, ["the", "quick", "brown", "fox", "jump", "over", "lazi", "dog"]); t.same(O2, ["i", "bake", "some", "cake"]); }); - t.test("disable stemming", t => { + t.test("disable stemming", async t => { t.plan(2); normalizationCache.clear(); + const tokenizer = await createTokenizer("english", { stemming: false }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize( - I1, - "english", - false, - defaultTokenizerConfig("english", { - enableStemming: false, - customStopWords: stopWords.english, - }), - ); - const O2 = tokenize( - I2, - "english", - false, - defaultTokenizerConfig("english", { - enableStemming: false, - customStopWords: stopWords.english, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.same(O1, ["quick", "brown", "fox", "jumps", "lazy", "dog"]); t.same(O2, ["baked", "cakes"]); }); - t.test("custom stemming function", t => { + t.test("custom stemming function", async t => { t.plan(2); normalizationCache.clear(); + const tokenizer = await createTokenizer("english", { stemmer: word => `${word}-ish` }); + const I1 = "the quick brown fox jumps over the lazy dog"; const I2 = "I baked some cakes"; - const O1 = tokenize( - I1, - "english", - false, - defaultTokenizerConfig("english", { - customStopWords: stopWords.english, - stemmingFn: word => `${word}-ish`, - }), - ); - const O2 = tokenize( - I2, - "english", - false, - defaultTokenizerConfig("english", { - customStopWords: stopWords.english, - stemmingFn: word => `${word}-ish`, - }), - ); + const O1 = tokenizer.tokenize(I1); + const O2 = tokenizer.tokenize(I2); t.same(O1, ["quick-ish", "brown-ish", "fox-ish", "jumps-ish", "lazy-ish", "dog-ish"]); t.same(O2, ["baked-ish", "cakes-ish"]); }); - t.test("should validate options", t => { - t.plan(6); + await t.test("should validate options", async t => { + t.plan(5); - t.throws(() => defaultTokenizerConfig("english").assertSupportedLanguage("weird-language"), { - message: LANGUAGE_NOT_SUPPORTED("weird-language"), - }); + await t.rejects(() => createTokenizer("weird-language"), { code: "LANGUAGE_NOT_SUPPORTED" }); // @ts-expect-error testing validation - t.throws(() => defaultTokenizerConfig("english", { tokenizerFn: "FOO" }), { - message: "tokenizer.tokenizerFn must be a function.", - }); - // @ts-expect-error testing validation - t.throws(() => defaultTokenizerConfig("english", { stemmingFn: "FOO" }), { - message: "tokenizer.stemmingFn property must be a function.", - }); + await t.rejects(() => createTokenizer("english", { stemmer: "FOO" }), { code: "INVALID_STEMMER_FUNCTION_TYPE" }); + // @ts-expect-error testing validation - t.throws(() => defaultTokenizerConfig("english", { stemmingFn: ENStemmer, customStopWords: "FOO" }), { - message: "Custom stop words must be a function or an array of strings.", + await t.rejects(() => createTokenizer("english", { stopWords: "FOO" }), { + code: "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY", }); + // @ts-expect-error testing validation - t.throws(() => defaultTokenizerConfig("english", { stemmingFn: ENStemmer, customStopWords: [1, 2, 3] }), { - message: "Custom stop words array must only contain strings.", + await t.rejects(() => createTokenizer("english", { stopWords: [1, 2, 3] }), { + code: "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY", }); + // @ts-expect-error testing validation - t.throws(() => defaultTokenizerConfig("english", { stemmingFn: ENStemmer, customStopWords: {} }), { - message: "Custom stop words must be a function or an array of strings.", + await t.rejects(() => createTokenizer("english", { stopWords: {} }), { + code: "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY", }); }); }); diff --git a/tests/tree.avl.test.ts b/tests/tree.avl.test.ts index 246752dc8..c80ef16c9 100644 --- a/tests/tree.avl.test.ts +++ b/tests/tree.avl.test.ts @@ -1,56 +1,67 @@ -import t from 'tap' -import { create, insert, find, getSize, remove, contains, isBalanced, greaterThan, lessThan, rangeSearch } from '../src/trees/avl/index.js' - -t.test('AVL Tree', t => { +import t from "tap"; +import { + create, + insert, + find, + getSize, + remove, + contains, + isBalanced, + greaterThan, + lessThan, + rangeSearch, +} from "../src/trees/avl.js"; + +t.test("AVL Tree", t => { t.plan(8); - t.test('create', t => { + t.test("create", t => { t.plan(3); - const tree = create(1, 'foo'); + const tree = create(1, "foo"); t.equal(getSize(tree), 1); - t.equal(find(tree, 1), 'foo'); + t.equal(find(tree, 1), "foo"); t.equal(find(tree, 4), null); }); - t.test('insert', t => { + t.test("insert", t => { t.plan(1); - const tree = create(1, 'foo'); + const tree = create(1, "foo"); - insert(tree, 2, 'bar'); - insert(tree, 10, 'baz'); - insert(tree, 25, 'qux'); - insert(tree, 5, 'quux'); - insert(tree, 20, 'quuz'); - insert(tree, 12, 'corge'); + insert(tree, 2, "bar"); + insert(tree, 10, "baz"); + insert(tree, 25, "qux"); + insert(tree, 5, "quux"); + insert(tree, 20, "quuz"); + insert(tree, 12, "corge"); t.equal(getSize(tree), 7); }); - t.test('isBalanced', t => { + t.test("isBalanced", t => { t.plan(1); - const tree = create(1, { foo: 'bar' }); - - insert(tree, 2, { foo: 'baz' }); - insert(tree, 10, { foo: 'qux' }); - insert(tree, 25, { foo: 'quux' }); - insert(tree, 5, { foo: 'quuz' }); - insert(tree, 20, { foo: 'corge' }); - insert(tree, 12, { foo: 'grault' }); - insert(tree, 15, { foo: 'garply' }); - insert(tree, 30, { foo: 'waldo' }); - insert(tree, 40, { foo: 'fred' }); - insert(tree, 520, { foo: 'plugh' }); - insert(tree, 630, { foo: 'xyzzy' }); - insert(tree, 3, { foo: 'thud' }); - insert(tree, 480, { foo: 'thuds' }); + const tree = create(1, { foo: "bar" }); + + insert(tree, 2, { foo: "baz" }); + insert(tree, 10, { foo: "qux" }); + insert(tree, 25, { foo: "quux" }); + insert(tree, 5, { foo: "quuz" }); + insert(tree, 20, { foo: "corge" }); + insert(tree, 12, { foo: "grault" }); + insert(tree, 15, { foo: "garply" }); + insert(tree, 30, { foo: "waldo" }); + insert(tree, 40, { foo: "fred" }); + insert(tree, 520, { foo: "plugh" }); + insert(tree, 630, { foo: "xyzzy" }); + insert(tree, 3, { foo: "thud" }); + insert(tree, 480, { foo: "thuds" }); t.equal(isBalanced(tree), true); }); - t.test('find', t => { + t.test("find", t => { t.plan(2); const tree = create(1, [1, 2, 3]); @@ -66,17 +77,17 @@ t.test('AVL Tree', t => { t.same(find(tree, 20), [16, 17, 18]); }); - t.test('remove', t => { + t.test("remove", t => { t.plan(3); - const tree = create(1, 'foo'); + const tree = create(1, "foo"); - insert(tree, 2, 'bar'); - insert(tree, 10, 'baz'); - insert(tree, 25, 'qux'); - insert(tree, 5, 'quux'); - insert(tree, 20, 'quuz'); - insert(tree, 12, 'corge'); + insert(tree, 2, "bar"); + insert(tree, 10, "baz"); + insert(tree, 25, "qux"); + insert(tree, 5, "quux"); + insert(tree, 20, "quuz"); + insert(tree, 12, "corge"); remove(tree, 20); @@ -85,49 +96,48 @@ t.test('AVL Tree', t => { t.equal(isBalanced(tree), true); }); - t.test('rangeSearch', t => { + t.test("rangeSearch", t => { t.plan(1); - const tree = create(1, ['foo']); + const tree = create(1, ["foo"]); - insert(tree, 2, ['bar']); - insert(tree, 10, ['baz']); - insert(tree, 25, ['qux']); - insert(tree, 5, ['quux']); - insert(tree, 20, ['quuz']); - insert(tree, 12, ['corge']); + insert(tree, 2, ["bar"]); + insert(tree, 10, ["baz"]); + insert(tree, 25, ["qux"]); + insert(tree, 5, ["quux"]); + insert(tree, 20, ["quuz"]); + insert(tree, 12, ["corge"]); - t.same(rangeSearch(tree, 5, 20), ['quux', 'baz', 'corge', 'quuz']); + t.same(rangeSearch(tree, 5, 20), ["quux", "baz", "corge", "quuz"]); }); - t.test('greaterThan', t => { + t.test("greaterThan", t => { t.plan(1); - const tree = create(1, ['foo']); + const tree = create(1, ["foo"]); - insert(tree, 2, ['bar']); - insert(tree, 10, ['baz']); - insert(tree, 25, ['qux']); - insert(tree, 5, ['quux']); - insert(tree, 20, ['quuz']); - insert(tree, 12, ['corge']); + insert(tree, 2, ["bar"]); + insert(tree, 10, ["baz"]); + insert(tree, 25, ["qux"]); + insert(tree, 5, ["quux"]); + insert(tree, 20, ["quuz"]); + insert(tree, 12, ["corge"]); - t.same(greaterThan(tree, 10), ['qux', 'quuz', 'corge']); + t.same(greaterThan(tree, 10), ["qux", "quuz", "corge"]); }); - t.test('lessThan', t => { + t.test("lessThan", t => { t.plan(1); - const tree = create(1, ['foo']); + const tree = create(1, ["foo"]); - insert(tree, 2, ['bar']); - insert(tree, 10, ['baz']); - insert(tree, 25, ['qux']); - insert(tree, 5, ['quux']); - insert(tree, 20, ['quuz']); - insert(tree, 12, ['corge']); + insert(tree, 2, ["bar"]); + insert(tree, 10, ["baz"]); + insert(tree, 25, ["qux"]); + insert(tree, 5, ["quux"]); + insert(tree, 20, ["quuz"]); + insert(tree, 12, ["corge"]); - t.same(lessThan(tree, 10), ['foo', 'bar', 'quux']); + t.same(lessThan(tree, 10), ["foo", "bar", "quux"]); }); - -}); \ No newline at end of file +}); diff --git a/tests/radix.test.ts b/tests/tree.radix.test.ts similarity index 98% rename from tests/radix.test.ts rename to tests/tree.radix.test.ts index c1359c326..f0a791e53 100644 --- a/tests/radix.test.ts +++ b/tests/tree.radix.test.ts @@ -1,12 +1,12 @@ import t from "tap"; -import { create as createNode } from "../src/trees/radix/node.js"; import { contains as radixContains, + create as createNode, find as radixFind, insert as radixInsert, removeDocumentByWord as radixRemoveDocumentByWord, removeWord as radixRemoveWord, -} from "../src/trees/radix/index.js"; +} from "../src/trees/radix.js"; const phrases = [ { id: "1", doc: "the quick, brown fox" }, diff --git a/tests/utils.test.ts b/tests/utils.test.ts index 6b14112c9..7f6c683f5 100644 --- a/tests/utils.test.ts +++ b/tests/utils.test.ts @@ -55,18 +55,18 @@ t.test("utils", t => { nested: { nested2: { nested3: { - bar: "baz" - } + bar: "baz", + }, }, null: null, - noop: () => null - } + noop: () => null, + }, }; t.equal(getNested(myObject, "foo"), "bar"); - t.same(getNested(myObject, "nested"), myObject.nested); - t.same(getNested(myObject, "nested.nested2"), myObject.nested.nested2); - t.same(getNested(myObject, "nested.nested2.nested3"), myObject.nested.nested2.nested3); + t.same(getNested(myObject, "nested"), undefined); + t.same(getNested(myObject, "nested.nested2"), undefined); + t.same(getNested(myObject, "nested.nested2.nested3"), undefined); t.equal(getNested(myObject, "nested.nested2.nested3.bar"), "baz"); t.equal(getNested(myObject, "nested1.nested3.bar"), undefined); t.equal(getNested(myObject, "nested.null.bar"), undefined); @@ -81,17 +81,17 @@ t.test("utils", t => { nested: { nested2: { nested3: { - bar: "baz" - } + bar: "baz", + }, }, null: null, - noop: () => null - } + noop: () => null, + }, }; const flattened = flattenObject(myObject); - t.equal((flattened as any).foo, "bar"); + t.equal((flattened as Record).foo, "bar"); t.equal(flattened["nested.nested2.nested3.bar"], "baz"); }); }); diff --git a/tsconfig.json b/tsconfig.json index a78f9f8bb..b71e4a703 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -15,7 +15,7 @@ "moduleResolution": "Node", "baseUrl": ".", "paths": { - "@stemmer/*": ["./stemmer/lib/*"] + "../stemmer/*": ["./stemmer/lib/*"] } }, "include": ["src/*.ts", "src/**/*.ts"] From 60c25944c42e4de33a23168c208e569e689a5c90 Mon Sep 17 00:00:00 2001 From: Paolo Insogna Date: Mon, 13 Mar 2023 12:05:59 +0100 Subject: [PATCH 2/5] feat: Implemented PR suggestions. --- src/components/documents-store.ts | 6 +- src/components/facets.ts | 1 + src/components/hooks.ts | 4 ++ src/components/index.ts | 14 +++-- src/errors.ts | 98 ++++++++----------------------- src/methods/create.ts | 16 ++--- src/methods/remove.ts | 18 ++++-- src/methods/update.ts | 27 ++++++++- src/types.ts | 77 +++++++++++++----------- tests/main.test.ts | 12 ++-- 10 files changed, 133 insertions(+), 140 deletions(-) diff --git a/src/components/documents-store.ts b/src/components/documents-store.ts index 3f783278c..8de8ab6ff 100644 --- a/src/components/documents-store.ts +++ b/src/components/documents-store.ts @@ -54,7 +54,7 @@ function count(store: DocumentsStore): number { return store.count; } -function load(raw: unknown): DocumentsStore { +function load(raw: R): DocumentsStore { const rawDocument = raw as DocumentsStore; return { @@ -63,11 +63,11 @@ function load(raw: unknown): DocumentsStore { }; } -function save(docs: DocumentsStore): unknown { +function save(docs: DocumentsStore): R { return { docs: docs.docs, count: docs.count, - }; + } as R; } export function createDocumentsStore(): DefaultDocumentsStore { diff --git a/src/components/facets.ts b/src/components/facets.ts index 330dce5e2..4a996fe8d 100644 --- a/src/components/facets.ts +++ b/src/components/facets.ts @@ -36,6 +36,7 @@ export async function getFacets( diff --git a/src/components/index.ts b/src/components/index.ts index 9692fc51a..3a8f5878d 100644 --- a/src/components/index.ts +++ b/src/components/index.ts @@ -172,16 +172,16 @@ function remove( language: string | undefined, tokenizer: Tokenizer, docsCount: number, -): void { +): boolean { if (typeof value === "number") { avlRemoveDocument(index.indexes[prop] as AVLNode, id, value); - return; + return true; } else if (typeof value === "boolean") { const booleanKey = value ? "true" : "false"; const position = (index.indexes[prop] as BooleanIndex)[booleanKey].indexOf(id); (index.indexes[prop] as BooleanIndex)[value ? "true" : "false"].splice(position, 1); - return; + return true; } const tokens = tokenizer.tokenize(value as string, language); @@ -195,6 +195,8 @@ function remove( index.tokenOccurrencies[prop][token]--; radixRemoveDocument(index.indexes[prop] as RadixNode, token, id); } + + return true; } function search(index: Index, prop: string, term: string, context: SearchContext): TokenScore[] { @@ -329,7 +331,7 @@ function getSearchablePropertiesWithTypes(index: Index): Record(raw: R): Index { const { indexes, searchableProperties, @@ -351,7 +353,7 @@ function load(raw: unknown): Index { }; } -function save(index: Index): unknown { +function save(index: Index): R { const { indexes, searchableProperties, @@ -370,7 +372,7 @@ function save(index: Index): unknown { tokenOccurrencies, avgFieldLength, fieldLengths, - } as unknown; + } as R; } export function createIndex(): DefaultIndex { diff --git a/src/errors.ts b/src/errors.ts index 72e3ec741..49ba13a8d 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -1,87 +1,35 @@ import { SUPPORTED_LANGUAGES } from "./tokenizer/languages.js"; import { sprintf } from "./utils.js"; -export type ErrorCode = - | "NO_DEFAULT_LANGUAGE_WITH_CUSTOM_TOKENIZER" - | "LANGUAGE_NOT_SUPPORTED" - | "INVALID_STEMMER_FUNCTION_TYPE" - | "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY" - | "UNSUPPORTED_COMPONENT" - | "COMPONENT_MUST_BE_FUNCTION" - | "COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS" - | "INVALID_SCHEMA_TYPE" - | "TYPE_ERROR_ID_MUST_BE_STRING" - | "DOCUMENT_ID_MUST_BE_STRING" - | "DOCUMENT_ALREADY_EXISTS" - | "DOCUMENT_DOES_NOT_EXIST" - | "MISSING_DOCUMENT_PROPERTY" - | "INVALID_DOCUMENT_PROPERTY" - | "INVALID_BOOST_VALUE" - | "UNKNOWN_INDEX" - | "INVALID_FILTER_OPERATION"; +const allLanguages = SUPPORTED_LANGUAGES.join("\n - "); + +const errors = { + NO_LANGUAGE_WITH_CUSTOM_TOKENIZER: "Do not pass the language option to create when using a custom tokenizer.", + LANGUAGE_NOT_SUPPORTED: `Language "%s" is not supported.\nSupported languages are:\n - ${allLanguages}`, + INVALID_STEMMER_FUNCTION_TYPE: `config.stemmer property must be a function.`, + CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY: "Custom stop words array must only contain strings.", + UNSUPPORTED_COMPONENT: `Unsupported component "%s".`, + COMPONENT_MUST_BE_FUNCTION: `The component "%s" must be a function.`, + COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS: `The component "%s" must be a function or an array of functions.`, + INVALID_SCHEMA_TYPE: `Unsupported schema type "%s". Expected "string", "boolean" or "number".`, + DOCUMENT_ID_MUST_BE_STRING: `Document id must be of type "string". Got "%s" instead.`, + DOCUMENT_ALREADY_EXISTS: `A document with id "%s" already exists.`, + DOCUMENT_DOES_NOT_EXIST: `A document with id "%s" does not exists.`, + MISSING_DOCUMENT_PROPERTY: `Missing searchable property "%s".`, + INVALID_DOCUMENT_PROPERTY: `Invalid document property "%s": expected "%s", got "%s"`, + UNKNOWN_INDEX: `Invalid property name "%s". Expected a wildcard string ("*") or array containing one of the following properties: %s`, + INVALID_BOOST_VALUE: `Boost value must be a number greater than, or less than 0.`, + INVALID_FILTER_OPERATION: `You can only use one operation per filter, you requested %d.`, +}; + +export type ErrorCode = keyof typeof errors; export interface LyraError extends Error { code: string; } export function createError(code: ErrorCode, ...args: Array): LyraError { - let message = ""; - - switch (code) { - case "NO_DEFAULT_LANGUAGE_WITH_CUSTOM_TOKENIZER": - message = "Do not pass the defaultLanguage option to create when using a custom tokenizer."; - break; - case "LANGUAGE_NOT_SUPPORTED": - message = `Language "%s" is not supported.\nSupported languages are:\n - ${SUPPORTED_LANGUAGES.join("\n - ")}`; - break; - case "INVALID_STEMMER_FUNCTION_TYPE": - message = `config.stemmer property must be a function.`; - break; - case "CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY": - message = "Custom stop words array must only contain strings."; - break; - case "UNSUPPORTED_COMPONENT": - message = `Unsupported component "%s".`; - break; - case "COMPONENT_MUST_BE_FUNCTION": - message = `The component "%s" must be a function.`; - break; - case "COMPONENT_MUST_BE_FUNCTION_OR_ARRAY_FUNCTIONS": - message = `The component "%s" must be a function or an array of functions.`; - break; - case "INVALID_SCHEMA_TYPE": - message = `Unsupported schema type "%s". Expected "string", "boolean" or "number".`; - break; - case "DOCUMENT_ID_MUST_BE_STRING": - message = `Document id must be of type "string". Got "%s" instead.`; - break; - case "DOCUMENT_ALREADY_EXISTS": - message = `A document with id "%s" already exists.`; - break; - case "DOCUMENT_DOES_NOT_EXIST": - message = `A document with id "%s" does not exists.`; - break; - case "MISSING_DOCUMENT_PROPERTY": - message = `Missing searchable property "%s".`; - break; - case "INVALID_DOCUMENT_PROPERTY": - message = `Invalid document property "%s": expected "%s", got "%s"`; - break; - case "UNKNOWN_INDEX": - message = `Invalid property name "%s". Expected a wildcard string ("*") or array containing one of the following properties: %s`; - break; - case "INVALID_BOOST_VALUE": - message = `Boost value must be a number greater than, or less than 0.`; - break; - case "INVALID_FILTER_OPERATION": - message = `You can only use one operation per filter, you requested %d.`; - break; - default: - message = `Unsupported Lyra Error code: ${code}`; - break; - } - - const error = new Error(sprintf(message, ...args)) as LyraError; + const error = new Error(sprintf(errors[code] ?? `Unsupported Lyra Error code: ${code}`, ...args)) as LyraError; error.code = code; if ("captureStackTrace" in Error.prototype) { Error.captureStackTrace(error); diff --git a/src/methods/create.ts b/src/methods/create.ts index 30036a301..06cf8d385 100644 --- a/src/methods/create.ts +++ b/src/methods/create.ts @@ -1,7 +1,8 @@ import { getDefaultComponents } from "../components/defaults.js"; -import { createError } from "../errors.js"; +import { createDocumentsStore } from "../components/documents-store.js"; import { COMPLEX_COMPONENTS, SIMPLE_COMPONENTS, SIMPLE_OR_ARRAY_COMPONENTS } from "../components/hooks.js"; import { createIndex } from "../components/index.js"; +import { createError } from "../errors.js"; import { createTokenizer } from "../tokenizer/index.js"; import { ArrayCallbackComponents, @@ -15,11 +16,10 @@ import { SimpleComponents, SimpleOrArrayCallbackComponents, } from "../types.js"; -import { createDocumentsStore } from "../components/documents-store.js"; interface CreateArguments { schema: Schema; - defaultLanguage?: string; + language?: string; components?: Components; } @@ -71,7 +71,7 @@ function validateComponents({ schema, - defaultLanguage, + language, components, }: CreateArguments): Promise> { if (!components) { @@ -84,10 +84,10 @@ export async function create { +): Promise { + let result = true; const { index, docs } = lyra.data; const doc = await lyra.documentsStore.get(docs, id); @@ -28,7 +29,9 @@ export async function remove( @@ -45,7 +49,9 @@ export async function removeMultiple { +): Promise { + let result = true; + if (!batchSize) { batchSize = 1000; } @@ -66,7 +72,9 @@ export async function removeMultiple { + if (!skipHooks) { + await runSingleHook(lyra.beforeUpdate, lyra, id); + } + await remove(lyra, id, language, skipHooks); - return insert(lyra, doc, language, skipHooks); + const newId = await insert(lyra, doc, language, skipHooks); + + if (!skipHooks) { + await runSingleHook(lyra.afterUpdate, lyra, newId); + } + + return newId; } export async function updateMultiple( @@ -25,6 +36,16 @@ export async function updateMultiple = T | null; -export type CallbackComponentReturnValue = T | Promise; +export type SingleOrArray = T | T[]; + +export type SyncOrAsyncValue = T | Promise; // eslint-disable-next-line @typescript-eslint/no-empty-interface export interface OpaqueIndex {} @@ -215,14 +217,14 @@ export type SingleCallbackComponent, id: string, doc?: Document, -) => CallbackComponentReturnValue; +) => SyncOrAsyncValue; export type MultipleCallbackComponent = ( lyra: Lyra, doc: Document[] | string[], -) => CallbackComponentReturnValue; +) => SyncOrAsyncValue; -export type IIndexInsertOrRemoveFunction = ( +export type IIndexInsertOrRemoveFunction = ( index: I, id: string, prop: string, @@ -230,9 +232,9 @@ export type IIndexInsertOrRemoveFunction = ( language: string | undefined, tokenizer: Tokenizer, docsCount: number, -) => CallbackComponentReturnValue; +) => SyncOrAsyncValue; -export type IIndexRemoveFunction = (index: I, id: string, prop: string) => CallbackComponentReturnValue; +export type IIndexRemoveFunction = (index: I, id: string, prop: string) => SyncOrAsyncValue; export interface IIndex { create: (lyra: Lyra, schema: Schema) => I; @@ -242,29 +244,29 @@ export interface IIndex; beforeRemove?: IIndexInsertOrRemoveFunction; - remove: IIndexInsertOrRemoveFunction; + remove: IIndexInsertOrRemoveFunction; afterRemove?: IIndexInsertOrRemoveFunction; - search(index: I, prop: string, terms: string, context: SearchContext): CallbackComponentReturnValue; + search(index: I, prop: string, terms: string, context: SearchContext): SyncOrAsyncValue; searchByWhereClause(index: I, filters: Record): string[]; - getSearchableProperties(index: I): CallbackComponentReturnValue; - getSearchablePropertiesWithTypes(index: I): CallbackComponentReturnValue>; + getSearchableProperties(index: I): SyncOrAsyncValue; + getSearchablePropertiesWithTypes(index: I): SyncOrAsyncValue>; - load(raw: unknown): I | Promise; - save(index: I): unknown | Promise; + load(raw: R): I | Promise; + save(index: I): R | Promise; } export interface IDocumentsStore { create: (lyra: Lyra) => D; - get(store: D, id: string): CallbackComponentReturnValue; - getMultiple(store: D, ids: string[]): CallbackComponentReturnValue<(Document | undefined)[]>; - store(store: D, id: string, doc: Document): CallbackComponentReturnValue; - remove(store: D, id: string): CallbackComponentReturnValue; - count(store: D): CallbackComponentReturnValue; - - load(raw: unknown): D | Promise; - save(store: D): unknown | Promise; + get(store: D, id: string): SyncOrAsyncValue; + getMultiple(store: D, ids: string[]): SyncOrAsyncValue<(Document | undefined)[]>; + store(store: D, id: string, doc: Document): SyncOrAsyncValue; + remove(store: D, id: string): SyncOrAsyncValue; + count(store: D): SyncOrAsyncValue; + + load(raw: R): D | Promise; + save(store: D): R | Promise; } export interface Tokenizer { @@ -278,13 +280,10 @@ export interface ComplexComponent; - getDocumentIndexId(doc: Document): CallbackComponentReturnValue; - getDocumentProperties( - doc: Document, - paths: string[], - ): CallbackComponentReturnValue>; - formatElapsedTime(number: bigint): CallbackComponentReturnValue | CallbackComponentReturnValue; + validateSchema(doc: Document, schema: Schema): SyncOrAsyncValue; + getDocumentIndexId(doc: Document): SyncOrAsyncValue; + getDocumentProperties(doc: Document, paths: string[]): SyncOrAsyncValue>; + formatElapsedTime(number: bigint): SyncOrAsyncValue | SyncOrAsyncValue; } export interface SimpleOrArrayCallbackComponents< @@ -292,14 +291,18 @@ export interface SimpleOrArrayCallbackComponents< I extends OpaqueIndex, D extends OpaqueDocumentStore, > { - beforeInsert: SingleCallbackComponent | SingleCallbackComponent[]; - afterInsert: SingleCallbackComponent | SingleCallbackComponent[]; - beforeRemove: SingleCallbackComponent | SingleCallbackComponent[]; - afterRemove: SingleCallbackComponent | SingleCallbackComponent[]; - beforeMultipleInsert: MultipleCallbackComponent | MultipleCallbackComponent[]; - afterMultipleInsert: MultipleCallbackComponent | MultipleCallbackComponent[]; - beforeMultipleRemove: MultipleCallbackComponent | MultipleCallbackComponent[]; - afterMultipleRemove: MultipleCallbackComponent | MultipleCallbackComponent[]; + beforeInsert: SingleOrArray>; + afterInsert: SingleOrArray>; + beforeRemove: SingleOrArray>; + afterRemove: SingleOrArray>; + beforeUpdate: SingleOrArray>; + afterUpdate: SingleOrArray>; + beforeMultipleInsert: SingleOrArray>; + afterMultipleInsert: SingleOrArray>; + beforeMultipleRemove: SingleOrArray>; + afterMultipleRemove: SingleOrArray>; + beforeMultipleUpdate: SingleOrArray>; + afterMultipleUpdate: SingleOrArray>; } export interface ArrayCallbackComponents { @@ -307,10 +310,14 @@ export interface ArrayCallbackComponents[]; beforeRemove: SingleCallbackComponent[]; afterRemove: SingleCallbackComponent[]; + beforeUpdate: SingleCallbackComponent[]; + afterUpdate: SingleCallbackComponent[]; beforeMultipleInsert: MultipleCallbackComponent[]; afterMultipleInsert: MultipleCallbackComponent[]; beforeMultipleRemove: MultipleCallbackComponent[]; afterMultipleRemove: MultipleCallbackComponent[]; + beforeMultipleUpdate: MultipleCallbackComponent[]; + afterMultipleUpdate: MultipleCallbackComponent[]; } export type Components = Partial< diff --git a/tests/main.test.ts b/tests/main.test.ts index c3acab6a1..7dfd0674e 100644 --- a/tests/main.test.ts +++ b/tests/main.test.ts @@ -1,10 +1,10 @@ import t from "tap"; -import type { Document } from "../src/types"; import { DocumentsStore } from "../src/components/documents-store.js"; import { Index } from "../src/components/index.js"; import { create, insert, insertMultiple, remove, search } from "../src/index.js"; import { createTokenizer } from "../src/tokenizer/index.js"; import { SUPPORTED_LANGUAGES } from "../src/tokenizer/languages.js"; +import type { Document } from "../src/types"; import dataset from "./datasets/events.json" assert { type: "json" }; interface BaseDataEvent extends Document { @@ -27,7 +27,7 @@ interface DataSet { result: { events: DataEvent[] }; } -t.test("defaultLanguage", t => { +t.test("language", t => { t.plan(5); t.test("should throw an error if the desired language is not supported", async t => { @@ -37,7 +37,7 @@ t.test("defaultLanguage", t => { () => create({ schema: {}, - defaultLanguage: "latin", + language: "latin", }), { code: "LANGUAGE_NOT_SUPPORTED" }, ); @@ -69,7 +69,7 @@ t.test("defaultLanguage", t => { try { await create({ schema: {}, - defaultLanguage: "portuguese", + language: "portuguese", }); t.pass(); @@ -84,7 +84,7 @@ t.test("defaultLanguage", t => { try { await create({ schema: {}, - defaultLanguage: "slovenian", + language: "slovenian", }); t.pass(); @@ -99,7 +99,7 @@ t.test("defaultLanguage", t => { try { await create({ schema: {}, - defaultLanguage: "bulgarian", + language: "bulgarian", }); t.pass(); From 357a09a438f6918cc091d212d9cc94ebb6ce584c Mon Sep 17 00:00:00 2001 From: Paolo Insogna Date: Mon, 13 Mar 2023 12:07:18 +0100 Subject: [PATCH 3/5] chore: Linted code. --- package.json | 2 +- src/tokenizer/languages.ts | 2 +- tests/snapshots/events.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 8001e76be..a3c5fd331 100644 --- a/package.json +++ b/package.json @@ -61,7 +61,7 @@ "test:smoke": "tap --rcfile=tests/config/tap.yml tests/smoke/*.test.ts", "ci": "npm run build && npm run test:ci && npm run test:smoke", "prepare": "husky install && npm run build", - "commit": "pnpm lint-staged && cz", + "commit": "pnpm run format && pnpm lint-staged && cz", "benchmark": "node --no-warnings benchmarks/src/typo-tolerant-search.js" }, "keywords": [ diff --git a/src/tokenizer/languages.ts b/src/tokenizer/languages.ts index 0b8470c47..4ba591bc2 100644 --- a/src/tokenizer/languages.ts +++ b/src/tokenizer/languages.ts @@ -60,4 +60,4 @@ export const SPLITTERS: Record = { export const SUPPORTED_LANGUAGES = Object.keys(STEMMERS); -export type Language = typeof SUPPORTED_LANGUAGES[number]; +export type Language = (typeof SUPPORTED_LANGUAGES)[number]; diff --git a/tests/snapshots/events.json b/tests/snapshots/events.json index cbb3d135e..040b328c3 100644 --- a/tests/snapshots/events.json +++ b/tests/snapshots/events.json @@ -404,4 +404,4 @@ } ] } -} \ No newline at end of file +} From 2f65899214477992442c400d9182b7fa5180368e Mon Sep 17 00:00:00 2001 From: Paolo Insogna Date: Mon, 13 Mar 2023 14:24:48 +0100 Subject: [PATCH 4/5] fix: Fixed import. --- benchmarks/src/typo-tolerant-search.js | 2 +- src/methods/update.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/typo-tolerant-search.js b/benchmarks/src/typo-tolerant-search.js index da9618781..1834688b4 100644 --- a/benchmarks/src/typo-tolerant-search.js +++ b/benchmarks/src/typo-tolerant-search.js @@ -1,6 +1,6 @@ import cronometro from "cronometro"; import { isMainThread } from "worker_threads"; -import { create, search, insertMultiple } from "../../dist/index.js"; +import { create, insertMultiple, search } from "../../dist/index.js"; import { createTokenizer } from "../../dist/internals.js"; import { formattedEvents } from "./utils/dataset.js"; diff --git a/src/methods/update.ts b/src/methods/update.ts index fe040e198..4537a7dd9 100644 --- a/src/methods/update.ts +++ b/src/methods/update.ts @@ -1,4 +1,4 @@ -import { runMultipleHook, runSingleHook } from "src/components/hooks.js"; +import { runMultipleHook, runSingleHook } from "../components/hooks.js"; import { Document, Lyra, OpaqueDocumentStore, OpaqueIndex, Schema } from "../types.js"; import { insert, insertMultiple } from "./insert.js"; import { remove, removeMultiple } from "./remove.js"; From 689156901b074f7fce95c437604e39d2364e9cc4 Mon Sep 17 00:00:00 2001 From: Paolo Insogna Date: Mon, 13 Mar 2023 15:28:18 +0100 Subject: [PATCH 5/5] feat: Add internal cache for Lyra. --- src/methods/create.ts | 1 + src/methods/search.ts | 12 +++++++++--- src/types.ts | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/methods/create.ts b/src/methods/create.ts index 06cf8d385..86644f518 100644 --- a/src/methods/create.ts +++ b/src/methods/create.ts @@ -119,6 +119,7 @@ export async function create propertiesToSearchWithTypes[prop] === "string"); + let propertiesToSearch = lyra.caches["propertiesToSearch"] as string[]; + if (!propertiesToSearch) { + const propertiesToSearchWithTypes = await lyra.index.getSearchablePropertiesWithTypes(index); + + propertiesToSearch = await lyra.index.getSearchableProperties(index); + propertiesToSearch = propertiesToSearch.filter((prop: string) => propertiesToSearchWithTypes[prop] === "string"); + + lyra.caches["propertiesToSearch"] = propertiesToSearch; + } if (properties && properties !== "*") { for (const prop of properties) { diff --git a/src/types.ts b/src/types.ts index a7ac0b182..8156f17d7 100644 --- a/src/types.ts +++ b/src/types.ts @@ -337,6 +337,7 @@ export type Lyra; [kInsertions]: number | undefined; [kRemovals]: number | undefined; };