diff --git a/packages/orama/src/components/index.ts b/packages/orama/src/components/index.ts
index bd905a95b..0402de72e 100644
--- a/packages/orama/src/components/index.ts
+++ b/packages/orama/src/components/index.ts
@@ -214,7 +214,7 @@ export async function create(
 
     if (isVectorType(type as string)) {
       index.searchableProperties.push(path)
-      index.searchablePropertiesWithTypes[path] = (type as SearchableType)
+      index.searchablePropertiesWithTypes[path] = type as SearchableType
       index.vectorIndexes[path] = {
         size: getVectorSize(type as string),
         vectors: {},
@@ -297,7 +297,6 @@ export async function insert(
   tokenizer: Tokenizer,
   docsCount: number,
 ): Promise<void> {
-
   if (isVectorType(schemaType)) {
     return insertVector(index, prop, value as number[] | Float32Array, id)
   }
@@ -329,7 +328,7 @@ function insertVector(index: Index, prop: string, value: number[] | VectorType, id: string): void {
   if (!(value instanceof Float32Array)) {
     value = new Float32Array(value)
   }
-
+
   const size = index.vectorIndexes[prop].size
   const magnitude = getMagnitude(value, size)
 
@@ -476,8 +475,10 @@ export async function searchByWhereClause(index: Index): Promise<string[]> {
   for (const idx of Object.keys(vectorIndexes)) {
     const vectors = vectorIndexes[idx].vectors
-
+    for (const vec in vectors) {
+      vectors[vec] = [vectors[vec][0], Array.from(vectors[vec][1]) as unknown as Float32Array]
+    }
     vectorIndexesAsArrays[idx] = {
       size: vectorIndexes[idx].size,
-      vectors
+      vectors,
     }
   }
diff --git a/packages/orama/src/components/tokenizer/index.ts b/packages/orama/src/components/tokenizer/index.ts
index a543d6e9d..c88715fe4 100644
--- a/packages/orama/src/components/tokenizer/index.ts
+++ b/packages/orama/src/components/tokenizer/index.ts
@@ -7,6 +7,7 @@ import { stemmer as english } from './english-stemmer.js'
 interface DefaultTokenizer extends Tokenizer {
   language: Language
   stemmer?: Stemmer
+  tokenizeSkipProperties: Set<string>
   stemmerSkipProperties: Set<string>
   stopWords?: string[]
   allowDuplicates: boolean
@@ -58,12 +59,18 @@ function tokenize(this: DefaultTokenizer, input: string, language?: string, prop?: string) {
     return [input]
   }
 
-  const splitRule = SPLITTERS[this.language]
-  const tokens = input
-    .toLowerCase()
-    .split(splitRule)
-    .map(this.normalizeToken.bind(this, prop ?? ''))
-    .filter(Boolean)
+  let tokens: string[]
+  if (prop && this.tokenizeSkipProperties.has(prop)) {
+    tokens = [this.normalizeToken.bind(this, prop ?? '')(input)]
+  } else {
+    const splitRule = SPLITTERS[this.language]
+    tokens = input
+      .toLowerCase()
+      .split(splitRule)
+      .map(this.normalizeToken.bind(this, prop ?? ''))
+      .filter(Boolean)
+  }
+
   const trimTokens = trim(tokens)
 
   if (!this.allowDuplicates) {
@@ -131,6 +138,7 @@ export async function createTokenizer(config: DefaultTokenizerConfig = {}): Promise<DefaultTokenizer> {
     language: config.language,
     stemmer,
     stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []),
+    tokenizeSkipProperties: new Set(config.tokenizeSkipProperties ? [config.tokenizeSkipProperties].flat() : []),
     stopWords,
     allowDuplicates: Boolean(config.allowDuplicates),
     normalizeToken,
diff --git a/packages/orama/src/types.ts b/packages/orama/src/types.ts
index 6cc87f5b4..ecd1e2451 100644
--- a/packages/orama/src/types.ts
+++ b/packages/orama/src/types.ts
@@ -530,6 +530,7 @@ export type DefaultTokenizerConfig = {
   stemming?: boolean
   stemmer?: Stemmer
   stemmerSkipProperties?: string | string[]
+  tokenizeSkipProperties?: string | string[]
   stopWords?: boolean | string[] | ((stopWords: string[]) => string[] | Promise<string[]>)
   allowDuplicates?: boolean
 }
diff --git a/packages/orama/tests/tokenizeSkipProperties.test.ts b/packages/orama/tests/tokenizeSkipProperties.test.ts
new file mode 100644
index 000000000..0d23c9140
--- /dev/null
+++ b/packages/orama/tests/tokenizeSkipProperties.test.ts
@@ -0,0 +1,120 @@
+import * as t from 'tap'
+import { Orama, create, getByID, insert, search } from '../src/index.js'
+
+t.test('tokenizeSkipProperties', t => {
+  t.test('skipProperties', async t => {
+    const [db, id1, id2, id3, id4] = await createSimpleDB(true)
+
+    const result = await search(db, {
+      where: {
+        'meta.finish': 'black matte',
+      },
+    })
+
+    t.ok(result.elapsed)
+    t.ok(result.elapsed.raw)
+    t.ok(result.elapsed.formatted)
+    t.equal(result.count, 1)
+    t.equal(result.hits[0].id, id1)
+
+    t.end()
+  })
+
+  t.test('noSkipProperties', async t => {
+    const [db, id1, id2, id3, id4] = await createSimpleDB(false)
+
+    const result = await search(db, {
+      where: {
+        'meta.finish': 'black matte',
+      },
+    })
+
+    t.ok(result.elapsed)
+    t.ok(result.elapsed.raw)
+    t.ok(result.elapsed.formatted)
+    t.equal(result.count, 3)
+
+    for (const id of [id1, id2, id4]) {
+      t.ok(result.hits.find(d => d.id === id))
+    }
+
+    t.end()
+  })
+  t.end()
+})
+
+async function createSimpleDB(skipProperties: boolean): Promise<[Orama, string, string, string, string]> {
+  let db: Orama
+  if (skipProperties) {
+    db = await create({
+      schema: {
+        name: 'string',
+        rating: 'number',
+        price: 'number',
+        meta: {
+          sales: 'number',
+          finish: 'string',
+        },
+      },
+      components: {
+        tokenizer: {
+          tokenizeSkipProperties: ['meta.finish'],
+        },
+      },
+    })
+  } else {
+    db = await create({
+      schema: {
+        name: 'string',
+        rating: 'number',
+        price: 'number',
+        meta: {
+          sales: 'number',
+          finish: 'string',
+        },
+      },
+    })
+  }
+
+  const id1 = await insert(db, {
+    name: 'super coffee maker',
+    rating: 5,
+    price: 900,
+    meta: {
+      sales: 100,
+      finish: 'black matte',
+    },
+  })
+
+  const id2 = await insert(db, {
+    name: 'washing machine',
+    rating: 5,
+    price: 900,
+    meta: {
+      sales: 100,
+      finish: 'gloss black',
+    },
+  })
+
+  const id3 = await insert(db, {
+    name: 'coffee maker',
+    rating: 3,
+    price: 30,
+    meta: {
+      sales: 25,
+      finish: 'gloss blue',
+    },
+  })
+
+  const id4 = await insert(db, {
+    name: 'coffee maker deluxe',
+    rating: 5,
+    price: 45,
+    meta: {
+      sales: 25,
+      finish: 'blue matte',
+    },
+  })
+
+  return [db, id1, id2, id3, id4]
+}
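
Usage sketch (not part of the patch): `tokenizeSkipProperties` tells the default tokenizer to keep the listed properties as a single token instead of splitting them on word boundaries, so a `where` filter only matches the whole stored value. Below is a minimal example, assuming the public `create`/`insert`/`search` API exercised by the test above and the published `@orama/orama` package name; the schema and documents are illustrative, not repository code.

import { create, insert, search } from '@orama/orama'

const db = await create({
  schema: {
    name: 'string',
    meta: {
      finish: 'string',
    },
  },
  components: {
    tokenizer: {
      // 'meta.finish' is indexed as one token: 'black matte' is stored
      // whole rather than as the two tokens 'black' and 'matte'.
      tokenizeSkipProperties: ['meta.finish'],
    },
  },
})

await insert(db, { name: 'super coffee maker', meta: { finish: 'black matte' } })
await insert(db, { name: 'washing machine', meta: { finish: 'gloss black' } })

// Only the exact finish matches. Without tokenizeSkipProperties the query
// would be split into 'black' and 'matte', so 'gloss black' would match too.
const result = await search(db, { where: { 'meta.finish': 'black matte' } })
console.log(result.count) // 1 with the skip list, 2 without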