From 7a68ed1b37686b185b155df97d5216c73a16ac92 Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Thu, 3 Aug 2023 23:21:40 +0200 Subject: [PATCH] feat(orama, plugin-data-persistence): adds support for vector serialization --- packages/orama/src/components/defaults.ts | 7 +-- packages/orama/src/components/index.ts | 62 +++++++++++++------ packages/orama/src/errors.ts | 2 +- packages/orama/src/methods/search-vector.ts | 2 +- .../plugin-data-persistence/src/server.ts | 4 +- .../test/index.test.ts | 55 +++++++++++++--- 6 files changed, 96 insertions(+), 36 deletions(-) diff --git a/packages/orama/src/components/defaults.ts b/packages/orama/src/components/defaults.ts index 5b63f147b..391af9b36 100644 --- a/packages/orama/src/components/defaults.ts +++ b/packages/orama/src/components/defaults.ts @@ -35,10 +35,9 @@ export async function validateSchema(doc: Document, s const typeOfType = typeof type if (isVectorType(type as string)) { - // TODO: validate vector size - if (!Array.isArray(value)) { - // TODO: run actual validation - return undefined + const vectorSize = getVectorSize(type as string) + if (!Array.isArray(value) || value.length !== vectorSize) { + throw createError('INVALID_INPUT_VECTOR', prop, vectorSize, (value as number[]).length) } continue } diff --git a/packages/orama/src/components/index.ts b/packages/orama/src/components/index.ts index 97a534135..bd905a95b 100644 --- a/packages/orama/src/components/index.ts +++ b/packages/orama/src/components/index.ts @@ -1,20 +1,20 @@ import type { -ArraySearchableType, -BM25Params, -ComparisonOperator, -IIndex, -Magnitude, -OpaqueDocumentStore, -OpaqueIndex, -Orama, -ScalarSearchableType, -Schema, -SearchableType, -SearchableValue, -SearchContext, -Tokenizer, -TokenScore, -VectorType, + ArraySearchableType, + BM25Params, + ComparisonOperator, + IIndex, + Magnitude, + OpaqueDocumentStore, + OpaqueIndex, + Orama, + ScalarSearchableType, + Schema, + SearchableType, + SearchableValue, + SearchContext, + Tokenizer, + TokenScore, + VectorType, } from '../types.js' import { createError } from '../errors.js' import { @@ -586,8 +586,17 @@ export async function load(sharedInternalDocumentStore: InternalDoc indexes[prop] = loadNode(value) } - for (const prop of Object.keys(rawVectorIndexes)) { - // TODO: load vector indexes, convert arrays into Float32Arrays + for (const idx of Object.keys(rawVectorIndexes)) { + const vectors = rawVectorIndexes[idx].vectors + + for (const vec in vectors) { + vectors[vec] = [vectors[vec][0], new Float32Array(vectors[vec][1])] + } + + vectorIndexes[idx] = { + size: rawVectorIndexes[idx].size, + vectors, + } } return { @@ -615,9 +624,24 @@ export async function save(index: Index): Promise { fieldLengths, } = index + const vectorIndexesAsArrays: Index['vectorIndexes'] = {} + + for (const idx of Object.keys(vectorIndexes)) { + const vectors = vectorIndexes[idx].vectors + + for (const vec in vectors) { + vectors[vec] = [vectors[vec][0], Array.from(vectors[vec][1]) as unknown as Float32Array] + } + + vectorIndexesAsArrays[idx] = { + size: vectorIndexes[idx].size, + vectors + } + } + return { indexes, - vectorIndexes, + vectorIndexes: vectorIndexesAsArrays, searchableProperties, searchablePropertiesWithTypes, frequencies, diff --git a/packages/orama/src/errors.ts b/packages/orama/src/errors.ts index eea4c7c7c..d6e51a0a0 100644 --- a/packages/orama/src/errors.ts +++ b/packages/orama/src/errors.ts @@ -31,7 +31,7 @@ const errors = { UNKNOWN_FILTER_PROPERTY: `Unknown filter property "%s".`, INVALID_VECTOR_SIZE: `Vector size must be a number greater than 0. Got "%s" instead.`, INVALID_VECTOR_VALUE: `Vector value must be a number greater than 0. Got "%s" instead.`, - WRONG_VECTOR_SIZE: `Vector size must be %s. Got a vector of %s dimensions instead.` + INVALID_INPUT_VECTOR: `Property "%s" was declared as a %s-dimentional vector, but got a %s-dimentional vector instead.\nInput vectors must be of the size declared in the schema, as calculating similarity between vectors of different sizes can lead to unexpected results.`, } export type ErrorCode = keyof typeof errors diff --git a/packages/orama/src/methods/search-vector.ts b/packages/orama/src/methods/search-vector.ts index abb1f4ef8..f93f7a9a5 100644 --- a/packages/orama/src/methods/search-vector.ts +++ b/packages/orama/src/methods/search-vector.ts @@ -22,7 +22,7 @@ export async function searchVector(orama: Orama, params: SearchVectorParams): Pr const vectors = vectorIndex.vectors as Record if (vector.length !== vectorSize) { - throw createError('WRONG_VECTOR_SIZE', vectorSize, vector.length) + throw createError('INVALID_INPUT_VECTOR', property, vectorSize, vector.length) } if (!(vector instanceof Float32Array)) { diff --git a/packages/plugin-data-persistence/src/server.ts b/packages/plugin-data-persistence/src/server.ts index b59aac650..f557a21aa 100644 --- a/packages/plugin-data-persistence/src/server.ts +++ b/packages/plugin-data-persistence/src/server.ts @@ -118,9 +118,9 @@ export async function getDefaultFileName(format: PersistenceFormat, runtime?: Ru /* c8 ignore next 3 */ if (runtime === 'deno') { // @ts-expect-error Deno is only available in Deno - dbName = Deno.env.get('LYRA_DB_NAME') ?? DEFAULT_DB_NAME + dbName = Deno.env.get('ORAMA_DB_NAME') ?? DEFAULT_DB_NAME } else { - dbName = process?.env?.LYRA_DB_NAME ?? DEFAULT_DB_NAME + dbName = process?.env?.ORAMA_DB_NAME ?? DEFAULT_DB_NAME } return `${dbName}.${extension}` diff --git a/packages/plugin-data-persistence/test/index.test.ts b/packages/plugin-data-persistence/test/index.test.ts index 5337a8ed5..3cbc629b8 100644 --- a/packages/plugin-data-persistence/test/index.test.ts +++ b/packages/plugin-data-persistence/test/index.test.ts @@ -1,4 +1,4 @@ -import { create, insert, Orama, search } from '@orama/orama' +import { create, insert, Orama, search, searchVector } from '@orama/orama' import t from 'tap' import { UNSUPPORTED_FORMAT, METHOD_MOVED } from '../src/errors.js' import { @@ -122,7 +122,7 @@ t.test('binary persistence', t => { await rm(path) }) - t.test('should generate a persistence file on the disk using LYRA_DB_NAME env', async t => { + t.test('should generate a persistence file on the disk using ORAMA_DB_NAME env', async t => { t.plan(3) let currentOramaDBNameValue: string | undefined @@ -130,13 +130,13 @@ t.test('binary persistence', t => { // @ts-expect-error Deno is only available in Deno if (typeof Deno !== 'undefined') { // @ts-expect-error Deno is only available in Deno - currentOramaDBNameValue = Deno.env.get('LYRA_DB_NAME') + currentOramaDBNameValue = Deno.env.get('ORAMA_DB_NAME') // @ts-expect-error Deno is only available in Deno - Deno.env.set('LYRA_DB_NAME', 'example_db_dump') + Deno.env.set('ORAMA_DB_NAME', 'example_db_dump') } else { - currentOramaDBNameValue = process.env.LYRA_DB_NAME - process.env.LYRA_DB_NAME = 'example_db_dump' + currentOramaDBNameValue = process.env.ORAMA_DB_NAME + process.env.ORAMA_DB_NAME = 'example_db_dump' } const db = await generateTestDBInstance() @@ -174,16 +174,16 @@ t.test('binary persistence', t => { // @ts-expect-error Deno is only available in Deno if (typeof Deno !== 'undefined') { // @ts-expect-error Deno is only available in Deno - Deno.env.set('LYRA_DB_NAME', currentOramaDBNameValue) + Deno.env.set('ORAMA_DB_NAME', currentOramaDBNameValue) } else { - process.env.LYRA_DB_NAME = currentOramaDBNameValue + process.env.ORAMA_DB_NAME = currentOramaDBNameValue } } }) }) t.test('json persistence', t => { - t.plan(2) + t.plan(3) t.test('should generate a persistence file on the disk with random name and json format', async t => { t.plan(2) @@ -219,6 +219,43 @@ t.test('json persistence', t => { await rm(path) }) + t.test('should generate a persistence file on the disk with support for vectors', async t => { + t.plan(1) + + const db1 = await create({ + schema: { + text: 'string', + vector: 'vector[5]' + } + }) + + await insert(db1, { text: 'vector 1', vector: [1, 0, 0, 0, 0] }) + await insert(db1, { text: 'vector 2', vector: [1, 1, 0, 0, 0] }) + await insert(db1, { text: 'vector 3', vector: [0, 0, 0, 0, 0] }) + + // Persist database on disk in json format + const path = await persistToFile(db1, 'json', 'test.json') + + // Load database from disk in json format + const db2 = await restoreFromFile('json', 'test.json') + + const qp1 = await searchVector(db1, { + vector: [1, 0, 0, 0, 0], + property: 'vector' + }) + + const qp2 = await searchVector(db2, { + vector: [1, 0, 0, 0, 0], + property: 'vector' + }) + + // Queries on the loaded database should match the original database + t.same(qp1.hits, qp2.hits) + + // Clean up + await rm(path) + }) + t.test('should generate a persistence file on the disk with a given name and json format', async t => { t.plan(2)