Skip to content

Commit

Permalink
feat(orama, plugin-data-persistence): adds support for vector seriali…
Browse files Browse the repository at this point in the history
…zation
  • Loading branch information
micheleriva committed Aug 3, 2023
1 parent b33aaac commit 7a68ed1
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 36 deletions.
7 changes: 3 additions & 4 deletions packages/orama/src/components/defaults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@ export async function validateSchema<S extends Schema = Schema>(doc: Document, s
const typeOfType = typeof type

if (isVectorType(type as string)) {
// TODO: validate vector size
if (!Array.isArray(value)) {
// TODO: run actual validation
return undefined
const vectorSize = getVectorSize(type as string)
// A vector property must be an array whose length matches the size declared in the schema.
if (!Array.isArray(value) || value.length !== vectorSize) {
  // Only read `.length` from real arrays: dereferencing it on a non-array value such as
  // `null` would raise a TypeError and mask the validation error thrown below.
  const receivedSize = Array.isArray(value) ? value.length : 0
  throw createError('INVALID_INPUT_VECTOR', prop, vectorSize, receivedSize)
}
continue
}
Expand Down
62 changes: 43 additions & 19 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import type {
ArraySearchableType,
BM25Params,
ComparisonOperator,
IIndex,
Magnitude,
OpaqueDocumentStore,
OpaqueIndex,
Orama,
ScalarSearchableType,
Schema,
SearchableType,
SearchableValue,
SearchContext,
Tokenizer,
TokenScore,
VectorType,
ArraySearchableType,
BM25Params,
ComparisonOperator,
IIndex,
Magnitude,
OpaqueDocumentStore,
OpaqueIndex,
Orama,
ScalarSearchableType,
Schema,
SearchableType,
SearchableValue,
SearchContext,
Tokenizer,
TokenScore,
VectorType,
} from '../types.js'
import { createError } from '../errors.js'
import {
Expand Down Expand Up @@ -586,8 +586,17 @@ export async function load<R = unknown>(sharedInternalDocumentStore: InternalDoc
indexes[prop] = loadNode(value)
}

for (const prop of Object.keys(rawVectorIndexes)) {
// TODO: load vector indexes, convert arrays into Float32Arrays
// Rebuild each vector index from its raw form, wrapping the second element of every
// stored pair in a Float32Array.
for (const idx of Object.keys(rawVectorIndexes)) {
  const rawVectors = rawVectorIndexes[idx].vectors
  // Build a fresh map instead of rewriting `rawVectors` in place, so the caller's raw
  // data is left untouched and is never shared with the loaded index.
  const vectors: typeof rawVectors = {}

  for (const vec of Object.keys(rawVectors)) {
    const [first, values] = rawVectors[vec]
    vectors[vec] = [first, new Float32Array(values)]
  }

  vectorIndexes[idx] = {
    size: rawVectorIndexes[idx].size,
    vectors,
  }
}

return {
Expand Down Expand Up @@ -615,9 +624,24 @@ export async function save<R = unknown>(index: Index): Promise<R> {
fieldLengths,
} = index

// Copy of the vector indexes in which the second element of every stored pair has been
// converted to a plain array with Array.from.
const vectorIndexesAsArrays: Index['vectorIndexes'] = {}

for (const idx of Object.keys(vectorIndexes)) {
  const { size, vectors } = vectorIndexes[idx]
  // Write the converted entries into a new map: assigning them back into `vectors`
  // would replace the live index's entries with plain arrays as a side effect of saving.
  const vectorsAsArrays: typeof vectors = {}

  for (const vec of Object.keys(vectors)) {
    const [first, values] = vectors[vec]
    // The cast keeps the copy assignable to Index['vectorIndexes'], whose entries are
    // typed as Float32Array even though a plain array is stored here.
    vectorsAsArrays[vec] = [first, Array.from(values) as unknown as Float32Array]
  }

  vectorIndexesAsArrays[idx] = {
    size,
    vectors: vectorsAsArrays,
  }
}

return {
indexes,
vectorIndexes,
vectorIndexes: vectorIndexesAsArrays,
searchableProperties,
searchablePropertiesWithTypes,
frequencies,
Expand Down
2 changes: 1 addition & 1 deletion packages/orama/src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ const errors = {
UNKNOWN_FILTER_PROPERTY: `Unknown filter property "%s".`,
INVALID_VECTOR_SIZE: `Vector size must be a number greater than 0. Got "%s" instead.`,
INVALID_VECTOR_VALUE: `Vector value must be a number greater than 0. Got "%s" instead.`,
WRONG_VECTOR_SIZE: `Vector size must be %s. Got a vector of %s dimensions instead.`
// Placeholders, in order: property name, declared vector size, received vector size.
INVALID_INPUT_VECTOR: `Property "%s" was declared as a %s-dimensional vector, but got a %s-dimensional vector instead.\nInput vectors must be of the size declared in the schema, as calculating similarity between vectors of different sizes can lead to unexpected results.`,
}

export type ErrorCode = keyof typeof errors
Expand Down
2 changes: 1 addition & 1 deletion packages/orama/src/methods/search-vector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export async function searchVector(orama: Orama, params: SearchVectorParams): Pr
const vectors = vectorIndex.vectors as Record<string, [Magnitude, VectorType]>

if (vector.length !== vectorSize) {
throw createError('WRONG_VECTOR_SIZE', vectorSize, vector.length)
throw createError('INVALID_INPUT_VECTOR', property, vectorSize, vector.length)
}

if (!(vector instanceof Float32Array)) {
Expand Down
4 changes: 2 additions & 2 deletions packages/plugin-data-persistence/src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ export async function getDefaultFileName(format: PersistenceFormat, runtime?: Ru
/* c8 ignore next 3 */
if (runtime === 'deno') {
// @ts-expect-error Deno is only available in Deno
dbName = Deno.env.get('LYRA_DB_NAME') ?? DEFAULT_DB_NAME
dbName = Deno.env.get('ORAMA_DB_NAME') ?? DEFAULT_DB_NAME
} else {
dbName = process?.env?.LYRA_DB_NAME ?? DEFAULT_DB_NAME
dbName = process?.env?.ORAMA_DB_NAME ?? DEFAULT_DB_NAME
}

return `${dbName}.${extension}`
Expand Down
55 changes: 46 additions & 9 deletions packages/plugin-data-persistence/test/index.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { create, insert, Orama, search } from '@orama/orama'
import { create, insert, Orama, search, searchVector } from '@orama/orama'
import t from 'tap'
import { UNSUPPORTED_FORMAT, METHOD_MOVED } from '../src/errors.js'
import {
Expand Down Expand Up @@ -122,21 +122,21 @@ t.test('binary persistence', t => {
await rm(path)
})

t.test('should generate a persistence file on the disk using LYRA_DB_NAME env', async t => {
t.test('should generate a persistence file on the disk using ORAMA_DB_NAME env', async t => {
t.plan(3)

let currentOramaDBNameValue: string | undefined

// @ts-expect-error Deno is only available in Deno
if (typeof Deno !== 'undefined') {
// @ts-expect-error Deno is only available in Deno
currentOramaDBNameValue = Deno.env.get('LYRA_DB_NAME')
currentOramaDBNameValue = Deno.env.get('ORAMA_DB_NAME')

// @ts-expect-error Deno is only available in Deno
Deno.env.set('LYRA_DB_NAME', 'example_db_dump')
Deno.env.set('ORAMA_DB_NAME', 'example_db_dump')
} else {
currentOramaDBNameValue = process.env.LYRA_DB_NAME
process.env.LYRA_DB_NAME = 'example_db_dump'
currentOramaDBNameValue = process.env.ORAMA_DB_NAME
process.env.ORAMA_DB_NAME = 'example_db_dump'
}

const db = await generateTestDBInstance()
Expand Down Expand Up @@ -174,16 +174,16 @@ t.test('binary persistence', t => {
// @ts-expect-error Deno is only available in Deno
if (typeof Deno !== 'undefined') {
// @ts-expect-error Deno is only available in Deno
Deno.env.set('LYRA_DB_NAME', currentOramaDBNameValue)
Deno.env.set('ORAMA_DB_NAME', currentOramaDBNameValue)
} else {
process.env.LYRA_DB_NAME = currentOramaDBNameValue
process.env.ORAMA_DB_NAME = currentOramaDBNameValue
}
}
})
})

t.test('json persistence', t => {
t.plan(2)
t.plan(3)

t.test('should generate a persistence file on the disk with random name and json format', async t => {
t.plan(2)
Expand Down Expand Up @@ -219,6 +219,43 @@ t.test('json persistence', t => {
await rm(path)
})

t.test('should generate a persistence file on the disk with support for vectors', async t => {
  t.plan(1)

  // Documents inserted, in this order, into the database that gets persisted.
  const documents = [
    { text: 'vector 1', vector: [1, 0, 0, 0, 0] },
    { text: 'vector 2', vector: [1, 1, 0, 0, 0] },
    { text: 'vector 3', vector: [0, 0, 0, 0, 0] }
  ]

  // Returns a fresh parameter object per call so both searches receive identical input.
  const buildQuery = () => ({
    vector: [1, 0, 0, 0, 0],
    property: 'vector'
  })

  const original = await create({
    schema: {
      text: 'string',
      vector: 'vector[5]'
    }
  })

  for (const doc of documents) {
    await insert(original, doc)
  }

  // Write the database to disk as JSON, then read it back into a second instance.
  const filePath = await persistToFile(original, 'json', 'test.json')
  const restored = await restoreFromFile('json', 'test.json')

  const originalResults = await searchVector(original, buildQuery())
  const restoredResults = await searchVector(restored, buildQuery())

  // The restored database must return the same hits as the one it was saved from.
  t.same(originalResults.hits, restoredResults.hits)

  // Remove the file created by this test.
  await rm(filePath)
})

t.test('should generate a persistence file on the disk with a given name and json format', async t => {
t.plan(2)

Expand Down

0 comments on commit 7a68ed1

Please sign in to comment.