Skip to content

Commit

Permalink
fixup! refactor: move the internal id to orama
Browse files Browse the repository at this point in the history
  • Loading branch information
H4ad committed Jul 11, 2023
1 parent 7f16262 commit a38619a
Show file tree
Hide file tree
Showing 13 changed files with 163 additions and 113 deletions.
19 changes: 13 additions & 6 deletions packages/orama/src/components/algorithms.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { createError } from '../errors.js'
import { TokenScore, BM25Params } from '../types.js'
import { InternalDocumentID } from './internal-document-id-store.js';

export function prioritizeTokenScores(arrays: TokenScore[][], boost: number, threshold = 1): TokenScore[] {
if (boost === 0) {
throw createError('INVALID_BOOST_VALUE')
}

const tokenMap: Record<number, number> = {}
const tokenMap = new Map<InternalDocumentID, number>()

const mapsLength = arrays.length
for (let i = 0; i < mapsLength; i++) {
Expand All @@ -16,17 +17,23 @@ export function prioritizeTokenScores(arrays: TokenScore[][], boost: number, thr
for (let j = 0; j < entriesLength; j++) {
const [token, score] = arr[j]
const boostScore = score * boost
const oldScore = tokenMap.get(token)

if (token in tokenMap) {
tokenMap[token] *= 1.5 + boostScore
if (oldScore !== undefined) {
tokenMap.set(token, oldScore * 1.5 + boostScore)
} else {
tokenMap[token] = boostScore
tokenMap.set(token, boostScore);
}
}
}

const results = Object.keys(tokenMap)
.map(key => [+key, tokenMap[+key]] as TokenScore)
const tokenScores: TokenScore[] = []

for (const tokenStore of tokenMap.entries()) {
tokenScores.push(tokenStore);
}

const results = tokenScores
.sort((a, b) => b[1] - a[1])

// If threshold is 1, it means we will return all the results with at least one search term,
Expand Down
10 changes: 5 additions & 5 deletions packages/orama/src/components/documents-store.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { DocumentID, getInternalDocumentId, InternalDocumentID, InternalDocumentIDStore } from "./internal-document-id-store.js";
import { Document, IDocumentsStore, OpaqueDocumentStore } from "../types.js";
import { DocumentID, getInternalDocumentId, InternalDocumentID, InternalDocumentIDStore } from './internal-document-id-store.js';
import { Document, IDocumentsStore, OpaqueDocumentStore } from '../types.js';

export interface DocumentsStore extends OpaqueDocumentStore {
sharedInternalDocumentStore: InternalDocumentIDStore;
Expand All @@ -17,7 +17,7 @@ export async function create(sharedInternalDocumentStore: InternalDocumentIDStor
}
}

export async function get(store: DocumentsStore, id: string): Promise<Document | undefined> {
export async function get(store: DocumentsStore, id: DocumentID): Promise<Document | undefined> {
const internalId = getInternalDocumentId(store.sharedInternalDocumentStore, id);

return store.docs[internalId]
Expand All @@ -34,8 +34,8 @@ export async function getMultiple(store: DocumentsStore, ids: DocumentID[]): Pro
return found
}

/**
 * Returns every document held by the store.
 *
 * The value returned is `store.docs` itself, only re-typed with a cast; no copy
 * is made in this function.
 *
 * NOTE(review): this span is a rendered diff hunk, so the signature and the
 * return statement each appear twice. The first pair types the result as
 * `Record<string, Document>` (via `as unknown as`), the second pair as
 * `Record<InternalDocumentID, Document>`.
 *
 * @param store - Documents store whose `docs` map is returned.
 * @returns A promise resolving to the `docs` map.
 */
export async function getAll(store: DocumentsStore): Promise<Record<string, Document>> {
return store.docs as unknown as Record<string, Document>
export async function getAll(store: DocumentsStore): Promise<Record<InternalDocumentID, Document>> {
return store.docs as Record<InternalDocumentID, Document>;
}

export async function store(store: DocumentsStore, id: DocumentID, doc: Document): Promise<boolean> {
Expand Down
6 changes: 4 additions & 2 deletions packages/orama/src/components/filters.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
export function intersectFilteredIDs(filtered: number[], lookedUp: [number, number][]): [number, number][] {
import { InternalDocumentID } from './internal-document-id-store.js';

export function intersectFilteredIDs(filtered: InternalDocumentID[], lookedUp: [InternalDocumentID, number][]): [InternalDocumentID, number][] {
const map = new Map<number, boolean>()
const result: [number, number][] = []

Expand All @@ -8,7 +10,7 @@ export function intersectFilteredIDs(filtered: number[], lookedUp: [number, numb

for (const [id, score] of lookedUp) {
if (map.has(id)) {
result.push([+id, score])
result.push([id, score])
map.delete(id)
}
}
Expand Down
2 changes: 1 addition & 1 deletion packages/orama/src/components/groups.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { Orama, ScalarSearchableValue, TokenScore, GroupByParams, GroupResult, Result, Reduce } from '../types.js'
import { createError } from '../errors.js'
import { getNested, intersect } from '../utils.js'
import { getDocumentIdFromInternalId } from "./internal-document-id-store.js";
import { getDocumentIdFromInternalId } from './internal-document-id-store.js';

interface PropertyGroup {
property: string
Expand Down
80 changes: 55 additions & 25 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,41 @@
import { DocumentID, getInternalDocumentId, InternalDocumentID, InternalDocumentIDStore } from "./internal-document-id-store.js";
import { createError } from "../errors.js";
import { create as avlCreate, find as avlFind, greaterThan as avlGreaterThan, insert as avlInsert, lessThan as avlLessThan, Node as AVLNode, rangeSearch as avlRangeSearch, removeDocument as avlRemoveDocument } from "../trees/avl.js";
import { create as radixCreate, find as radixFind, insert as radixInsert, Node as RadixNode, removeDocumentByWord as radixRemoveDocument } from "../trees/radix.js";
import { ArraySearchableType, BM25Params, ComparisonOperator, IIndex, OpaqueDocumentStore, OpaqueIndex, Orama, ScalarSearchableType, Schema, SearchableType, SearchableValue, SearchContext, Tokenizer, TokenScore } from "../types.js";
import { intersect } from "../utils.js";
import { BM25 } from "./algorithms.js";
import { getInnerType, isArrayType } from "./defaults.js";
import { createError } from '../errors.js'
import {
create as avlCreate,
find as avlFind,
greaterThan as avlGreaterThan,
insert as avlInsert,
lessThan as avlLessThan,
Node as AVLNode,
rangeSearch as avlRangeSearch,
removeDocument as avlRemoveDocument,
} from '../trees/avl.js'
import {
create as radixCreate,
find as radixFind,
insert as radixInsert,
Node as RadixNode,
removeDocumentByWord as radixRemoveDocument,
} from '../trees/radix.js'
import {
ArraySearchableType,
BM25Params,
ComparisonOperator,
IIndex,
OpaqueDocumentStore,
OpaqueIndex,
Orama,
ScalarSearchableType,
Schema,
SearchableType,
SearchableValue,
SearchContext,
Tokenizer,
TokenScore,
} from '../types.js'
import { intersect } from '../utils.js'
import { BM25 } from './algorithms.js'
import { getInnerType, isArrayType } from './defaults.js'
import { DocumentID, getInternalDocumentId, InternalDocumentID, InternalDocumentIDStore } from './internal-document-id-store.js'

export type FrequencyMap = {
[property: string]: {
Expand All @@ -15,7 +45,7 @@ export type FrequencyMap = {
}
| undefined
}
};
}

export type BooleanIndex = {
true: InternalDocumentID[]
Expand All @@ -38,11 +68,11 @@ export type DefaultIndex = IIndex<Index>
export async function insertDocumentScoreParameters(
index: Index,
prop: string,
id: string,
id: DocumentID,
tokens: string[],
docsCount: number,
): Promise<void> {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)

index.avgFieldLength[prop] = ((index.avgFieldLength[prop] ?? 0) * (docsCount - 1) + tokens.length) / docsCount
index.fieldLengths[prop][internalId] = tokens.length
Expand All @@ -52,7 +82,7 @@ export async function insertDocumentScoreParameters(
export async function insertTokenScoreParameters(
index: Index,
prop: string,
id: string,
id: DocumentID,
tokens: string[],
token: string,
): Promise<void> {
Expand All @@ -64,7 +94,7 @@ export async function insertTokenScoreParameters(
}
}

const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)
const tf = tokenFrequency / tokens.length

index.frequencies[prop][internalId]![token] = tf
Expand All @@ -80,10 +110,10 @@ export async function insertTokenScoreParameters(
export async function removeDocumentScoreParameters(
index: Index,
prop: string,
id: string,
id: DocumentID,
docsCount: number,
): Promise<void> {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)

index.avgFieldLength[prop] =
(index.avgFieldLength[prop] * docsCount - index.fieldLengths[prop][internalId]!) / (docsCount - 1)
Expand Down Expand Up @@ -118,7 +148,7 @@ export async function calculateResultScores<I extends OpaqueIndex, D extends Opa
// Calculate TF-IDF value for each term, in each document, for each index.
const documentIDsLength = documentIDs.length
for (let k = 0; k < documentIDsLength; k++) {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, documentIDs[k]);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, documentIDs[k])
const tf = oramaFrequencies?.[internalId]?.[term] ?? 0

const bm25 = BM25(
Expand Down Expand Up @@ -177,7 +207,7 @@ export async function create(
case 'string[]':
index.indexes[path] = radixCreate()
index.avgFieldLength[path] = 0
index.frequencies[path] = {};
index.frequencies[path] = {}
index.tokenOccurrences[path] = {}
index.fieldLengths[path] = {}
break
Expand All @@ -196,14 +226,14 @@ async function insertScalar(
implementation: IIndex<Index>,
index: Index,
prop: string,
id: string,
id: DocumentID,
value: SearchableValue,
schemaType: ScalarSearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number,
): Promise<void> {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)

switch (schemaType) {
case 'boolean': {
Expand All @@ -216,10 +246,10 @@ async function insertScalar(
break
case 'string': {
const tokens = await tokenizer.tokenize(value as string, language, prop)
await implementation.insertDocumentScoreParameters(index, prop, id, tokens, docsCount)
await implementation.insertDocumentScoreParameters(index, prop, internalId, tokens, docsCount)

for (const token of tokens) {
await implementation.insertTokenScoreParameters(index, prop, id, tokens, token)
await implementation.insertTokenScoreParameters(index, prop, internalId, tokens, token)

radixInsert(index.indexes[prop] as RadixNode, token, internalId)
}
Expand All @@ -233,7 +263,7 @@ export async function insert(
implementation: DefaultIndex,
index: Index,
prop: string,
id: string,
id: DocumentID,
value: SearchableValue,
schemaType: SearchableType,
language: string | undefined,
Expand Down Expand Up @@ -267,14 +297,14 @@ async function removeScalar(
implementation: IIndex<Index>,
index: Index,
prop: string,
id: string,
id: DocumentID,
value: SearchableValue,
schemaType: ScalarSearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number,
): Promise<boolean> {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id);
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)

switch (schemaType) {
case 'number': {
Expand Down Expand Up @@ -307,7 +337,7 @@ export async function remove(
implementation: DefaultIndex,
index: Index,
prop: string,
id: string,
id: DocumentID,
value: SearchableValue,
schemaType: SearchableType,
language: string | undefined,
Expand Down
50 changes: 27 additions & 23 deletions packages/orama/src/components/internal-document-id-store.ts
Original file line number Diff line number Diff line change
@@ -1,66 +1,70 @@
import { Orama } from '../types.js';

// DocumentID: an identifier as accepted by the helpers in this file, either a string or a number.
// InternalDocumentID: the numeric identifier those helpers hand back.
// NOTE(review): rendered diff - each alias is listed twice (with and without a trailing semicolon).
export type DocumentID = string | number;
export type InternalDocumentID = number;
export type DocumentID = string | number
export type InternalDocumentID = number

/**
 * Bidirectional mapping between string document ids and numeric internal ids.
 *
 * - `idToInternalId`: string id -> internal id.
 * - `internalIdToId`: string ids in allocation order; the helpers in this file
 *   store internal id `n` at index `n - 1`.
 * - `save`: produces the serializable form of a store.
 * - `load`: restores the store attached to an Orama instance from a raw value.
 *
 * NOTE(review): rendered diff - the member list appears twice (with and without
 * trailing semicolons).
 */
export type InternalDocumentIDStore = {
idToInternalId: Map<string, number>;
internalIdToId: string[];
save: (store: InternalDocumentIDStore) => unknown;
load: (orama: Orama, raw: unknown) => void;
};
idToInternalId: Map<string, number>
internalIdToId: string[]
save: (store: InternalDocumentIDStore) => unknown
load: (orama: Orama, raw: unknown) => void
}

/**
 * Creates an empty id store: a fresh `Map`, an empty id array, and the
 * module-level `save` / `load` functions attached as members.
 *
 * NOTE(review): rendered diff - the object literal's closing line appears twice
 * (`};` and `}`).
 *
 * @returns A new, empty `InternalDocumentIDStore`.
 */
export function createInternalDocumentIDStore(): InternalDocumentIDStore {
return {
idToInternalId: new Map(),
internalIdToId: [],
save,
load,
};
}
}

/**
 * Produces the serializable form of the id store.
 *
 * Only `internalIdToId` is emitted; `idToInternalId` is not part of the output.
 * The array is passed through by reference, not copied.
 *
 * NOTE(review): rendered diff - the object literal's closing line appears twice
 * (`};` and `}`).
 *
 * @param store - Store to serialize.
 * @returns An object of shape `{ internalIdToId }`.
 */
export function save(store: InternalDocumentIDStore): unknown {
return {
internalIdToId: store.internalIdToId,
};
}
}

/**
 * Restores `orama.internalDocumentIDStore` from a previously saved value.
 *
 * The existing map is cleared and the id array is replaced with a new one.
 * Both are then rebuilt from `raw.internalIdToId`, giving the entry at index
 * `i` the internal id `i + 1`.
 *
 * NOTE(review): `raw` is cast without validation - presumably it is always the
 * output of `save`; confirm against callers.
 * NOTE(review): rendered diff - each statement appears twice (with and without
 * a trailing semicolon).
 *
 * @param orama - Instance whose `internalDocumentIDStore` is overwritten.
 * @param raw - Value read as `{ internalIdToId: string[] }`.
 */
export function load(orama: Orama, raw: unknown): void {
const { internalIdToId } = raw as InternalDocumentIDStore;
const { internalIdToId } = raw as InternalDocumentIDStore

orama.internalDocumentIDStore.idToInternalId.clear();
orama.internalDocumentIDStore.internalIdToId = [];
orama.internalDocumentIDStore.idToInternalId.clear()
orama.internalDocumentIDStore.internalIdToId = []

for (let i = 0; i < internalIdToId.length; i++) {
orama.internalDocumentIDStore.idToInternalId.set(internalIdToId[i], i + 1);
orama.internalDocumentIDStore.internalIdToId.push(internalIdToId[i]);
orama.internalDocumentIDStore.idToInternalId.set(internalIdToId[i], i + 1)
orama.internalDocumentIDStore.internalIdToId.push(internalIdToId[i])
}
}

/**
 * Resolves a document id to its numeric internal id, allocating one if needed.
 *
 * String ids are looked up in `store.idToInternalId`. On a miss the next id
 * (`idToInternalId.size + 1`) is allocated and recorded in both
 * `idToInternalId` and `internalIdToId`. Because of that allocation, this
 * function mutates `store` for string ids it has not seen before.
 *
 * Numeric ids are handled after the string branch:
 * - the `id > store.internalIdToId.length` check sends a number larger than the
 *   count of known ids back through the string path as `id.toString()`;
 * - otherwise the number is returned unchanged.
 *
 * NOTE(review): this span is a rendered diff, so lines from two revisions are
 * interleaved. Statements in the string branch appear twice (with and without a
 * trailing semicolon). `return currentId;` and `return id as number;` appear to
 * belong to the earlier revision, and the `id > ...length` check with the final
 * `return id` to the later one - confirm against the repository.
 *
 * @param store - Id store to read and, on a miss, extend.
 * @param id - String or numeric document id.
 * @returns The internal id for `id`.
 */
export function getInternalDocumentId(store: InternalDocumentIDStore, id: DocumentID): InternalDocumentID {
if (typeof id === "string") {
const internalId = store.idToInternalId.get(id);
const internalId = store.idToInternalId.get(id)

// Truthiness check: ids written by this function and by `load` start at 1, so 0 is never a stored value.
if (internalId) {
return internalId;
return internalId
}

const currentId = store.idToInternalId.size + 1;
const currentId = store.idToInternalId.size + 1

store.idToInternalId.set(id, currentId);
store.internalIdToId.push(id);
store.idToInternalId.set(id, currentId)
store.internalIdToId.push(id)

return currentId
}

return currentId;
if (id > store.internalIdToId.length) {
return getInternalDocumentId(store, id.toString())
}

return id as number;
return id
}

/**
 * Maps an internal id back to the string document id stored for it.
 *
 * Internal id `n` is read from `store.internalIdToId[n - 1]`.
 *
 * NOTE(review): only the upper bound is checked; an `internalId` of 0 or below
 * is not rejected and would index before the start of the array.
 * NOTE(review): rendered diff - the `throw` and `return` lines appear twice
 * (with and without a trailing semicolon).
 *
 * @param store - Id store to read from.
 * @param internalId - Internal id to resolve.
 * @returns The string id stored at position `internalId - 1`.
 * @throws Error when `internalId` is greater than the number of stored ids.
 */
export function getDocumentIdFromInternalId(store: InternalDocumentIDStore, internalId: InternalDocumentID): string {
if (store.internalIdToId.length < internalId) {
throw new Error(`Invalid internalId ${internalId}`);
throw new Error(`Invalid internalId ${internalId}`)
}

return store.internalIdToId[internalId - 1];
return store.internalIdToId[internalId - 1]
}

Loading

0 comments on commit a38619a

Please sign in to comment.