Skip to content

Commit

Permalink
fix: fixes #797 on edit distance calculation (#823)
Browse files Browse the repository at this point in the history
  • Loading branch information
allevo authored Oct 16, 2024
1 parent ba6cdde commit 0b80412
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 38 deletions.
36 changes: 23 additions & 13 deletions packages/orama/src/components/levenshtein.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,36 @@ export type BoundedMetric = {
* Inspired by:
* https://github.com/Yomguithereal/talisman/blob/86ae55cbd040ff021d05e282e0e6c71f2dde21f8/src/metrics/levenshtein.js#L218-L340
*/
function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
function _boundedLevenshtein(term: string, word: string, tolerance: number): number {
// Handle base cases
if (tolerance < 0) return -1
if (a === b) return 0
if (term === word) return 0

const m = a.length
const n = b.length
const m = term.length
const n = word.length

// Special case for empty strings
if (m === 0) return n <= tolerance ? n : -1
if (n === 0) return m <= tolerance ? m : -1

a = a.toLowerCase()
b = b.toLowerCase()
// term = term.toLowerCase()
// word = word.toLowerCase()

const diff = Math.abs(m - n)
// Special case for prefixes
if (b.startsWith(a) || a.startsWith(b)) return 0
// If the searching word starts with the indexed word, return early.
if (term.startsWith(word)) {
// We just check if the remaining characters are within the tolerance
return diff <= tolerance ? diff : -1
}
// If the indexed word starts with the searching word, return early.
if (word.startsWith(term)) {
// any prefixed word is within the tolerance
return 0
}

// If the length difference is greater than the tolerance, return early
if (Math.abs(m - n) > tolerance) return -1
if (diff > tolerance) return -1

// Initialize the matrix
const matrix: number[][] = []
Expand All @@ -41,7 +51,7 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
for (let i = 1; i <= m; i++) {
let rowMin = Infinity
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
if (term[i - 1] === word[j - 1]) {
matrix[i][j] = matrix[i - 1][j - 1]
} else {
matrix[i][j] = Math.min(
Expand All @@ -68,17 +78,17 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
* It assumes that:
* - tolerance >= ||term| - |w|| >= 0
*/
export function boundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric {
const distance = _boundedLevenshtein(a, b, tolerance)
export function boundedLevenshtein(term: string, w: string, tolerance: number): BoundedMetric {
const distance = _boundedLevenshtein(term, w, tolerance)
return {
distance,
isBounded: distance >= 0
}
}

// This is only used internally, keep in sync with the previous one
export function syncBoundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric {
const distance = _boundedLevenshtein(a, b, tolerance)
export function syncBoundedLevenshtein(term: string, w: string, tolerance: number): BoundedMetric {
const distance = _boundedLevenshtein(term, w, tolerance)
return {
distance,
isBounded: distance >= 0
Expand Down
8 changes: 4 additions & 4 deletions packages/orama/src/trees/radix.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,12 @@ export class RadixNode {
while (stack.length > 0) {
const { node, index, tolerance } = stack.pop()!

if (tolerance < 0) {
if (node.w.startsWith(term)) {
node.findAllWords(output, term, false, 0)
continue
}

if (node.w.startsWith(term)) {
node.findAllWords(output, term, false, 0)
if (tolerance < 0) {
continue
}

Expand All @@ -198,7 +198,7 @@ export class RadixNode {
if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
output[w] = []
}
if (getOwnProperty(output, w) != null && docIDs.size > 0) {
if (getOwnProperty(output, w) !== undefined && docIDs.size > 0) {
const docs = new Set(output[w])

for (const docID of docIDs) {
Expand Down
90 changes: 69 additions & 21 deletions packages/orama/tests/levenshtein.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,13 @@ t.test('syncBoundedLevenshtein', async (t) => {
'Suffix should return correct distance and isBounded true if within tolerance'
)

// Test case sensitivity
t.same(
syncBoundedLevenshtein('Hello', 'hello', 1),
{ distance: 0, isBounded: true },
'Case difference should not be counted in the distance'
)
// This never happens in the real world: the function accepts tokenized strings
// so the strings are always the same case
// t.same(
// syncBoundedLevenshtein('Hello', 'hello', 1),
// { distance: 0, isBounded: true },
// 'Case difference should not be counted in the distance'
// )

// Test with tolerance 0
t.same(
Expand Down Expand Up @@ -172,8 +173,11 @@ t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp',

t.match(boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 })

t.match(boundedLevenshtein('Chris', 'Christopher', 0), { isBounded: true, distance: 0 })
t.match(boundedLevenshtein('Chris', 'Christopher', 1), { isBounded: true, distance: 0 })
t.match(boundedLevenshtein('Christopher', 'Chris' , 0), { isBounded: false, distance: -1 })

t.match(boundedLevenshtein('Christopher', 'Chris', 1), { isBounded: false, distance: -1 })
// To return true, the prefix must be within tolerance
t.match(boundedLevenshtein('Christopher', 'Chris', 'Christopher'.length - 'Chris'.length), { isBounded: true, distance: 6 })

t.end()
})
Expand All @@ -186,28 +190,72 @@ t.test('Issue #744', async (t) => {
} as const
})

await insertMultiple(index, [
{ libelle: 'ABRICOT MOELLEUX' },
{ libelle: 'MOELLEUX CHOC BIO' },
{ libelle: 'CREPE MOELLEUSE' },
{ libelle: 'OS MOELLE' }
])
const docs = [
{ id: '1', libelle: 'abricot moelleux' },
{ id: '2', libelle: 'moelleux choc bio' },
{ id: '3', libelle: 'crepe moelleuse' },
{ id: '4', libelle: 'os moelle' }
]
await insertMultiple(index, docs)

const searchTerm = 'moelleux'

// doc1 and doc2 match searchTerm exactly
t.equal(syncBoundedLevenshtein(searchTerm, searchTerm, 0).isBounded, true)
// doc3 doesn't match searchTerm with tolerance 1
t.equal(syncBoundedLevenshtein(searchTerm, 'moelleuse', 1).isBounded, false)
// but doc3 matches searchTerm with tolerance 2 ("x" => "se" are 2 operations)
t.equal(syncBoundedLevenshtein(searchTerm, 'moelleuse', 2).isBounded, true)
// doc4 doesn't match searchTerm with tolerance 1
t.equal(syncBoundedLevenshtein(searchTerm, 'moelle', 1).isBounded, false)
// but doc4 matches searchTerm with tolerance 2 ("ux" => "" are 2 operations)
t.equal(syncBoundedLevenshtein('moelle', searchTerm, 2).isBounded, true)

const s1 = await search(index, {
term: 'moelleux'
term: searchTerm
})
t.equal(s1.count, 2)
t.strictSame(s1.hits.map(h => h.id), ['1', '2'])

const s2 = await search(index, {
term: 'moelleux',
tolerance: 1,
term: searchTerm,
tolerance: 0,
})
t.equal(s2.count, 2)
t.strictSame(s2.hits.map(h => h.id), ['1', '2'])

const s3 = await search(index, {
term: 'moelleux',
term: searchTerm,
tolerance: 1,
})
t.equal(s3.count, 2)
t.strictSame(s3.hits.map(h => h.id), ['1', '2'])

const s4 = await search(index, {
term: searchTerm,
tolerance: 2,
})
t.equal(s4.count, 4)
t.strictSame(s4.hits.map(h => h.id), ['3', '4', '1', '2'])
})

t.equal(s1.count, 2)
t.equal(s2.count, 1)
t.equal(s3.count, 4)
// https://github.com/askorama/orama/issues/797
// Regression test for issue #797: with tolerance 1, searching "scrol" must
// return only the "Scroll" document and not the single-letter "S" document.
t.test('Issue #797', async t => {
// Index with a single string field.
const db = await create({
schema: {
name: 'string'
} as const
})
// Two documents: a one-character name and the name the term is close to.
await insertMultiple(db, [
{ id: '1', name: "S" },
{ id: '2', name: "Scroll" },
])

// The term is lower-case and one character shorter than "Scroll".
// NOTE(review): presumably the tokenizer lower-cases indexed values, so the
// term is compared against "scroll" — confirm against the tokenizer.
const res = await search(db, {
term: "scrol",
tolerance: 1,
})

// Exactly one hit is expected, and it must be document '2' ("Scroll").
t.equal(res.count, 1)
t.equal(res.hits[0].id, '2')
})

0 comments on commit 0b80412

Please sign in to comment.