fix: fixes #797 on edit distance calculation (#823)

oramasearch · Oct 16, 2024 · 0b80412 · 0b80412
1 parent ba6cdde
commit 0b80412
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 38 deletions.
diff --git a/packages/orama/src/components/levenshtein.ts b/packages/orama/src/components/levenshtein.ts
@@ -7,26 +7,36 @@ export type BoundedMetric = {
  * Inspired by:
  * https://github.com/Yomguithereal/talisman/blob/86ae55cbd040ff021d05e282e0e6c71f2dde21f8/src/metrics/levenshtein.js#L218-L340
  */
-function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
+function _boundedLevenshtein(term: string, word: string, tolerance: number): number {
   // Handle base cases
   if (tolerance < 0) return -1
-  if (a === b) return 0
+  if (term === word) return 0
 
-  const m = a.length
-  const n = b.length
+  const m = term.length
+  const n = word.length
 
   // Special case for empty strings
   if (m === 0) return n <= tolerance ? n : -1
   if (n === 0) return m <= tolerance ? m : -1
 
-  a = a.toLowerCase()
-  b = b.toLowerCase()
+  // term = term.toLowerCase()
+  // word = word.toLowerCase()
 
+  const diff = Math.abs(m - n)
   // Special case for prefixes
-  if (b.startsWith(a) || a.startsWith(b)) return 0
+  // If the searching word starts with the indexed word, return early.
+  if (term.startsWith(word)) {
+    // We just check if the remaining characters are within the tolerance
+    return diff <= tolerance ? diff : -1
+  }
+  // If the indexed word starts with the searching word, return early.
+  if (word.startsWith(term)) {
+    // any prefixed word is within the tolerance
+    return 0
+  }
 
   // If the length difference is greater than the tolerance, return early
-  if (Math.abs(m - n) > tolerance) return -1
+  if (diff > tolerance) return -1
 
   // Initialize the matrix
   const matrix: number[][] = []
@@ -41,7 +51,7 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
   for (let i = 1; i <= m; i++) {
     let rowMin = Infinity
     for (let j = 1; j <= n; j++) {
-      if (a[i - 1] === b[j - 1]) {
+      if (term[i - 1] === word[j - 1]) {
         matrix[i][j] = matrix[i - 1][j - 1]
       } else {
         matrix[i][j] = Math.min(
@@ -68,17 +78,17 @@ function _boundedLevenshtein(a: string, b: string, tolerance: number): number {
  * It assumes that:
  * - tolerance >= ||a| - |b|| >= 0
  */
-export function boundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric {
-  const distance = _boundedLevenshtein(a, b, tolerance)
+export function boundedLevenshtein(term: string, w: string, tolerance: number): BoundedMetric {
+  const distance = _boundedLevenshtein(term, w, tolerance)
   return {
     distance,
     isBounded: distance >= 0
   }
 }
 
 // This is only used internally, keep in sync with the previous one
-export function syncBoundedLevenshtein(a: string, b: string, tolerance: number): BoundedMetric {
-  const distance = _boundedLevenshtein(a, b, tolerance)
+export function syncBoundedLevenshtein(term: string, w: string, tolerance: number): BoundedMetric {
+  const distance = _boundedLevenshtein(term, w, tolerance)
   return {
     distance,
     isBounded: distance >= 0

diff --git a/packages/orama/src/trees/radix.ts b/packages/orama/src/trees/radix.ts
@@ -183,12 +183,12 @@ export class RadixNode {
     while (stack.length > 0) {
       const { node, index, tolerance } = stack.pop()!
 
-      if (tolerance < 0) {
+      if (node.w.startsWith(term)) {
+        node.findAllWords(output, term, false, 0)
         continue
       }
 
-      if (node.w.startsWith(term)) {
-        node.findAllWords(output, term, false, 0)
+      if (tolerance < 0) {
         continue
       }
 
@@ -198,7 +198,7 @@ export class RadixNode {
           if (syncBoundedLevenshtein(term, w, originalTolerance).isBounded) {
             output[w] = []
           }
-          if (getOwnProperty(output, w) != null && docIDs.size > 0) {
+          if (getOwnProperty(output, w) !== undefined && docIDs.size > 0) {
             const docs = new Set(output[w])
 
             for (const docID of docIDs) {

diff --git a/packages/orama/tests/levenshtein.test.ts b/packages/orama/tests/levenshtein.test.ts
@@ -54,12 +54,13 @@ t.test('syncBoundedLevenshtein', async (t) => {
     'Suffix should return correct distance and isBounded true if within tolerance'
   )
 
-  // Test case sensitivity
-  t.same(
-    syncBoundedLevenshtein('Hello', 'hello', 1),
-    { distance: 0, isBounded: true },
-    'Case difference should not be counted in the distance'
-  )
+  // This never happens in the real world: the function accepts tokenized strings
+  // so the strings are always in the same case
+  // t.same(
+  //   syncBoundedLevenshtein('Hello', 'hello', 1),
+  //   { distance: 0, isBounded: true },
+  //   'Case difference should not be counted in the distance'
+  // )
 
   // Test with tolerance 0
   t.same(
@@ -172,8 +173,11 @@ t.test('syncBoundedLevenshtein substrings are ok even if with tolerance pppppp',
 
   t.match(boundedLevenshtein('Crxy', 'Caig', 3), { isBounded: true, distance: 3 })
 
-  t.match(boundedLevenshtein('Chris', 'Christopher', 0), { isBounded: true, distance: 0 })
-  t.match(boundedLevenshtein('Chris', 'Christopher', 1), { isBounded: true, distance: 0 })
+  t.match(boundedLevenshtein('Christopher', 'Chris' , 0), { isBounded: false, distance: -1 })
+
+  t.match(boundedLevenshtein('Christopher', 'Chris', 1), { isBounded: false, distance: -1 })
+  // To return true, the prefix must be within tolerance
+  t.match(boundedLevenshtein('Christopher', 'Chris', 'Christopher'.length - 'Chris'.length), { isBounded: true, distance: 6 })
 
   t.end()
 })
@@ -186,28 +190,72 @@ t.test('Issue #744', async (t) => {
     } as const
   })
 
-  await insertMultiple(index, [
-    { libelle: 'ABRICOT MOELLEUX' },
-    { libelle: 'MOELLEUX CHOC BIO' },
-    { libelle: 'CREPE MOELLEUSE' },
-    { libelle: 'OS MOELLE' }
-  ])
+  const docs = [
+    { id: '1', libelle: 'abricot moelleux' },
+    { id: '2', libelle: 'moelleux choc bio' },
+    { id: '3', libelle: 'crepe moelleuse' },
+    { id: '4', libelle: 'os moelle' }
+  ]
+  await insertMultiple(index, docs)
+
+  const searchTerm = 'moelleux'
+
+  // doc1 and doc2 match searchTerm exactly
+  t.equal(syncBoundedLevenshtein(searchTerm, searchTerm, 0).isBounded, true)
+  // doc3 doesn't match searchTerm with tolerance 1
+  t.equal(syncBoundedLevenshtein(searchTerm, 'moelleuse', 1).isBounded, false)
+  // but doc3 matches searchTerm with tolerance 2 ("x" => "se" takes 2 operations)
+  t.equal(syncBoundedLevenshtein(searchTerm, 'moelleuse', 2).isBounded, true)
+  // doc4 doesn't match searchTerm with tolerance 1
+  t.equal(syncBoundedLevenshtein(searchTerm, 'moelle', 1).isBounded, false)
+  // but doc4 matches searchTerm with tolerance 2 ("ux" => "" takes 2 operations)
+  t.equal(syncBoundedLevenshtein('moelle', searchTerm, 2).isBounded, true)
 
   const s1 = await search(index, {
-    term: 'moelleux'
+    term: searchTerm
   })
+  t.equal(s1.count, 2)
+  t.strictSame(s1.hits.map(h => h.id), ['1', '2'])
 
   const s2 = await search(index, {
-    term: 'moelleux',
-    tolerance: 1,
+    term: searchTerm,
+    tolerance: 0,
   })
+  t.equal(s2.count, 2)
+  t.strictSame(s2.hits.map(h => h.id), ['1', '2'])
 
   const s3 = await search(index, {
-    term: 'moelleux',
+    term: searchTerm,
+    tolerance: 1,
+  })
+  t.equal(s3.count, 2)
+  t.strictSame(s3.hits.map(h => h.id), ['1', '2'])
+
+  const s4 = await search(index, {
+    term: searchTerm,
     tolerance: 2,
   })
+  t.equal(s4.count, 4)
+  t.strictSame(s4.hits.map(h => h.id), ['3', '4', '1', '2'])
+})
 
-  t.equal(s1.count, 2)
-  t.equal(s2.count, 1)
-  t.equal(s3.count, 4)
+// https://github.com/askorama/orama/issues/797
+t.test('Issue #797', async t => {
+  const db = await create({
+    schema: {
+        name: 'string'
+    } as const
+  })
+  await insertMultiple(db, [
+    { id: '1', name: "S" },
+    { id: '2', name: "Scroll" },
+  ])
+
+  const res = await search(db, {
+    term: "scrol",
+    tolerance: 1,
+  })
+
+  t.equal(res.count, 1)
+  t.equal(res.hits[0].id, '2')
 })