Skip to content

Commit

Permalink
Use a flat vector in Damerau-Levenshtein
Browse files Browse the repository at this point in the history
Instead of representing a 2D grid with a vector of vectors, just use a single
vector to improve performance. We can do this since the dimensions are fixed.

This method was suggested by @lovasoa as an alternative to adding a dependency
on the ndarray crate.

In my benchmark testing, the new approach is about as fast as using ndarray. On my
machine, the original approach takes about 22,000 ns/iter, whereas the new
approach takes about 17,000 ns/iter.

See #34 for more context.
  • Loading branch information
dguo committed May 5, 2019
1 parent 5907665 commit 3978cd7
Showing 1 changed file with 21 additions and 16 deletions.
37 changes: 21 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,12 @@ pub fn osa_distance(a: &str, b: &str) -> usize {

}

/// Translates the two-dimensional coordinate `(i, j)` into the matching
/// position inside a single flat vector that stands in for a fixed-size
/// vector of vectors.
///
/// `width` is the number of elements stored per step of `j`; `i` is the
/// offset within that run of `width` elements.
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    let row_start = j * width;
    row_start + i
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
Expand All @@ -326,18 +332,19 @@ pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) ->
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }

let mut distances = vec![vec![0; b_len + 2]; a_len + 2];
let width = a_len + 2;
let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
let max_distance = a_len + b_len;
distances[0][0] = max_distance;
distances[0] = max_distance;

for i in 0..(a_len + 1) {
distances[i + 1][0] = max_distance;
distances[i + 1][1] = i;
distances[flat_index(i + 1, 0, width)] = max_distance;
distances[flat_index(i + 1, 1, width)] = i;
}

for j in 0..(b_len + 1) {
distances[0][j + 1] = max_distance;
distances[1][j + 1] = j;
distances[flat_index(0, j + 1, width)] = max_distance;
distances[flat_index(1, j + 1, width)] = j;
}

let mut elems: HashMap<Elem, usize> = HashMap::new();
Expand All @@ -359,22 +366,20 @@ pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) ->
db = j;
}

let substitution_cost = distances[i][j] + cost;
let insertion_cost = distances[i][j + 1] + 1;
let deletion_cost = distances[i + 1][j] + 1;
let transposition_cost = distances[k][l] + (i - k - 1) + 1 +
(j - l - 1);
let substitution_cost = distances[flat_index(i, j, width)] + cost;
let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
let transposition_cost = distances[flat_index(k, l, width)] +
(i - k - 1) + 1 + (j - l - 1);

distances[i + 1][j + 1] = min(substitution_cost,
min(insertion_cost,
min(deletion_cost,
transposition_cost)));
distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost,
min(insertion_cost, min(deletion_cost, transposition_cost)));
}

elems.insert(a_elems[i - 1].clone(), i);
}

distances[a_len + 1][b_len + 1]
distances[flat_index(a_len + 1, b_len + 1, width)]
}

/// Like optimal string alignment, but substrings can be edited an unlimited
Expand Down

0 comments on commit 3978cd7

Please sign in to comment.