From 40d0c99cc20abd7a949eaa3eaaf8dcce98523212 Mon Sep 17 00:00:00 2001 From: ibuda Date: Sun, 7 Apr 2019 17:48:34 +0300 Subject: [PATCH] Updated the implementation of Weighted Levenshtein distance and Levenshtein distance. Added new test cases that spotted the bug in the implementation. This fixes #268, along with #239 and #244. --- .../garshol/duke/comparators/Levenshtein.java | 50 ++++++++++--------- .../duke/comparators/WeightedLevenshtein.java | 29 +++++------ .../duke/comparators/LevenshteinTest.java | 16 ++++-- .../comparators/WeightedLevenshteinTest.java | 22 +++++++- 4 files changed, 73 insertions(+), 44 deletions(-) diff --git a/duke-core/src/main/java/no/priv/garshol/duke/comparators/Levenshtein.java b/duke-core/src/main/java/no/priv/garshol/duke/comparators/Levenshtein.java index be81a9bb..80eb5c4e 100644 --- a/duke-core/src/main/java/no/priv/garshol/duke/comparators/Levenshtein.java +++ b/duke-core/src/main/java/no/priv/garshol/duke/comparators/Levenshtein.java @@ -19,7 +19,7 @@ */ public class Levenshtein implements Comparator { - public double compare(String s1, String s2) { + public double compare(String s1, String s2) { int len = Math.min(s1.length(), s2.length()); // we know that if the outcome here is 0.5 or lower, then the @@ -34,7 +34,7 @@ public double compare(String s1, String s2) { // if the strings are equal we can stop right here. if (len == maxlen && s1.equals(s2)) return 1.0; - + // we couldn't shortcut, so now we go ahead and compute the full // metric int dist = Math.min(compactDistance(s1, s2), len); @@ -51,18 +51,20 @@ public boolean isTokenized() { * speed, but still computes the entire matrix. */ public static int distance(String s1, String s2) { - if (s1.length() == 0) - return s2.length(); - if (s2.length() == 0) - return s1.length(); - int s1len = s1.length(); + int s2len = s2.length(); + if (s1len == 0) + return s2len; + if (s2len == 0) + return s1len; + + // we use a flat array for better performance. we address it by // s1ix + s1len * s2ix. this modification improves performance // by about 30%, which is definitely worth the extra complexity. - int[] matrix = new int[(s1len + 1) * (s2.length() + 1)]; - for (int col = 0; col <= s2.length(); col++) - matrix[col * s1len] = col; + int[] matrix = new int[(s1len + 1) * (s2len + 1)]; + for (int col = 0; col <= s2len; col++) + matrix[col * (s1len + 1)] = col; for (int row = 0; row <= s1len; row++) matrix[row] = row; @@ -75,11 +77,11 @@ public static int distance(String s1, String s2) { else cost = 1; - int left = matrix[ix1 + ((ix2 + 1) * s1len)] + 1; - int above = matrix[ix1 + 1 + (ix2 * s1len)] + 1; - int aboveleft = matrix[ix1 + (ix2 * s1len)] + cost; - matrix[ix1 + 1 + ((ix2 + 1) * s1len)] = - Math.min(left, Math.min(above, aboveleft)); + int left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] + 1; + int above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] + 1; + int aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost; + matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] = + Math.min(left, Math.min(above, aboveleft)); } } @@ -89,10 +91,10 @@ public static int distance(String s1, String s2) { // } // System.out.println(); // } - - return matrix[s1len + (s2.length() * s1len)]; + + return matrix[(s1len + 1) * (s2.length() + 1)-1]; } - + // /** // * An optimized version of the Wagner & Fischer algorithm, which // * exploits our knowledge that if the distance is above a certain @@ -138,7 +140,7 @@ public static int distance(String s1, String s2) { // matrix[ix1 + 1 + ((ix2 + 1) * s1len)] = distance; // } // } - + // return matrix[s1len + (s2.length() * s1len)]; // } @@ -163,7 +165,7 @@ public static int distance(String s1, String s2) { // // FIXME: modify to avoid having to initialize // for (int ix = 1; ix < matrix.length; ix++) // matrix[ix] = -1; - + // return computeRecursively(matrix, s1, s2, s1.length(), s2.length()); // } @@ -213,7 +215,7 @@ public static int distance(String s1, String s2) { // else // // it' can't be smaller than above, so no need to compute // left = above; - + // int distance = Math.min(left, Math.min(above, aboveleft)) + cost; // matrix[pos] = distance; // return distance; @@ -233,7 +235,7 @@ public static int compactDistance(String s1, String s2) { // the maximum edit distance there is any point in reporting. int maxdist = Math.min(s1.length(), s2.length()) / 2; - + // we allocate just one column instead of the entire matrix, in // order to save space. this also enables us to implement the // algorithm somewhat faster. the first cell is always the @@ -271,7 +273,7 @@ public static int compactDistance(String s1, String s2) { // aboveleft: column[ix1 - 1] // left: column[ix1] int value = Math.min(Math.min(above, column[ix1 - 1]), column[ix1]) + - cost; + cost; column[ix1 - 1] = above; // write previous above = value; // keep current smallest = Math.min(smallest, value); @@ -285,5 +287,5 @@ public static int compactDistance(String s1, String s2) { // ok, we're done return above; - } + } } \ No newline at end of file diff --git a/duke-core/src/main/java/no/priv/garshol/duke/comparators/WeightedLevenshtein.java b/duke-core/src/main/java/no/priv/garshol/duke/comparators/WeightedLevenshtein.java index 6031ca3a..524f32c1 100644 --- a/duke-core/src/main/java/no/priv/garshol/duke/comparators/WeightedLevenshtein.java +++ b/duke-core/src/main/java/no/priv/garshol/duke/comparators/WeightedLevenshtein.java @@ -46,23 +46,24 @@ public WeightEstimator getEstimator() { public static double distance(String s1, String s2, WeightEstimator weight) { int s1len = s1.length(); + int s2len = s2.length(); if (s1len == 0) return estimateCharacters(s2, weight); - if (s2.length() == 0) + if (s2len == 0) return estimateCharacters(s1, weight); // we use a flat array for better performance. we address it by // s1ix + s1len * s2ix. this modification improves performance // by about 30%, which is definitely worth the extra complexity. - double[] matrix = new double[(s1len + 1) * (s2.length() + 1)]; - for (int col = 0; col <= s2.length(); col++) - matrix[col * s1len] = col; + double[] matrix = new double[(s1len + 1) * (s2len + 1)]; + for (int col = 0; col <= s2len; col++) + matrix[col * (s1len + 1)] = col; for (int row = 0; row <= s1len; row++) matrix[row] = row; for (int ix1 = 0; ix1 < s1len; ix1++) { char ch1 = s1.charAt(ix1); - for (int ix2 = 0; ix2 < s2.length(); ix2++) { + for (int ix2 = 0; ix2 < s2len; ix2++) { double cost; char ch2 = s2.charAt(ix2); if (ch1 == ch2) @@ -70,13 +71,13 @@ public static double distance(String s1, String s2, WeightEstimator weight) { else cost = weight.substitute(ix1, ch1, s2.charAt(ix2)); - double left = matrix[ix1 + ((ix2 + 1) * s1len)] + - weight.delete(ix1, ch1); - double above = matrix[ix1 + 1 + (ix2 * s1len)] + - weight.insert(ix1, ch2); - double aboveleft = matrix[ix1 + (ix2 * s1len)] + cost; - matrix[ix1 + 1 + ((ix2 + 1) * s1len)] = - Math.min(left, Math.min(above, aboveleft)); + double left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] + + weight.delete(ix1, ch1); + double above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] + + weight.insert(ix1, ch2); + double aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost; + matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] = + Math.min(left, Math.min(above, aboveleft)); } } @@ -87,7 +88,7 @@ public static double distance(String s1, String s2, WeightEstimator weight) { // System.out.println(); // } - return matrix[s1len + (s2.length() * s1len)]; + return matrix[(s1len +1) * (s2len + 1) - 1]; } // /** @@ -249,7 +250,7 @@ else if (Character.isDigit(ch)) int type = Character.getType(ch); // 20, 21, 22, 23, 24, 25, 26, 27 if (Character.isSpace(ch) || - (type >= 20 && type <= 27)) + (type >= 20 && type <= 27)) weight = punctuation; } diff --git a/duke-core/src/test/java/no/priv/garshol/duke/comparators/LevenshteinTest.java b/duke-core/src/test/java/no/priv/garshol/duke/comparators/LevenshteinTest.java index 36ad44c5..1fc09490 100644 --- a/duke-core/src/test/java/no/priv/garshol/duke/comparators/LevenshteinTest.java +++ b/duke-core/src/test/java/no/priv/garshol/duke/comparators/LevenshteinTest.java @@ -14,7 +14,7 @@ public class LevenshteinTest { public void setup() { this.comp = new Levenshtein(); } - + // tests for the comparator @Test @@ -31,9 +31,9 @@ public void testComparatorTotallyDifferent() { public void testComparatorOneInFour() { assertEquals(0.75, comp.compare("fooz", "foos")); } - + // tests for the original algorithm - + @Test public void testEmpty() { assertEquals(0, Levenshtein.distance("", "")); @@ -60,13 +60,19 @@ public void testDays() { assertEquals(3, Levenshtein.distance("saturday", "sunday")); assertEquals(3, Levenshtein.distance("sunday", "saturday")); } - + @Test public void testGambol() { assertEquals(2, Levenshtein.distance("gambol", "gumbo")); assertEquals(2, Levenshtein.distance("gumbo", "gambol")); } + @Test + public void testAbc() { + assertEquals(2, Levenshtein.distance("a", "abc")); + assertEquals(2, Levenshtein.distance("abc", "a")); + } + @Test public void testTotallyUnlike() { assertEquals(4, Levenshtein.distance("abcd", "efgh")); @@ -100,7 +106,7 @@ public void testCDays() { assertEquals(3, Levenshtein.compactDistance("saturday", "sunday")); assertEquals(3, Levenshtein.compactDistance("sunday", "saturday")); } - + @Test public void testCGambol() { assertEquals(2, Levenshtein.compactDistance("gambol", "gumbo")); diff --git a/duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java b/duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java index f735b03a..f15ce975 100644 --- a/duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java +++ b/duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java @@ -13,7 +13,7 @@ public class WeightedLevenshteinTest { public void setup() { e = new WeightedLevenshtein.DefaultWeightEstimator(); } - + @Test public void testEmpty() { assertEquals(0.0, WeightedLevenshtein.distance("", "", e)); @@ -43,6 +43,26 @@ public void testSubstitute2() { assertEquals(3.0, WeightedLevenshtein.distance("totanic 1", "titanic 2", e)); } + @Test + public void testAbc() { + assertEquals(2.0, WeightedLevenshtein.distance("abc", "a", e)); + assertEquals(2.0, WeightedLevenshtein.distance("a", "abc", e)); + } + + @Test + public void test123() { + e.setDigitWeight(2.0); + assertEquals(4.0, WeightedLevenshtein.distance("1", "123", e)); + assertEquals(4.0, WeightedLevenshtein.distance("123", "1", e)); + } + + @Test + public void testAlphaNumeric() { + e.setDigitWeight(2.0); + assertEquals(8.0, WeightedLevenshtein.distance("a2c3e", "1b1d1", e)); + assertEquals(8.0, WeightedLevenshtein.distance("1b1d1", "a2c3e", e)); + } + @Test public void testComparator() { WeightedLevenshtein comp = new WeightedLevenshtein();