Remove support for sparse vectors

tdebatty · Jun 18, 2019 · 9f4a7a2
1 parent 42a520c
commit 9f4a7a2
Show file tree

Hide file tree

Showing 6 changed files with 17 additions and 146 deletions.
diff --git a/pom.xml b/pom.xml
@@ -161,11 +161,6 @@
             <version>4.10</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>java-string-similarity</artifactId>
-            <version>0.12</version>
-        </dependency>
     </dependencies>
 </project>
 

diff --git a/src/main/java/info/debatty/java/lsh/LSH.java b/src/main/java/info/debatty/java/lsh/LSH.java
@@ -4,7 +4,7 @@
 
 /**
  * Implementation of Locality Sensitive Hashing (LSH) principle, as described in
- * Leskovec, Rajaraman & Ullman (2014), "Mining of Massive Datasets",
+ * Leskovec, Rajaraman &amp; Ullman (2014), "Mining of Massive Datasets",
  * Cambridge University Press.
  *
  * @author Thibault Debatty http://www.debatty.info

diff --git a/src/main/java/info/debatty/java/lsh/LSHSuperBit.java b/src/main/java/info/debatty/java/lsh/LSHSuperBit.java
@@ -24,8 +24,6 @@
 
 package info.debatty.java.lsh;
 
-import info.debatty.java.utils.SparseDoubleVector;
-import info.debatty.java.utils.SparseIntegerVector;
 import java.io.Serializable;
 
 /**
@@ -43,7 +41,6 @@ public class LSHSuperBit extends LSH implements Serializable {
      *
      * Supported input types:
      * - double[]
-     * - sparseIntegerVector
      * - int[]
      * - others to come...
      *
@@ -70,7 +67,6 @@ public LSHSuperBit(
      *
      * Supported input types:
      * - double[]
-     * - sparseIntegerVector
      * - int[]
      * - others to come...
      *
@@ -139,24 +135,6 @@ public final int[] hash(final double[] vector) {
         return hashSignature(sb.signature(vector));
     }
 
-    /**
-     * Hash (bin) a vector in s stages into b buckets.
-     * @param vector
-     * @return
-     */
-    public final int[] hash(final SparseIntegerVector vector) {
-        return hashSignature(sb.signature(vector));
-    }
-
-    /**
-     * Hash (bin) a vector in s stages into b buckets.
-     * @param vector
-     * @return
-     */
-    public final int[] hash(final SparseDoubleVector vector) {
-        return hashSignature(sb.signature(vector));
-    }
-
     /**
      * Hash (bin) a vector in s stages into b buckets.
      * @param vector

diff --git a/src/main/java/info/debatty/java/lsh/SuperBit.java b/src/main/java/info/debatty/java/lsh/SuperBit.java
@@ -24,8 +24,6 @@
 
 package info.debatty.java.lsh;
 
-import info.debatty.java.utils.SparseDoubleVector;
-import info.debatty.java.utils.SparseIntegerVector;
 import java.io.Serializable;
 import java.util.Random;
 
@@ -40,7 +38,6 @@
  * Advances in Neural Information Processing Systems 25, 2012
  *
  * Supported input types:
- * - SparseIntegerVector
  * - double[]
  * - others to come...
  *
@@ -176,32 +173,6 @@ public SuperBit() {
 
     }
 
-    /**
-     * Compute the signature of this vector.
-     * @param vector
-     * @return
-     */
-    public final boolean[] signature(final SparseIntegerVector vector) {
-        boolean[] sig = new boolean[this.hyperplanes.length];
-        for (int i = 0; i < this.hyperplanes.length; i++) {
-            sig[i] = (vector.dotProduct(this.hyperplanes[i]) >= 0);
-        }
-        return sig;
-    }
-
-    /**
-     * Compute the signature of this vector.
-     * @param vector
-     * @return
-     */
-    public final boolean[] signature(final SparseDoubleVector vector) {
-        boolean[] sig = new boolean[this.hyperplanes.length];
-        for (int i = 0; i < this.hyperplanes.length; i++) {
-            sig[i] = (vector.dotProduct(this.hyperplanes[i]) >= 0);
-        }
-        return sig;
-    }
-
     /**
      * Compute the signature of this vector.
      * @param vector

diff --git a/src/main/java/info/debatty/java/lsh/examples/LSHMinHashExample.java b/src/main/java/info/debatty/java/lsh/examples/LSHMinHashExample.java
@@ -40,40 +40,40 @@ public class LSHMinHashExample {
     public static void main(String[] args) {
         // Number of sets
         int count = 2000;
-        
+
         // Size of dictionary
         int n = 100;
-        
+
         // Number of buckets
         // Attention: to get relevant results, the number of elements per bucket
         // should be at least 100
         int buckets = 10;
-        
+
         // Let's generate some random sets
         boolean[][] vectors = new boolean[count][];
         Random r = new Random();
-        
+
         // To get some interesting measures, we first generate a single
         // sparse random vector
-        vectors[0] = new boolean[n];    
+        vectors[0] = new boolean[n];
         for (int j = 0; j < n; j++) {
             vectors[0][j] = (r.nextInt(10) == 0);
         }
-        
-        // Then we generate the other vectors, which have a reasonable chance 
+
+        // Then we generate the other vectors, which have a reasonable chance
         // to look like the first one...
         for (int i = 1; i < count; i++) {
             vectors[i] = new boolean[n];
-            
+
             for (int j = 0; j < n; j++) {
                 vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0));
             }
         }
-        
+
         // Now we can proceed to LSH binning
         // We will test multiple stages
         for (int stages = 1; stages <= 10; stages++) {
-            
+
             // Compute the LSH hash of each vector
             LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
             int[][] hashes = new int[count][];
@@ -83,7 +83,7 @@ public static void main(String[] args) {
             }
 
             // We now have the LSH hash for each input set
-            // Let's have a look at how similar sets (according to Jaccard 
+            // Let's have a look at how similar sets (according to Jaccard
             // index) were binned...
             int[][] results = new int[11][2];
             for (int i = 0; i < vectors.length; i++) {
@@ -93,15 +93,15 @@ public static void main(String[] args) {
                 for (int j = 0; j < i; j++) {
                     boolean[] vector2 = vectors[j];
                     int[] hash2 = hashes[j];
-                    
+
                     // We compute the similarity between each pair of sets
                     double similarity = MinHash.jaccardIndex(vector1, vector2);
 
-                    // We count the number of pairs with similarity 0.1, 0.2, 
+                    // We count the number of pairs with similarity 0.1, 0.2,
                     // 0.3, etc.
                     results[(int) (10 * similarity)][0]++;
 
-                    // Do they fall in the same bucket for one of the stages?                
+                    // Do they fall in the same bucket for one of the stages?
                     for (int stage = 0; stage < stages; stage++) {
                         if (hash1[stage] == hash2[stage]) {
                             results[(int) (10 * similarity)][1]++;
@@ -116,14 +116,14 @@ public static void main(String[] args) {
             // in the same bucket for at least one of the stages is y
             for (int i = 0; i < results.length; i++) {
                 double similarity = (double) i / 10;
-                
+
                 double probability = 0;
                 if (results[i][0] != 0) {
                     probability = (double) results[i][1] / results[i][0];
                 }
                 System.out.println("" + similarity + "\t" + probability + "\t" + stages);
             }
-            
+
             // Separate the series for Gnuplot...
             System.out.print("\n");
         }

diff --git a/src/main/java/info/debatty/java/lsh/examples/SuperBitSparseExample.java b/src/main/java/info/debatty/java/lsh/examples/SuperBitSparseExample.java