Skip to content

Commit

Permalink
Remove support for sparse vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
stevie400 committed Jun 18, 2019
1 parent 42a520c commit 9f4a7a2
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 146 deletions.
5 changes: 0 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,6 @@
<version>4.10</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>java-string-similarity</artifactId>
<version>0.12</version>
</dependency>
</dependencies>
</project>

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/info/debatty/java/lsh/LSH.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

/**
* Implementation of Locality Sensitive Hashing (LSH) principle, as described in
* Leskovec, Rajaraman & Ullman (2014), "Mining of Massive Datasets",
* Leskovec, Rajaraman &amp; Ullman (2014), "Mining of Massive Datasets",
* Cambridge University Press.
*
* @author Thibault Debatty http://www.debatty.info
Expand Down
22 changes: 0 additions & 22 deletions src/main/java/info/debatty/java/lsh/LSHSuperBit.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

package info.debatty.java.lsh;

import info.debatty.java.utils.SparseDoubleVector;
import info.debatty.java.utils.SparseIntegerVector;
import java.io.Serializable;

/**
Expand All @@ -43,7 +41,6 @@ public class LSHSuperBit extends LSH implements Serializable {
*
* Supported input types:
* - double[]
* - sparseIntegerVector
* - int[]
* - others to come...
*
Expand All @@ -70,7 +67,6 @@ public LSHSuperBit(
*
* Supported input types:
* - double[]
* - sparseIntegerVector
* - int[]
* - others to come...
*
Expand Down Expand Up @@ -139,24 +135,6 @@ public final int[] hash(final double[] vector) {
return hashSignature(sb.signature(vector));
}

/**
* Hash (bin) a vector in s stages into b buckets.
* @param vector
* @return
*/
public final int[] hash(final SparseIntegerVector vector) {
return hashSignature(sb.signature(vector));
}

/**
* Hash (bin) a vector in s stages into b buckets.
* @param vector
* @return
*/
public final int[] hash(final SparseDoubleVector vector) {
return hashSignature(sb.signature(vector));
}

/**
* Hash (bin) a vector in s stages into b buckets.
* @param vector
Expand Down
29 changes: 0 additions & 29 deletions src/main/java/info/debatty/java/lsh/SuperBit.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

package info.debatty.java.lsh;

import info.debatty.java.utils.SparseDoubleVector;
import info.debatty.java.utils.SparseIntegerVector;
import java.io.Serializable;
import java.util.Random;

Expand All @@ -40,7 +38,6 @@
* Advances in Neural Information Processing Systems 25, 2012
*
* Supported input types:
* - SparseIntegerVector
* - double[]
* - others to come...
*
Expand Down Expand Up @@ -176,32 +173,6 @@ public SuperBit() {

}

/**
* Compute the signature of this vector.
* @param vector
* @return
*/
public final boolean[] signature(final SparseIntegerVector vector) {
boolean[] sig = new boolean[this.hyperplanes.length];
for (int i = 0; i < this.hyperplanes.length; i++) {
sig[i] = (vector.dotProduct(this.hyperplanes[i]) >= 0);
}
return sig;
}

/**
* Compute the signature of this vector.
* @param vector
* @return
*/
public final boolean[] signature(final SparseDoubleVector vector) {
boolean[] sig = new boolean[this.hyperplanes.length];
for (int i = 0; i < this.hyperplanes.length; i++) {
sig[i] = (vector.dotProduct(this.hyperplanes[i]) >= 0);
}
return sig;
}

/**
* Compute the signature of this vector.
* @param vector
Expand Down
32 changes: 16 additions & 16 deletions src/main/java/info/debatty/java/lsh/examples/LSHMinHashExample.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,40 +40,40 @@ public class LSHMinHashExample {
public static void main(String[] args) {
// Number of sets
int count = 2000;

// Size of dictionary
int n = 100;

// Number of buckets
// Attention: to get relevant results, the number of elements per bucket
// should be at least 100
int buckets = 10;

// Let's generate some random sets
boolean[][] vectors = new boolean[count][];
Random r = new Random();

// To get some interesting measures, we first generate a single
// sparse random vector
vectors[0] = new boolean[n];
vectors[0] = new boolean[n];
for (int j = 0; j < n; j++) {
vectors[0][j] = (r.nextInt(10) == 0);
}
// Then we generate the other vectors, which have a reasonable chance

// Then we generate the other vectors, which have a reasonable chance
// to look like the first one...
for (int i = 1; i < count; i++) {
vectors[i] = new boolean[n];

for (int j = 0; j < n; j++) {
vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0));
}
}

// Now we can proceed to LSH binning
// We will test multiple stages
for (int stages = 1; stages <= 10; stages++) {

// Compute the LSH hash of each vector
LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
int[][] hashes = new int[count][];
Expand All @@ -83,7 +83,7 @@ public static void main(String[] args) {
}

// We now have the LSH hash for each input set
// Let's have a look at how similar sets (according to Jaccard
// Let's have a look at how similar sets (according to Jaccard
// index) were binned...
int[][] results = new int[11][2];
for (int i = 0; i < vectors.length; i++) {
Expand All @@ -93,15 +93,15 @@ public static void main(String[] args) {
for (int j = 0; j < i; j++) {
boolean[] vector2 = vectors[j];
int[] hash2 = hashes[j];

// We compute the similarity between each pair of sets
double similarity = MinHash.jaccardIndex(vector1, vector2);

// We count the number of pairs with similarity 0.1, 0.2,
// We count the number of pairs with similarity 0.1, 0.2,
// 0.3, etc.
results[(int) (10 * similarity)][0]++;

// Do they fall in the same bucket for one of the stages?
// Do they fall in the same bucket for one of the stages?
for (int stage = 0; stage < stages; stage++) {
if (hash1[stage] == hash2[stage]) {
results[(int) (10 * similarity)][1]++;
Expand All @@ -116,14 +116,14 @@ public static void main(String[] args) {
// in the same bucket for at least one of the stages is y
for (int i = 0; i < results.length; i++) {
double similarity = (double) i / 10;

double probability = 0;
if (results[i][0] != 0) {
probability = (double) results[i][1] / results[i][0];
}
System.out.println("" + similarity + "\t" + probability + "\t" + stages);
}

// Separate the series for Gnuplot...
System.out.print("\n");
}
Expand Down

This file was deleted.

0 comments on commit 9f4a7a2

Please sign in to comment.