Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BitVectors format and make flat vectors format easier to extend #13288

Merged
Merged
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ New Features
* GITHUB#13197: Expand support for new scalar bit levels for HNSW vectors. This includes 4-bit vectors and an option
to compress them to gain a 50% reduction in memory usage. (Ben Trent)

* GITHUB#13288: Make HNSW and Flat storage vector formats easier to extend with new FlatVectorScorer interface. Add
new Hnsw format for binary quantized vectors. (Ben Trent)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
*
* @lucene.experimental
*/
public class Word2VecModel implements RandomAccessVectorValues<float[]> {
public class Word2VecModel implements RandomAccessVectorValues.Floats {
benwtrent marked this conversation as resolved.
Show resolved Hide resolved

private final int dictionarySize;
private final int vectorDimension;
Expand Down Expand Up @@ -88,7 +88,7 @@ public int size() {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public Word2VecModel copy() throws IOException {
return new Word2VecModel(
this.dictionarySize, this.vectorDimension, this.termsAndVectors, this.word2Vec);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder {
private final Lucene90NeighborArray scratch;

private final VectorSimilarityFunction similarityFunction;
private final RandomAccessVectorValues<float[]> vectorValues;
private final RandomAccessVectorValues.Floats vectorValues;
private final SplittableRandom random;
private final Lucene90BoundsChecker bound;
final Lucene90OnHeapHnswGraph hnsw;
Expand All @@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder {

// we need two sources of vectors in order to perform diversity check comparisons without
// colliding
private final RandomAccessVectorValues<float[]> buildVectors;
private final RandomAccessVectorValues.Floats buildVectors;

/**
* Reads all the vectors from vector values, builds a graph connecting them by their dense
Expand All @@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder {
* to ensure repeatable construction.
*/
public Lucene90HnswGraphBuilder(
RandomAccessVectorValues<float[]> vectors,
RandomAccessVectorValues.Floats vectors,
VectorSimilarityFunction similarityFunction,
int maxConn,
int beamWidth,
Expand Down Expand Up @@ -104,8 +104,7 @@ public Lucene90HnswGraphBuilder(
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
* accessor for the vectors
*/
public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues<float[]> vectors)
throws IOException {
public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
if (vectors == vectorValues) {
throw new IllegalArgumentException(
"Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
Expand Down Expand Up @@ -231,7 +230,7 @@ private boolean diversityCheck(
float[] candidate,
float score,
Lucene90NeighborArray neighbors,
RandomAccessVectorValues<float[]> vectorValues)
RandomAccessVectorValues.Floats vectorValues)
throws IOException {
bound.set(score);
for (int i = 0; i < neighbors.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ int size() {

/** Read the vector values from the index input. This supports both iterated and random access. */
static class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues<float[]> {
implements RandomAccessVectorValues.Floats {

final int dimension;
final int[] ordToDoc;
Expand Down Expand Up @@ -412,7 +412,7 @@ public int advance(int target) {
}

@Override
public RandomAccessVectorValues<float[]> copy() {
public OffHeapFloatVectorValues copy() {
return new OffHeapFloatVectorValues(dimension, ordToDoc, dataIn.clone());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public static NeighborQueue search(
float[] query,
int topK,
int numSeed,
RandomAccessVectorValues<float[]> vectors,
RandomAccessVectorValues.Floats vectors,
VectorSimilarityFunction similarityFunction,
HnswGraph graphValues,
Bits acceptOrds,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ int ordToDoc(int ord) {

/** Read the vector values from the index input. This supports both iterated and random access. */
static class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues<float[]> {
implements RandomAccessVectorValues.Floats {

private final int dimension;
private final int size;
Expand Down Expand Up @@ -454,7 +454,7 @@ public int advance(int target) {
}

@Override
public RandomAccessVectorValues<float[]> copy() {
public OffHeapFloatVectorValues copy() {
return new OffHeapFloatVectorValues(dimension, size, ordToDoc, dataIn.clone());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues<float[]> {
implements RandomAccessVectorValues.Floats {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for this PR, but I would like to try splitting FloatVectorValues and RandomAccessVectorValues.Floats. Having a single hierarchy that mixes the access patterns is not ideal. With the FlatVectorFormat in the mix we should be able to produce RandomAccessVectorValues and FloatVectorValues independently. This change should help this simplification :)


protected final int dimension;
protected final int size;
Expand Down Expand Up @@ -113,7 +113,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public DenseOffHeapVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(dimension, size, slice.clone());
}

Expand Down Expand Up @@ -172,7 +172,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public OffHeapFloatVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone());
}

Expand Down Expand Up @@ -239,7 +239,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public OffHeapFloatVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapByteVectorValues extends ByteVectorValues
implements RandomAccessVectorValues<byte[]> {
implements RandomAccessVectorValues.Bytes {

protected final int dimension;
protected final int size;
Expand Down Expand Up @@ -122,7 +122,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<byte[]> copy() throws IOException {
public OffHeapByteVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(dimension, size, slice.clone(), byteSize);
}

Expand Down Expand Up @@ -184,7 +184,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<byte[]> copy() throws IOException {
public OffHeapByteVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
}

Expand Down Expand Up @@ -251,7 +251,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<byte[]> copy() throws IOException {
public OffHeapByteVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues<float[]> {
implements RandomAccessVectorValues.Floats {

protected final int dimension;
protected final int size;
Expand Down Expand Up @@ -119,7 +119,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public OffHeapFloatVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(dimension, size, slice.clone(), byteSize);
}

Expand Down Expand Up @@ -181,7 +181,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public OffHeapFloatVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
}

Expand Down Expand Up @@ -248,7 +248,7 @@ public int advance(int target) throws IOException {
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
public OffHeapFloatVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.HnswGraphProvider;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.HnswGraphProvider;
import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues;
import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues;
import org.apache.lucene.codecs.lucene95.OrdToDocDISIReaderConfiguration;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ private void writeGraphOffsets(IndexOutput out, long[] offsets) throws IOExcepti

private void writeGraph(
IndexOutput graphData,
RandomAccessVectorValues<float[]> vectorValues,
RandomAccessVectorValues.Floats vectorValues,
VectorSimilarityFunction similarityFunction,
long graphDataOffset,
long[] offsets,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public final class Lucene91HnswGraphBuilder {
private final Lucene91NeighborArray scratch;

private final VectorSimilarityFunction similarityFunction;
private final RandomAccessVectorValues<float[]> vectorValues;
private final RandomAccessVectorValues.Floats vectorValues;
private final SplittableRandom random;
private final Lucene91BoundsChecker bound;
private final HnswGraphSearcher graphSearcher;
Expand All @@ -66,7 +66,7 @@ public final class Lucene91HnswGraphBuilder {

// we need two sources of vectors in order to perform diversity check comparisons without
// colliding
private RandomAccessVectorValues<float[]> buildVectors;
private RandomAccessVectorValues.Floats buildVectors;

/**
* Reads all the vectors from vector values, builds a graph connecting them by their dense
Expand All @@ -81,7 +81,7 @@ public final class Lucene91HnswGraphBuilder {
* to ensure repeatable construction.
*/
public Lucene91HnswGraphBuilder(
RandomAccessVectorValues<float[]> vectors,
RandomAccessVectorValues.Floats vectors,
VectorSimilarityFunction similarityFunction,
int maxConn,
int beamWidth,
Expand Down Expand Up @@ -118,8 +118,7 @@ public Lucene91HnswGraphBuilder(
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
* accessor for the vectors
*/
public Lucene91OnHeapHnswGraph build(RandomAccessVectorValues<float[]> vectors)
throws IOException {
public Lucene91OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
if (vectors == vectorValues) {
throw new IllegalArgumentException(
"Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
Expand Down Expand Up @@ -253,7 +252,7 @@ private boolean diversityCheck(
float[] candidate,
float score,
Lucene91NeighborArray neighbors,
RandomAccessVectorValues<float[]> vectorValues)
RandomAccessVectorValues.Floats vectorValues)
throws IOException {
bound.set(score);
for (int i = 0; i < neighbors.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ private void writeMeta(
}

private Lucene91OnHeapHnswGraph writeGraph(
RandomAccessVectorValues<float[]> vectorValues, VectorSimilarityFunction similarityFunction)
RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction)
throws IOException {

// build graph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ private void writeMeta(
}

private OnHeapHnswGraph writeGraph(
RandomAccessVectorValues<float[]> vectorValues, VectorSimilarityFunction similarityFunction)
RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction)
throws IOException {

// build graph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -656,14 +656,13 @@ public float[] copyValue(float[] value) {
this.dim = fieldInfo.getVectorDimension();
this.docsWithField = new DocsWithFieldSet();
vectors = new ArrayList<>();
RandomAccessVectorValues<T> raVectors = new RAVectorValues<>(vectors, dim);
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> RandomVectorScorerSupplier.createBytes(
(RandomAccessVectorValues<byte[]>) raVectors,
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim),
fieldInfo.getVectorSimilarityFunction());
case FLOAT32 -> RandomVectorScorerSupplier.createFloats(
(RandomAccessVectorValues<float[]>) raVectors,
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim),
fieldInfo.getVectorSimilarityFunction());
};
hnswGraphBuilder =
Expand Down Expand Up @@ -708,34 +707,4 @@ public long ramBytesUsed() {
+ hnswGraphBuilder.getGraph().ramBytesUsed();
}
}

/**
 * A {@link RandomAccessVectorValues} view over a {@link List} of vectors, addressed by list
 * index.
 *
 * @param <T> the element type stored in the backing list
 */
private static class RAVectorValues<T> implements RandomAccessVectorValues<T> {
// Backing list, held by reference (not copied), so size() reflects its current length.
private final List<T> vectors;
// Value reported by dimension(); it is stored as given and not checked against the vectors.
private final int dim;

/**
 * @param vectors the list to read vectors from; the reference is stored as-is
 * @param dim the dimension to report from {@link #dimension()}
 */
RAVectorValues(List<T> vectors, int dim) {
this.vectors = vectors;
this.dim = dim;
}

/** Returns the current number of elements in the backing list. */
@Override
public int size() {
return vectors.size();
}

/** Returns the dimension supplied at construction time. */
@Override
public int dimension() {
return dim;
}

/**
 * Returns the element at index {@code targetOrd} of the backing list.
 *
 * @param targetOrd index into the backing list
 */
@Override
public T vectorValue(int targetOrd) throws IOException {
return vectors.get(targetOrd);
}

/** Returns this same instance; no new object is created. */
@Override
public RAVectorValues<T> copy() throws IOException {
return this;
}
}
}
Loading