Skip to content

Commit

Permalink
Make embeddings optional in Lucene (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
serras authored May 24, 2023
1 parent bd7e3bc commit eb5bf4f
Showing 1 changed file with 14 additions and 11 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package com.xebia.functional.xef.vectorstores

import arrow.fx.coroutines.Resource
import arrow.fx.coroutines.ResourceScope
import arrow.fx.coroutines.autoCloseable
import com.xebia.functional.xef.embeddings.Embedding
import com.xebia.functional.xef.embeddings.Embeddings
Expand All @@ -22,13 +22,13 @@ import org.apache.lucene.store.MMapDirectory
open class Lucene(
private val writer: IndexWriter,
private val searcher: IndexSearcher,
private val embeddings: Embeddings,
private val embeddings: Embeddings?,
private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : VectorStore, AutoCloseable {

/**
 * Creates a store whose [IndexSearcher] is derived from [writer].
 *
 * The reader is opened once, here, directly on the writer.
 * NOTE(review): a `DirectoryReader` is a point-in-time view, so documents added
 * after construction may not be visible to this searcher — confirm against callers.
 *
 * @param writer index writer used both for adding documents and for opening the reader.
 * @param embeddings optional embeddings provider; `null` disables vector indexing.
 * @param similarity similarity function used for the vector field.
 */
constructor(
    writer: IndexWriter,
    embeddings: Embeddings?,
    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity)

Expand All @@ -37,20 +37,22 @@ open class Lucene(

/**
 * Indexes every entry of [texts] as its own document.
 *
 * Each document always carries the raw text in a stored `contents` field. When an
 * embeddings provider is configured, the text is also embedded and the resulting
 * vector is stored in the `embedding` field; otherwise the vector field is omitted.
 */
override suspend fun addTexts(texts: List<String>) =
    texts.forEach { text ->
        // `embeddings` is optional: a null provider yields a null embedding.
        val embedding = embeddings?.embedQuery(text, requestConfig)
        val doc =
            Document().apply {
                add(TextField("contents", text, Field.Store.YES))
                if (embedding != null) {
                    add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity))
                }
            }
        writer.addDocument(doc)
    }

/**
 * Text-based lookup: builds a [FuzzyQuery] over the `contents` field from [query]
 * and delegates to `search`, passing [limit] through unchanged.
 */
override suspend fun similaritySearch(query: String, limit: Int): List<String> {
    val contentsTerm = Term("contents", query)
    val fuzzyQuery = FuzzyQuery(contentsTerm)
    return search(fuzzyQuery, limit)
}

/**
 * Vector-based lookup: runs a [KnnFloatVectorQuery] against the `embedding` field.
 *
 * @param embedding query vector; its `data` is converted to a `FloatArray`.
 * @param limit used both as the k of the KNN query and as the limit passed to `search`.
 * @throws IllegalArgumentException when this store was created without an embeddings
 *   provider, in which case no `embedding` field was ever written.
 */
override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> {
    requireNotNull(embeddings) { "no embeddings were computed for this model" }
    return search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit)
}

private fun search(q: Query, limit: Int): List<String> =
searcher.search(q, limit).scoreDocs.map {
Expand All @@ -65,7 +67,7 @@ open class Lucene(
class DirectoryLucene(
private val directory: Directory,
writerConfig: IndexWriterConfig = IndexWriterConfig(),
embeddings: Embeddings,
embeddings: Embeddings?,
similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) {
override fun close() {
Expand All @@ -77,16 +79,17 @@ class DirectoryLucene(
/**
 * Factory for a [DirectoryLucene] backed by an [MMapDirectory] opened at [path].
 *
 * @param path location handed to [MMapDirectory].
 * @param writerConfig configuration for the underlying index writer.
 * @param embeddings optional embeddings provider; pass `null` to index text only.
 * @param similarity similarity function used for the vector field.
 */
fun InMemoryLucene(
    path: Path,
    writerConfig: IndexWriterConfig = IndexWriterConfig(),
    embeddings: Embeddings?,
    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
): DirectoryLucene = DirectoryLucene(MMapDirectory(path), writerConfig, embeddings, similarity)

/**
 * Returns a builder that, given an [Embeddings] provider, creates an [InMemoryLucene]
 * store registered with the enclosing [ResourceScope] via `autoCloseable`.
 *
 * @param path location of the index, forwarded to [InMemoryLucene].
 * @param useAIEmbeddings when `false`, the provider passed to the builder is discarded
 *   and the store is created without embeddings.
 * @param writerConfig configuration for the underlying index writer.
 * @param similarity similarity function used for the vector field.
 */
fun InMemoryLuceneBuilder(
    path: Path,
    useAIEmbeddings: Boolean = true,
    writerConfig: IndexWriterConfig = IndexWriterConfig(),
    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
): suspend ResourceScope.(Embeddings) -> DirectoryLucene = { embeddings ->
    autoCloseable {
        // takeIf turns the provider into null when AI embeddings are switched off.
        InMemoryLucene(path, writerConfig, embeddings.takeIf { useAIEmbeddings }, similarity)
    }
}

/**
 * Concatenates the `data` of every [Embedding] in this list, in list order,
 * into a single [FloatArray].
 */
fun List<Embedding>.toFloatArray(): FloatArray {
    val flattened = flatMap { embedding -> embedding.data }
    return flattened.toFloatArray()
}

0 comments on commit eb5bf4f

Please sign in to comment.