Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vectorization on demand #1258

Merged
merged 37 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
4581baa
init
Yuqi-Du Jun 14, 2024
77415ad
fix
Yuqi-Du Jun 14, 2024
5fecd64
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jun 17, 2024
8a99d28
fix
Yuqi-Du Jun 17, 2024
3e296e8
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jun 26, 2024
94ded0e
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 8, 2024
7d27b2a
added logic to check if there is $vectorize text diff
Yuqi-Du Jul 8, 2024
e961200
delete two vectorize update unit tests
Yuqi-Du Jul 8, 2024
3e681ba
Merge branch 'main' into fix-update-vectorize
Yuqi-Du Jul 8, 2024
9a5aafc
Merge branch 'main' into fix-update-vectorize
Yuqi-Du Jul 8, 2024
3f332bc
Merge branch 'main' into fix-update-vectorize
Yuqi-Du Jul 9, 2024
a35e328
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 9, 2024
5073fc2
refactor
Yuqi-Du Jul 10, 2024
f50d64a
Merge remote-tracking branch 'origin/fix-update-vectorize' into fix-u…
Yuqi-Du Jul 10, 2024
1cb614d
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 10, 2024
b1b5ad7
fix
Yuqi-Du Jul 10, 2024
95ed7d9
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 10, 2024
62ce663
fix
Yuqi-Du Jul 10, 2024
4f1d47c
add unset test
Yuqi-Du Jul 10, 2024
6887ac3
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 11, 2024
930dc2e
merge from main
Yuqi-Du Jul 11, 2024
a65880c
fix IT
Yuqi-Du Jul 11, 2024
fa810c1
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 11, 2024
6710a6b
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 16, 2024
d51ff20
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 16, 2024
2e3e7f6
refactor
Yuqi-Du Jul 16, 2024
f59ed44
Merge branch 'main' into fix-update-vectorize
Yuqi-Du Jul 16, 2024
fc5b0fc
fix
Yuqi-Du Jul 16, 2024
55f1c25
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 22, 2024
7d3e0df
fix comments
Yuqi-Du Jul 22, 2024
2eec55b
fix comments
Yuqi-Du Jul 22, 2024
f2f6a21
fix IT
Yuqi-Du Jul 22, 2024
2b744b4
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 23, 2024
577703f
merged from tables, fix conflicts
Yuqi-Du Jul 23, 2024
3e4fe40
Merge branch 'main' into fix-update-vectorize
Yuqi-Du Jul 24, 2024
247742c
Merge remote-tracking branch 'origin/main' into fix-update-vectorize
Yuqi-Du Jul 26, 2024
ca9d04f
fix comments
Yuqi-Du Jul 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added logic to check if there is $vectorize text diff
  • Loading branch information
Yuqi-Du committed Jul 8, 2024
commit 7d27b2a3de0cd8df6c73b63a96eee51c0ce31ff1
Original file line number Diff line number Diff line change
Expand Up @@ -68,30 +68,39 @@ public Uni<Supplier<CommandResult>> execute(
.getDocuments(dataApiRequestInfo, queryExecutor, findOperation().pageState(), null);
return docsToUpdate
.onItem()
.transformToMulti(
.transformToUni(
findResponse -> {
pageStateReference.set(findResponse.pageState());
final List<ReadDocument> docs = findResponse.docs();

// vectorize the updateClause or ReplacementDocument as needed
Uni<Boolean> vectorization = Uni.createFrom().item(false);
final DataVectorizer dataVectorizer =
dataVectorizerService.constructDataVectorizer(dataApiRequestInfo, commandContext);

// 1. UpdateCommand(updateOne, findOneAndUpdate, updateMany):
if (documentUpdater.updateType() == DocumentUpdater.UpdateType.UPDATE) {
// if there are documents found, vectorize the updateOperation
if (docs.size() != 0) {
documentUpdater.vectorizeUpdateClause(dataVectorizer);
// if there are documents found, and there is $vectorize text diff
if (docs.size() != 0 && documentUpdater.hasVectorizeDiff(docs)) {
vectorization = documentUpdater.vectorizeUpdateClause(dataVectorizer);
// if there is no document found, but upsert mode, vectorize the updateOperation
} else if (upsert() && matchedCount.get() == 0) {
documentUpdater.vectorizeUpdateClause(dataVectorizer);
vectorization = documentUpdater.vectorizeUpdateClause(dataVectorizer);
}
// 2.replaceCommand(findOneAndReplace)
} else if (documentUpdater.updateType() == DocumentUpdater.UpdateType.REPLACE) {
// if there is a document found, vectorize it first in documentUpdater
if (docs.size() != 0) {
documentUpdater.vectorizeTheReplacementDocument(dataVectorizer);
// if there is a document found and there is $vectorize text diff
if (docs.size() != 0 && documentUpdater.hasVectorizeDiff(docs)) {
vectorization = documentUpdater.vectorizeTheReplacementDocument(dataVectorizer);
}
}

return vectorization
.onItem()
.transformToUni(vectorized -> Uni.createFrom().item(docs));
})
.onItem()
.transformToMulti(
docs -> {
if (upsert() && docs.size() == 0 && matchedCount.get() == 0) {
return Multi.createFrom().item(findOperation().getNewDocument());
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.smallrye.mutiny.infrastructure.Infrastructure;
import io.smallrye.mutiny.Uni;
import io.stargate.sgv2.jsonapi.api.model.command.clause.update.*;
import io.stargate.sgv2.jsonapi.config.constants.DocumentConstants;
import io.stargate.sgv2.jsonapi.exception.ErrorCode;
import io.stargate.sgv2.jsonapi.exception.JsonApiException;
import io.stargate.sgv2.jsonapi.service.embedding.DataVectorizer;
import io.stargate.sgv2.jsonapi.service.operation.model.impl.ReadDocument;
import io.stargate.sgv2.jsonapi.util.JsonUtil;
import java.util.List;
import java.util.concurrent.ExecutionException;

/**
* Updates the document read from the database with the updates came as part of the request.
Expand Down Expand Up @@ -93,21 +93,8 @@ private boolean update(ObjectNode docToUpdate, boolean docInserted) {
*
* @param dataVectorizer
*/
public void vectorizeUpdateClause(DataVectorizer dataVectorizer) {
try {
dataVectorizer
.vectorizeUpdateClause(updateClause)
.runSubscriptionOn(Infrastructure.getDefaultWorkerPool())
.subscribeAsCompletionStage()
.get();
} catch (Exception e) {
if (e instanceof ExecutionException exception) {
if (exception.getCause() instanceof JsonApiException jsonApiException) {
throw jsonApiException;
}
}
throw new RuntimeException(e);
}
public Uni<Boolean> vectorizeUpdateClause(DataVectorizer dataVectorizer) {
return dataVectorizer.vectorizeUpdateClause(updateClause);
}

/**
Expand Down Expand Up @@ -151,21 +138,54 @@ private boolean replace(ObjectNode docToUpdate, boolean docInserted) {
*
* @param dataVectorizer
*/
public void vectorizeTheReplacementDocument(DataVectorizer dataVectorizer) {
try {
dataVectorizer
.vectorize(List.of(replaceDocument))
.runSubscriptionOn(Infrastructure.getDefaultWorkerPool())
.subscribeAsCompletionStage()
.get();
} catch (Exception e) {
if (e instanceof ExecutionException exception) {
if (exception.getCause() instanceof JsonApiException jsonApiException) {
throw jsonApiException;
public Uni<Boolean> vectorizeTheReplacementDocument(DataVectorizer dataVectorizer) {
return dataVectorizer.vectorize(List.of(replaceDocument));
}

/**
* Check if there is any $vectorize diff If there are docs found to update or doc to replace, then
* this is a necessary condition to proceed vectorization
*
* @param foundDocs
*/
public boolean hasVectorizeDiff(List<ReadDocument> foundDocs) {
String vectorizeTextUpdate = null;
if (updateType().equals(DocumentUpdater.UpdateType.UPDATE)) {
// extract $vectorize if updateClause set operator has it
final ObjectNode setNode = updateClause.updateOperationDefs().get(UpdateOperator.SET);
if (setNode != null) {
final JsonNode updateClauseVectorizeTextJsonNode =
setNode.get(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD);
if (updateClauseVectorizeTextJsonNode != null) {
vectorizeTextUpdate = updateClauseVectorizeTextJsonNode.asText();
}
}
} else if (updateType().equals(DocumentUpdater.UpdateType.REPLACE)) {
// extract $vectorize if replaceDocument has it
final JsonNode replaceDocumentVectorizeTextJsonNode =
replaceDocument.get(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD);
if (replaceDocumentVectorizeTextJsonNode != null) {
vectorizeTextUpdate = replaceDocumentVectorizeTextJsonNode.asText();
}
}

// if there is no $vectorize to update or replace, then no diff.
if (vectorizeTextUpdate == null) {
return false;
}

// iterate foundDocs, see if there is any diff for $vectorize
for (ReadDocument foundDoc : foundDocs) {
final JsonNode foundDocVectorizeTextJsonNode =
foundDoc.document().get(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD);
if (foundDocVectorizeTextJsonNode != null) {
if (!foundDocVectorizeTextJsonNode.asText().equals(vectorizeTextUpdate)) {
// There is a diff
return true;
}
}
throw new RuntimeException(e);
}
return false;
}

public record DocumentUpdaterResponse(JsonNode document, boolean modified) {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@
import io.quarkus.test.junit.TestProfile;
import io.stargate.sgv2.jsonapi.api.model.command.clause.update.UpdateClause;
import io.stargate.sgv2.jsonapi.api.model.command.clause.update.UpdateOperator;
import io.stargate.sgv2.jsonapi.config.constants.DocumentConstants;
import io.stargate.sgv2.jsonapi.exception.ErrorCode;
import io.stargate.sgv2.jsonapi.exception.JsonApiException;
import io.stargate.sgv2.jsonapi.service.operation.model.impl.ReadDocument;
import io.stargate.sgv2.jsonapi.service.shredding.model.DocumentId;
import io.stargate.sgv2.jsonapi.service.testutil.DocumentUpdaterUtils;
import io.stargate.sgv2.jsonapi.testresource.NoGlobalResourcesTestProfile;
import jakarta.inject.Inject;
import java.util.List;
import java.util.UUID;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -534,4 +539,100 @@ public void replaceEmpty() throws Exception {
});
}
}

@Nested
class VectorizeUpdateTest {

@Test
public void updateOne_onlyVectorizeWithDiff_noDiff() throws Exception {
String updateVectorizeData =
"""
{"$vectorize" : "Beijing City"}
""";
DocumentUpdater documentUpdater =
DocumentUpdater.construct(
DocumentUpdaterUtils.updateClause(
UpdateOperator.SET, (ObjectNode) objectMapper.readTree(updateVectorizeData)));

final ReadDocument readDocument =
ReadDocument.from(
DocumentId.fromString("key1"),
UUID.randomUUID(),
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Beijing City"));

assertThat(documentUpdater.hasVectorizeDiff(List.of(readDocument))).isFalse();
}

@Test
public void updateOne_onlyVectorizeWithDiff() throws Exception {
String updateVectorizeData =
"""
{"$vectorize" : "Beijing City"}
""";
DocumentUpdater documentUpdater =
DocumentUpdater.construct(
DocumentUpdaterUtils.updateClause(
UpdateOperator.SET, (ObjectNode) objectMapper.readTree(updateVectorizeData)));

final ReadDocument readDocument =
ReadDocument.from(
DocumentId.fromString("key1"),
UUID.randomUUID(),
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Shanghai City"));

assertThat(documentUpdater.hasVectorizeDiff(List.of(readDocument))).isTrue();
}

@Test
public void updateMany_onlyVectorizeWithDiff() throws Exception {
String updateVectorizeData =
"""
{"$vectorize" : "Beijing City"}
""";
DocumentUpdater documentUpdater =
DocumentUpdater.construct(
DocumentUpdaterUtils.updateClause(
UpdateOperator.SET, (ObjectNode) objectMapper.readTree(updateVectorizeData)));

final ReadDocument readDocument1 =
ReadDocument.from(
DocumentId.fromString("key1"),
UUID.randomUUID(),
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Shanghai City"));
final ReadDocument readDocument2 =
ReadDocument.from(
DocumentId.fromString("key2"),
UUID.randomUUID(),
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Beijing City"));

assertThat(documentUpdater.hasVectorizeDiff(List.of(readDocument1, readDocument2))).isTrue();
}

@Test
public void findOneAndReplace_onlyVectorizeWithDiff() throws Exception {
ObjectNode replaceNode =
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Beijing City");
DocumentUpdater documentUpdater = DocumentUpdater.construct(replaceNode);

final ReadDocument readDocument =
ReadDocument.from(
DocumentId.fromString("key1"),
UUID.randomUUID(),
objectMapper
.createObjectNode()
.put(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD, "Shanghai City"));

assertThat(documentUpdater.hasVectorizeDiff(List.of(readDocument))).isTrue();
}
}
}
Loading