Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for default exclusion of $vector and $vectorize #1016

Merged
merged 20 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
28008e5
Add support for default exclusion of $vector
tatu-at-datastax Apr 2, 2024
75e3489
Add _id tests for DocumentProjector (for cases not yet covered)
tatu-at-datastax Apr 2, 2024
2904120
First part of change to support $vector exclusion
tatu-at-datastax Apr 2, 2024
f3f735a
Fix detection of default projection filter
tatu-at-datastax Apr 3, 2024
f0c63f4
Change default Projection to exclude $vector
tatu-at-datastax Apr 3, 2024
1f4701a
Merge branch 'main' into tatu/1005-default-exclude-vector
tatu-at-datastax Apr 3, 2024
797d7df
Fix 2 unit tests wrt new default projector
tatu-at-datastax Apr 3, 2024
b0f091c
Fix more unit tests
tatu-at-datastax Apr 3, 2024
d19a2b1
Improve handling of null docs for update projection
tatu-at-datastax Apr 4, 2024
659715f
Fix some of ITs
tatu-at-datastax Apr 4, 2024
bb52550
Fix more ITs (vectorize ones)
tatu-at-datastax Apr 4, 2024
ac3ce3c
Last IT fixes of first round (still a few fails)
tatu-at-datastax Apr 4, 2024
43b0934
Fix one more IT
tatu-at-datastax Apr 4, 2024
e65cf8b
Use include-all filter for more tests
tatu-at-datastax Apr 4, 2024
9b21f44
Change defaultProjector() to includeAllProjector() for resolvers, to …
tatu-at-datastax Apr 4, 2024
ca20336
Merge branch 'main' into tatu/1005-default-exclude-vector
tatu-at-datastax Apr 5, 2024
c442217
Add handling for "$vectorize" too
tatu-at-datastax Apr 5, 2024
139e4e1
Extend unit tests wrt $vectorize
tatu-at-datastax Apr 5, 2024
5910faa
Fix failing IT (wrt exclusion of $vectorize by default)
tatu-at-datastax Apr 5, 2024
33dbb66
Merge branch 'main' into tatu/1005-default-exclude-vector
tatu-at-datastax Apr 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,9 @@ private Uni<UpdatedDocument> processUpdate(
returnUpdatedDocument ? updatedDocument : originalDocument;
// Some operations (findOneAndUpdate) define projection to apply to
// result:
resultProjection.applyProjection(documentToReturn);
if (documentToReturn != null) { // null for some Operation tests
resultProjection.applyProjection(documentToReturn);
}
}
return new UpdatedDocument(
writableShreddedDocument.id(), upsert, documentToReturn, null);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package io.stargate.sgv2.jsonapi.service.projection;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.stargate.sgv2.jsonapi.config.constants.DocumentConstants;
import io.stargate.sgv2.jsonapi.exception.ErrorCode;
Expand All @@ -23,12 +24,6 @@ public class DocumentProjector {
* No-op projector that does not modify documents. Considered "exclusion" projector since "no
* exclusions" is conceptually what happens ("no inclusions" would drop all content)
*/
private static final DocumentProjector DEFAULT_PROJECTOR =
new DocumentProjector(null, false, false);

private static final DocumentProjector DEFAULT_PROJECTOR_WITH_SIMILARITY =
new DocumentProjector(null, false, true);

private static final DocumentProjector INCLUDE_ALL_PROJECTOR =
new DocumentProjector(null, false, false);

Expand Down Expand Up @@ -57,7 +52,18 @@ private DocumentProjector(
}

public static DocumentProjector defaultProjector() {
return DEFAULT_PROJECTOR;
return DefaultProjectorWrapper.defaultProjector();
}

public static DocumentProjector includeAllProjector() {
return INCLUDE_ALL_PROJECTOR;
}

DocumentProjector withIncludeSimilarity(boolean includeSimilarityScore) {
if (this.includeSimilarityScore == includeSimilarityScore) {
return this;
}
return new DocumentProjector(rootLayer, inclusion, includeSimilarityScore);
}

public static DocumentProjector createFromDefinition(JsonNode projectionDefinition) {
Expand All @@ -69,9 +75,9 @@ public static DocumentProjector createFromDefinition(
// First special case: "simple" default projection
if (projectionDefinition == null || projectionDefinition.isEmpty()) {
if (includeSimilarity) {
return DEFAULT_PROJECTOR_WITH_SIMILARITY;
return DefaultProjectorWrapper.defaultProjectorWithSimilarity();
}
return DEFAULT_PROJECTOR;
return DefaultProjectorWrapper.defaultProjector();
}
if (!projectionDefinition.isObject()) {
throw new JsonApiException(
Expand Down Expand Up @@ -125,6 +131,7 @@ public void applyProjection(JsonNode document) {
}

public void applyProjection(JsonNode document, Float similarityScore) {
Objects.requireNonNull(document, "Document to call 'applyProjection()' on must not be null");
// null -> either include-add or exclude-all; but logic may seem counter-intuitive
if (rootLayer == null) {
if (inclusion) { // exclude-all
Expand Down Expand Up @@ -165,6 +172,34 @@ public int hashCode() {
return rootLayer.hashCode();
}

/**
* Due to the way projection is handled, we need to handle construction of default instance via
* separate class (to avoid cyclic dependency)
*/
static class DefaultProjectorWrapper {
/**
* Default projector that drops $vector but otherwise leaves document as-is. Constructed from
* empty definition (no inclusions/exclusions).
*/
private static final DocumentProjector DEFAULT_PROJECTOR;

static {
ObjectNode emptyDef = new ObjectNode(JsonNodeFactory.instance);
DEFAULT_PROJECTOR = PathCollector.collectPaths(emptyDef, false).buildProjector();
}

private static final DocumentProjector DEFAULT_PROJECTOR_WITH_SIMILARITY =
DEFAULT_PROJECTOR.withIncludeSimilarity(true);

public static DocumentProjector defaultProjector() {
return DEFAULT_PROJECTOR;
}

public static DocumentProjector defaultProjectorWithSimilarity() {
return DEFAULT_PROJECTOR_WITH_SIMILARITY;
}
}

/**
* Helper object used to traverse and collection inclusion/exclusion path definitions and verify
* that there are only one or the other (except for doc id). Does not build data structures for
Expand All @@ -179,6 +214,8 @@ private static class PathCollector {

private Boolean idInclusion = null;

private Boolean $vectorInclusion = null;

/** Whether similarity score is needed. */
private final boolean includeSimilarityScore;

Expand All @@ -191,36 +228,32 @@ static PathCollector collectPaths(JsonNode def, boolean includeSimilarity) {
}

public DocumentProjector buildProjector() {
if (isDefaultProjection()) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Caller will use the short-cut, no need to check whether explicit definition happens to work like default projection.

return defaultProjector();
}

// One more thing: do we need to add document id?
if (inclusions > 0) { // inclusion-based projection
// doc-id included unless explicitly excluded
return new DocumentProjector(
ProjectionLayer.buildLayersNoOverlap(paths, slices, !Boolean.FALSE.equals(idInclusion)),
ProjectionLayer.buildLayersForProjection(
paths,
slices,
// doc-id included unless explicitly excluded
!Boolean.FALSE.equals(idInclusion),
Copy link
Contributor

@Yuqi-Du Yuqi-Du Apr 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting.
So by default, idInclusion will be null, which will give addDocId as

  • true for inclusion-based projection
  • false for exclusion-based projection

So essentially the meaning of addDocId will be flipped, in terms of what projection is used inclusion-based or exclusion-based. The variable name addDocId (similarly add$vector, add$vectorize) gives some confusion to me initially, but it makes sense afterwards.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, naming is hard. Instead of "addDocId" could use "includeDocId" but that would be even more confusing.
If you (or anyone else) has better name suggestion would be happy to rename.

// $vector only included if explicitly included
Boolean.TRUE.equals($vectorInclusion)),
true,
includeSimilarityScore);
} else { // exclusion-based
// doc-id excluded only if explicitly excluded
return new DocumentProjector(
ProjectionLayer.buildLayersNoOverlap(paths, slices, Boolean.FALSE.equals(idInclusion)),
ProjectionLayer.buildLayersForProjection(
paths,
slices,
// doc-id excluded only if explicitly excluded
Boolean.FALSE.equals(idInclusion),
// $vector excluded unless explicitly included
!Boolean.TRUE.equals($vectorInclusion)),
false,
includeSimilarityScore);
}
}

/**
* Accessor to use for checking if collected paths indicate "empty" (no-operation) projection:
* if so, caller can avoid actual construction or evaluation.
*/
boolean isDefaultProjection() {
// Only the case if we have no non-doc-id inclusions/exclusions AND
// doc-id is included (by default or explicitly)
return paths.isEmpty() && slices.isEmpty() && !Boolean.FALSE.equals(idInclusion);
}

PathCollector collectFromObject(JsonNode ob, String parentPath) {
var it = ob.fields();
while (it.hasNext()) {
Expand Down Expand Up @@ -338,6 +371,8 @@ private void addSlice(String path, JsonNode sliceDef) {
private void addExclusion(String path) {
if (DocumentConstants.Fields.DOC_ID.equals(path)) {
idInclusion = false;
} else if (DocumentConstants.Fields.VECTOR_EMBEDDING_FIELD.equals(path)) {
$vectorInclusion = false;
} else {
// Must not mix exclusions and inclusions
if (inclusions > 0) {
Expand All @@ -356,6 +391,8 @@ private void addExclusion(String path) {
private void addInclusion(String path) {
if (DocumentConstants.Fields.DOC_ID.equals(path)) {
idInclusion = true;
} else if (DocumentConstants.Fields.VECTOR_EMBEDDING_FIELD.equals(path)) {
$vectorInclusion = true;
} else {
// Must not mix exclusions and inclusions
if (exclusions > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public static IndexingProjector createForIndexing(Set<String> allowed, Set<Strin
if (!allowed.contains(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD)) {
allowed.add(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD);
}
return new IndexingProjector(ProjectionLayer.buildLayersOverlapOk(allowed), true, false);
return new IndexingProjector(ProjectionLayer.buildLayersForIndexing(allowed), true, false);
}
if (denied != null && !denied.isEmpty()) {
// (special) Case 4:
Expand All @@ -71,10 +71,10 @@ public static IndexingProjector createForIndexing(Set<String> allowed, Set<Strin
overrideFields.add(DocumentConstants.Fields.VECTOR_EMBEDDING_FIELD);
overrideFields.add(DocumentConstants.Fields.VECTOR_EMBEDDING_TEXT_FIELD);
return new IndexingProjector(
ProjectionLayer.buildLayersOverlapOk(overrideFields), true, true);
ProjectionLayer.buildLayersForIndexing(overrideFields), true, true);
}
// Case 2: exclusion-based projection
return new IndexingProjector(ProjectionLayer.buildLayersOverlapOk(denied), false, false);
return new IndexingProjector(ProjectionLayer.buildLayersForIndexing(denied), false, false);
}
// Case 3: include-all (identity) projection
return identityProjector();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,21 @@ class ProjectionLayer {
nextLayers = null;
}

public static ProjectionLayer buildLayersNoOverlap(
Collection<String> dotPaths, List<SliceDef> slices, boolean addDocId) {
return buildLayers(dotPaths, slices, addDocId, true);
public static ProjectionLayer buildLayersForProjection(
Collection<String> dotPaths, List<SliceDef> slices, boolean addDocId, boolean add$vector) {
return buildLayers(dotPaths, slices, true, addDocId, add$vector);
}

public static ProjectionLayer buildLayersOverlapOk(Collection<String> dotPaths) {
return buildLayers(dotPaths, Collections.emptyList(), false, false);
public static ProjectionLayer buildLayersForIndexing(Collection<String> dotPaths) {
return buildLayers(dotPaths, Collections.emptyList(), false, false, false);
}

private static ProjectionLayer buildLayers(
Collection<String> dotPaths, List<SliceDef> slices, boolean addDocId, boolean failOnOverlap) {
Collection<String> dotPaths,
List<SliceDef> slices,
boolean failOnOverlap,
boolean addDocId,
boolean add$vector) {
// Root is always branch (not terminal):
ProjectionLayer root = new ProjectionLayer("", false);
for (String fullPath : dotPaths) {
Expand All @@ -83,6 +87,13 @@ private static ProjectionLayer buildLayers(
root,
new String[] {DocumentConstants.Fields.DOC_ID});
}
if (add$vector) {
buildPath(
failOnOverlap,
DocumentConstants.Fields.VECTOR_EMBEDDING_FIELD,
root,
new String[] {DocumentConstants.Fields.VECTOR_EMBEDDING_FIELD});
}
return root;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ private FindOperation getFindOperation(CommandContext commandContext, DeleteMany
return FindOperation.unsorted(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
null,
operationsConfig.maxDocumentDeleteCount() + 1,
operationsConfig.defaultPageSize(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ private FindOperation getFindOperation(CommandContext commandContext, DeleteOneC
return FindOperation.vsearchSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.KEY,
objectMapper,
vector);
Expand All @@ -74,7 +74,7 @@ private FindOperation getFindOperation(CommandContext commandContext, DeleteOneC
return FindOperation.sortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
// For in memory sorting we read more data than needed, so defaultSortPageSize like 100
operationsConfig.defaultSortPageSize(),
ReadType.SORTED_DOCUMENT,
Expand All @@ -88,7 +88,7 @@ private FindOperation getFindOperation(CommandContext commandContext, DeleteOneC
return FindOperation.unsortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.KEY,
objectMapper);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ private FindOperation getFindOperation(
return FindOperation.vsearchSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper,
vector);
Expand All @@ -78,7 +78,7 @@ private FindOperation getFindOperation(
return FindOperation.sortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
// For in memory sorting we read more data than needed, so defaultSortPageSize like 100
operationsConfig.defaultSortPageSize(),
ReadType.SORTED_DOCUMENT,
Expand All @@ -92,7 +92,7 @@ private FindOperation getFindOperation(
return FindOperation.unsortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ private FindOperation getFindOperation(
return FindOperation.vsearchSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper,
vector);
Expand All @@ -96,7 +96,7 @@ private FindOperation getFindOperation(
return FindOperation.sortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
// For in memory sorting we read more data than needed, so defaultSortPageSize like 100
operationsConfig.defaultSortPageSize(),
ReadType.SORTED_DOCUMENT,
Expand All @@ -110,7 +110,7 @@ private FindOperation getFindOperation(
return FindOperation.unsortedSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ private FindOperation getFindOperation(
return FindOperation.vsearchSingle(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper,
vector);
Expand All @@ -99,8 +99,8 @@ private FindOperation getFindOperation(
commandContext,
logicalExpression,
// 24-Mar-2023, tatu: Since we update the document, need to avoid modifications on
// read path, hence pass identity projector.
DocumentProjector.defaultProjector(),
// read path:
DocumentProjector.includeAllProjector(),
// For in memory sorting we read more data than needed, so defaultSortPageSize like 100
operationsConfig.defaultSortPageSize(),
ReadType.SORTED_DOCUMENT,
Expand All @@ -115,8 +115,8 @@ private FindOperation getFindOperation(
commandContext,
logicalExpression,
// 24-Mar-2023, tatu: Since we update the document, need to avoid modifications on
// read path, hence pass identity projector.
DocumentProjector.defaultProjector(),
// read path:
DocumentProjector.includeAllProjector(),
ReadType.DOCUMENT,
objectMapper);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public Operation resolveCommand(CommandContext commandContext, UpdateManyCommand
false,
upsert,
shredder,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
operationsConfig.maxDocumentUpdateCount(),
operationsConfig.lwt().retries());
}
Expand All @@ -68,7 +68,7 @@ private FindOperation getFindOperation(CommandContext commandContext, UpdateMany
return FindOperation.unsorted(
commandContext,
logicalExpression,
DocumentProjector.defaultProjector(),
DocumentProjector.includeAllProjector(),
null != command.options() ? command.options().pageState() : null,
Integer.MAX_VALUE,
operationsConfig.defaultPageSize(),
Expand Down
Loading
Loading