Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add no-index support for Shredder (part 1) #786

Merged
merged 14 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,29 @@
import io.stargate.sgv2.jsonapi.service.cqldriver.executor.CollectionSettings;
import io.stargate.sgv2.jsonapi.service.embedding.DataVectorizer;
import io.stargate.sgv2.jsonapi.service.embedding.operation.EmbeddingService;
import io.stargate.sgv2.jsonapi.service.projection.DocumentProjector;
import java.util.List;

/**
* Defines the context in which to execute the command.
*
* @param namespace The name of the namespace.
* @param collection The name of the collection.
* @param isVectorEnabled Whether the vector is enabled for the collection
* @param similarityFunction The similarity function used for indexing the vector
* @param collectionSettings Settings for the collection, if Collection-specific command; if not,
* "empty" Settings {see CollectionSettings#empty()}.
*/
public record CommandContext(
String namespace,
String collection,
boolean isVectorEnabled,
CollectionSettings.SimilarityFunction similarityFunction,
CollectionSettings collectionSettings,
EmbeddingService embeddingService) {

public CommandContext(String namespace, String collection) {
this(namespace, collection, false, null, null);
this(namespace, collection, CollectionSettings.empty(), null);
}

private static final CommandContext EMPTY = new CommandContext(null, null, false, null, null);
private static final CommandContext EMPTY =
new CommandContext(null, null, CollectionSettings.empty(), null);

/**
* @return Returns empty command context, having both {@link #namespace} and {@link #collection}
Expand All @@ -38,6 +39,18 @@ public static CommandContext empty() {
return EMPTY;
}

public CollectionSettings.SimilarityFunction similarityFunction() {
return collectionSettings.similarityFunction();
}

public boolean isVectorEnabled() {
return collectionSettings.vectorEnabled();
}

public DocumentProjector indexingProjector() {
return collectionSettings.indexingProjector();
}

public void tryVectorize(JsonNodeFactory nodeFactory, List<JsonNode> documents) {
new DataVectorizer(embeddingService(), nodeFactory).vectorize(documents);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,12 +179,7 @@ else if (error instanceof JsonApiException jsonApiException) {
}

CommandContext commandContext =
new CommandContext(
namespace,
collection,
collectionProperty.vectorEnabled(),
collectionProperty.similarityFunction(),
embeddingService);
new CommandContext(namespace, collection, collectionProperty, embeddingService);

// call processor
return meteredCommandProcessor.processCommand(commandContext, command);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Suppliers;
import io.stargate.sgv2.jsonapi.config.constants.DocumentConstants;
import io.stargate.sgv2.jsonapi.exception.ErrorCode;
import io.stargate.sgv2.jsonapi.exception.JsonApiException;
import io.stargate.sgv2.jsonapi.service.projection.DocumentProjector;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;

/**
* Refactored as seperate class that represent a collection property.
Expand All @@ -37,7 +40,35 @@ public record CollectionSettings(
String modelName,
IndexingConfig indexingConfig) {

public record IndexingConfig(Set<String> allowed, Set<String> denied) {
private static final CollectionSettings EMPTY =
new CollectionSettings("", false, 0, null, null, null, null);

public static CollectionSettings empty() {
return EMPTY;
}

public DocumentProjector indexingProjector() {
// IndexingConfig null if no indexing definitions: default, index all:
if (indexingConfig == null) {
return DocumentProjector.identityProjector();
}
// otherwise get lazily initialized indexing projector from config
return indexingConfig.indexingProjector();
}

public record IndexingConfig(
Set<String> allowed, Set<String> denied, Supplier<DocumentProjector> indexedProject) {
public IndexingConfig(Set<String> allowed, Set<String> denied) {
this(
allowed,
denied,
Suppliers.memoize(() -> DocumentProjector.createForIndexing(allowed, denied)));
Copy link
Contributor Author

@tatu-at-datastax tatu-at-datastax Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is done so that DocumentProjector is only created first time it is needed (if at all) and reused if used in future (as CollectionSettings are effectively cached)

}

public DocumentProjector indexingProjector() {
return indexedProject.get();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does build the Projector for every call. Is the idea to get this once use it in code passing everywhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it uses Guava's memoize() to avoid just that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try to come up with a test to verify that we only get one instance; maybe CreateCollectionCommandResolverTest?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, easier to verify with stand-alone unit test: see CollectionSettingsTest which verifies that instance is created dynamically, and then same instance returned for further calls to get instance.

}

public static IndexingConfig fromJson(JsonNode jsonNode) {
Set<String> allowed = new HashSet<>();
Set<String> denied = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,11 @@ private Uni<UpdatedDocument> processUpdate(
}

final WritableShreddedDocument writableShreddedDocument =
shredder().shred(documentUpdaterResponse.document(), readDocument.txnId());
shredder()
.shred(
documentUpdaterResponse.document(),
readDocument.txnId(),
commandContext().indexingProjector());

// Have to do this because shredder adds _id field to the document if it doesn't exist
JsonNode updatedDocument = writableShreddedDocument.docJsonNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
import io.stargate.sgv2.jsonapi.exception.JsonApiException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;

/**
* Helper class that implements functionality needed to support projections on documents fetched via
Expand Down Expand Up @@ -48,7 +50,7 @@ public static DocumentProjector createFromDefinition(
JsonNode projectionDefinition, boolean includeSimilarity) {
if (projectionDefinition == null) {
if (includeSimilarity) {
return IDENTITY_PROJECTOR_WITH_SIMILARITY;
return identityProjectorWithSimilarity();
} else {
return identityProjector();
}
Expand All @@ -63,12 +65,45 @@ public static DocumentProjector createFromDefinition(
return PathCollector.collectPaths(projectionDefinition, includeSimilarity).buildProjector();
}

public static DocumentProjector createForIndexing(Set<String> allowed, Set<String> denied) {
// Sets are expected to be validated to have one of 3 main cases:
// 1. Non-empty "allowed" (but empty/null "denied") -> build inclusion projection
// 2. Non-empty "denied" (but empty/null "allowed") -> build exclusion projection
// 3. Empty/null "allowed" and "denied" -> return identity projection
// as well as 2 special cases:
// 4. Empty "allowed" and single "*" entry for "denied" -> return exclude-all projection
// 5. Empty "deny" and single "*" entry for "allowed" -> return include-all ("identity")
// projection
// We need not (and should not) do further validation here.
// Note that (5) is effectively same as (3) and included for sake of uniformity
if (allowed != null && !allowed.isEmpty()) {
// (special) Case 5:
if (allowed.size() == 1 && allowed.contains("*")) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allowed can't have *, it's only supported for denied.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is something I wanted to discuss (should have brought up) -- for consistency it would make sense to allow this. But for minimal approach not. Will ask on Stargate channel.

My main concern is if users would pass this, assuming it works, and it meaning something totally different.

return identityProjector();
}
// Case 1: inclusion-based projection
return new DocumentProjector(ProjectionLayer.buildLayersOverlapOk(allowed), true, false);
}
if (denied != null && !denied.isEmpty()) {
// (special) Case 4:
if (denied.size() == 1 && denied.contains("*")) {
// Basically inclusion projector with nothing to include
return new DocumentProjector(
ProjectionLayer.buildLayersOverlapOk(Collections.emptySet()), true, false);
}
// Case 2: exclusion-based projection
return new DocumentProjector(ProjectionLayer.buildLayersOverlapOk(denied), false, false);
}
// Case 3: include-all (identity) projection
return identityProjector();
}

public static DocumentProjector identityProjector() {
return IDENTITY_PROJECTOR;
}

public static DocumentProjector getIdentityProjectorWithSimilarity() {
return IDENTITY_PROJECTOR;
public static DocumentProjector identityProjectorWithSimilarity() {
return IDENTITY_PROJECTOR_WITH_SIMILARITY;
}

public boolean isInclusion() {
Expand Down Expand Up @@ -102,6 +137,32 @@ public void applyProjection(JsonNode document, Float similarityScore) {
}
}

/**
* Method to call to check if given path (dotted path, that is, dot-separated segments) would be
* included by this Projection. That is, either
*
* <ul>
* <li>This is inclusion projection, and path is covered by an inclusion path
* <li>This is exclusion projection, and path is NOT covered by any exclusion path
* </ul>
*
* @param path Dotted path (possibly nested) to check
* @return {@code true} if path is included; {@code false} if not.
*/
public boolean isPathIncluded(String path) {
// First: if we have no layers, we are identity projector and include everything
if (rootLayer == null) {
return true;
}
// Otherwise need to split path, evaluate; but note reversal wrt include/exclude
// projections
if (inclusion) {
return rootLayer.isPathIncluded(path);
} else {
return !rootLayer.isPathIncluded(path);
}
}

// Mostly for deserialization tests
@Override
public boolean equals(Object o) {
Expand Down Expand Up @@ -151,13 +212,13 @@ public DocumentProjector buildProjector() {
if (inclusions > 0) { // inclusion-based projection
// doc-id included unless explicitly excluded
return new DocumentProjector(
ProjectionLayer.buildLayers(paths, slices, !Boolean.FALSE.equals(idInclusion)),
ProjectionLayer.buildLayersNoOverlap(paths, slices, !Boolean.FALSE.equals(idInclusion)),
true,
includeSimilarityScore);
} else { // exclusion-based
// doc-id excluded only if explicitly excluded
return new DocumentProjector(
ProjectionLayer.buildLayers(paths, slices, Boolean.FALSE.equals(idInclusion)),
ProjectionLayer.buildLayersNoOverlap(paths, slices, Boolean.FALSE.equals(idInclusion)),
false,
includeSimilarityScore);
}
Expand Down
Loading