diff --git a/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java b/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
index 01ae45c84e0..d91f1aff47c 100644
--- a/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
+++ b/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
@@ -211,6 +211,11 @@ public class QueryParameters {
     */
    public static final String EXCERPT_FIELDS = "excerpt.fields";

+    /**
+     * Used to specify the size, and optionally the content names, of summaries that should be returned.
+     */
+    public static final String SUMMARY = "summary.size";
+
    /**
     * Used to specify model or DB fields that should be treated as lenient (can be skipped if normalization fails)
     */
diff --git a/warehouse/query-core/src/main/java/datawave/query/attributes/SummarySize.java b/warehouse/query-core/src/main/java/datawave/query/attributes/SummarySize.java
new file mode 100644
index 00000000000..fcbbaa7ad54
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/attributes/SummarySize.java
@@ -0,0 +1,210 @@
+package datawave.query.attributes;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonValue;
+
+import datawave.query.Constants;
+import datawave.query.postprocessing.tf.PhraseIndexes;
+
+/**
+ * Represents options for a summary that have been specified within a #SUMMARY function. An instance of {@link SummarySize} can easily be captured as a
+ * parameter string using {@link SummarySize#toString()}, and transformed back into a {@link SummarySize} instance via {@link SummarySize#from(String)}.
+ */
+public class SummarySize implements Serializable {
+
+    private static final long serialVersionUID = 6769159729743311079L;
+
+    private static final int DEFAULT_SIZE = 150;
+    private static final Logger log = LoggerFactory.getLogger(SummarySize.class);
+
+    private int summarySize;
+    private ArrayList<String> contentNamesList;
+    private boolean only;
+
+    public SummarySize() {
+        summarySize = DEFAULT_SIZE;
+        contentNamesList = new ArrayList<>();
+        only = false;
+    }
+
+    /**
+     * Returns a new {@link SummarySize} parsed from the string. The provided string is expected to have the format returned by {@link SummarySize#toString()}.
+     *
+     * @param string
+     *            the string to parse
+     * @return the parsed {@link SummarySize}
+     */
+    @JsonCreator
+    public static SummarySize from(String string) {
+        if (string == null) {
+            return null;
+        }
+        // Strip whitespace.
+ string = PhraseIndexes.whitespacePattern.matcher(string).replaceAll(""); + + if (string.isEmpty()) { + return new SummarySize(); + } + + SummarySize summarySize = new SummarySize(); + + String[] parameterParts = string.split(Constants.FORWARD_SLASH); + // add the size + summarySize.summarySize = Integer.parseInt(parameterParts[0]); + // if 2 parts, assume the second part is a list of content names + if (parameterParts.length == 2) { + Collections.addAll(summarySize.contentNamesList, parameterParts[1].split(Constants.COMMA)); + } else if (parameterParts.length >= 3) { // if 3 parts, assume part 2 is "only" and part 3 is a list of content names + if (parameterParts[1].equalsIgnoreCase("ONLY")) { + summarySize.only = true; + } + Collections.addAll(summarySize.contentNamesList, parameterParts[2].split(Constants.COMMA)); + } + + return summarySize; + } + + /** + * Returns a copy of the given {@link SummarySize} + * + * @param other + * the instance to copy + * @return the copy + */ + public static SummarySize copyOf(SummarySize other) { + if (other == null) { + return null; + } + SummarySize summarySize = new SummarySize(); + summarySize.summarySize = other.summarySize; + summarySize.contentNamesList = new ArrayList<>(other.contentNamesList); + summarySize.only = other.only; + return summarySize; + } + + public List getContentNames() { + return contentNamesList; + } + + public int getSummarySize() { + return summarySize; + } + + public boolean onlyListedContents() { + return only; + } + + public void addContentName(String contentName) { + contentNamesList.add(contentName); + } + + public void addContentName(String contentName, int index) { + if (index < contentNamesList.size() && index >= 0) { + contentNamesList.add(index, contentName); + } else { + log.info("index out of bounds, adding to beginning of list"); + contentNamesList.add(0, contentName); + } + } + + public void addContentNameToBeginning(String contentName) { + contentNamesList.add(0, contentName); + } + + /** + * Replace a content name with another content name + * + * @param contentName + * the one to replace + * @param replacement + * the one to replace the other + */ + public void replace(String contentName, String replacement) { + int index = contentNamesList.indexOf(contentName); + if (index != -1) { + contentNamesList.set(index, replacement); + } + } + + /** + * Return whether this {@link SummarySize} content names list is empty. + * + * @return true if empty, or false otherwise + */ + public boolean isEmpty() { + return contentNamesList.isEmpty(); + } + + public String contentNamesListToString() { + if (contentNamesList.isEmpty()) { + return ""; + } + + StringBuilder sb = new StringBuilder(); + for (String contentName : contentNamesList) { + sb.append(contentName).append(Constants.COMMA); + } + return sb.substring(0, sb.length() - 1); + } + + public static String[] contentNamesListFromString(String string) { + return string.split(Constants.COMMA); + } + + /** + * Returns this {@link SummarySize} as a formatted string that can later be parsed back into a {@link SummarySize} using {@link SummarySize#from(String)}. + * This is also what will be used when serializing a {@link SummarySize} to JSON/XML. The string will have the format + * {@code size/[only]/[contentName1, contentName2, ....]}. 
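+     * <p>
+     * For example (an illustrative round trip): {@code SummarySize.from("100/ONLY/CONTENT,CONTENT1")} yields a size of 100, {@code only} set to true, and
+     * content names {@code [CONTENT, CONTENT1]}; calling {@code toString()} on that instance returns {@code "100/ONLY/CONTENT,CONTENT1"} again.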
+ * + * @return a formatted string + */ + @JsonValue + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(summarySize); + if (only) { + sb.append("/").append("ONLY"); + } + if (!contentNamesList.isEmpty()) { + sb.append("/"); + for (String contentName : contentNamesList) { + sb.append(contentName).append(Constants.COMMA); + } + return sb.substring(0, sb.length() - 1); + } + return sb.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SummarySize that = (SummarySize) o; + return Objects.equals(summarySize, that.summarySize) && Objects.equals(contentNamesList, that.contentNamesList) && Objects.equals(only, that.only); + } + + @Override + public int hashCode() { + return Objects.hash(summarySize, contentNamesList, only); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index b88cef085f2..700ccb0911f 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -47,11 +47,13 @@ import datawave.query.DocumentSerialization.ReturnType; import datawave.query.QueryParameters; import datawave.query.attributes.ExcerptFields; +import datawave.query.attributes.SummarySize; import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.function.DocumentPermutation; import datawave.query.iterator.QueryIterator; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; +import datawave.query.iterator.logic.DColumnSummaryIterator; import datawave.query.iterator.logic.TermFrequencyExcerptIterator; import datawave.query.jexl.JexlASTHelper; import datawave.query.jexl.visitors.JexlStringBuildingVisitor; @@ -438,6 +440,11 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement // The class for the excerpt iterator private Class> excerptIterator = TermFrequencyExcerptIterator.class; + private SummarySize summarySize = new SummarySize(); + + // The class for the summary iterator + private Class> summaryIterator = DColumnSummaryIterator.class; + /** * A bloom filter to avoid duplicate results if needed */ @@ -734,6 +741,8 @@ public void copyFrom(ShardQueryConfiguration other) { this.setStrictFields(other.getStrictFields()); this.setExcerptFields(ExcerptFields.copyOf(other.getExcerptFields())); this.setExcerptIterator(other.getExcerptIterator()); + this.setSummarySize(SummarySize.copyOf(other.getSummarySize())); + this.setSummaryIterator(other.getSummaryIterator()); this.setFiFieldSeek(other.getFiFieldSeek()); this.setFiNextSeek(other.getFiNextSeek()); this.setEventFieldSeek(other.getEventFieldSeek()); @@ -2611,6 +2620,24 @@ public void setExcerptIterator(Class this.excerptIterator = excerptIterator; } + public SummarySize getSummarySize() { + return summarySize; + } + + public void setSummarySize(SummarySize summarySize) { + if (summarySize != null) { + this.summarySize = summarySize; + } + } + + public Class> getSummaryIterator() { + return summaryIterator; + } + + public void setSummaryIterator(Class> summaryIterator) { + this.summaryIterator = summaryIterator; + } + public int getFiFieldSeek() { return fiFieldSeek; } @@ -2990,6 +3017,7 @@ public boolean equals(Object o) { 
Objects.equals(getLenientFields(), that.getLenientFields()) && Objects.equals(getStrictFields(), that.getStrictFields()) && Objects.equals(getExcerptFields(), that.getExcerptFields()) && + Objects.equals(getSummarySize(), that.getSummarySize()) && getFiFieldSeek() == that.getFiFieldSeek() && getFiNextSeek() == that.getFiNextSeek() && getEventFieldSeek() == that.getEventFieldSeek() && @@ -3195,6 +3223,7 @@ public int hashCode() { getLenientFields(), getStrictFields(), getExcerptFields(), + getSummarySize(), getFiFieldSeek(), getFiNextSeek(), getEventFieldSeek(), diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index 6509b471db3..5f0e868ef80 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -116,6 +116,7 @@ import datawave.query.tracking.ActiveQuery; import datawave.query.tracking.ActiveQueryLog; import datawave.query.transformer.ExcerptTransform; +import datawave.query.transformer.SummaryTransform; import datawave.query.transformer.UniqueTransform; import datawave.query.util.EmptyContext; import datawave.query.util.EntryToTuple; @@ -203,6 +204,8 @@ public class QueryIterator extends QueryOptions implements YieldingKeyValueItera protected ExcerptTransform excerptTransform = null; + protected SummaryTransform summaryTransform = null; + protected RangeProvider rangeProvider; public QueryIterator() {} @@ -830,6 +833,11 @@ public Entry apply(@Nullable Entry input) { documents = excerptTransform.getIterator(documents); } + SummaryTransform summaryTransform = getSummaryTransform(); + if (summaryTransform != null) { + documents = summaryTransform.getIterator(documents); + } + // a hook to allow mapping the document such as with the TLD or Parent // query logics // or if the document was not aggregated in the first place because the @@ -1625,6 +1633,22 @@ protected ExcerptTransform getExcerptTransform() { return excerptTransform; } + protected SummaryTransform getSummaryTransform() { + if (summaryTransform == null && getSummarySize() != null) { + synchronized (getSummarySize()) { + if (summaryTransform == null) { + try { + summaryTransform = new SummaryTransform(summarySize, myEnvironment, sourceForDeepCopies.deepCopy(myEnvironment), + summaryIterator.getDeclaredConstructor().newInstance()); + } catch (NoSuchMethodException | InstantiationException | IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException("Could not create summary transform", e); + } + } + } + } + return summaryTransform; + } + /** * Get a default implementation of a {@link RangeProvider} * diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java index e0e58327a3e..809b2c6ebe4 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryOptions.java @@ -61,6 +61,7 @@ import datawave.query.DocumentSerialization; import datawave.query.attributes.Document; import datawave.query.attributes.ExcerptFields; +import datawave.query.attributes.SummarySize; import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.composite.CompositeMetadata; @@ -79,6 +80,7 @@ import 
datawave.query.iterator.filter.KeyIdentity; import datawave.query.iterator.filter.StringToText; import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; +import datawave.query.iterator.logic.DColumnSummaryIterator; import datawave.query.iterator.logic.IndexIterator; import datawave.query.iterator.logic.TermFrequencyExcerptIterator; import datawave.query.jexl.DefaultArithmetic; @@ -259,6 +261,10 @@ public class QueryOptions implements OptionDescriber { public static final String EXCERPT_ITERATOR = "excerpt.iterator.class"; + public static final String SUMMARY = "summary.size"; + + public static final String SUMMARY_ITERATOR = "summary.iterator.class"; + // field and next thresholds before a seek is issued public static final String FI_FIELD_SEEK = "fi.field.seek"; public static final String FI_NEXT_SEEK = "fi.next.seek"; @@ -431,6 +437,10 @@ public class QueryOptions implements OptionDescriber { protected Class> excerptIterator = TermFrequencyExcerptIterator.class; + protected SummarySize summarySize; + + protected Class> summaryIterator = DColumnSummaryIterator.class; + // off by default, controls when to issue a seek private int fiFieldSeek = -1; private int fiNextSeek = -1; @@ -549,6 +559,8 @@ public void deepCopy(QueryOptions other) { this.excerptFields = other.excerptFields; this.excerptFieldsNoHitCallout = other.excerptFieldsNoHitCallout; this.excerptIterator = other.excerptIterator; + this.summarySize = other.summarySize; + this.summaryIterator = other.summaryIterator; this.fiFieldSeek = other.fiFieldSeek; this.fiNextSeek = other.fiNextSeek; @@ -1263,6 +1275,22 @@ public void setExcerptIterator(Class this.excerptIterator = excerptIterator; } + public SummarySize getSummarySize() { + return summarySize; + } + + public void setSummarySize(SummarySize summarySize) { + this.summarySize = summarySize; + } + + public Class> getSummaryIterator() { + return summaryIterator; + } + + public void setSummaryIterator(Class> summaryIterator) { + this.summaryIterator = summaryIterator; + } + @Override public IteratorOptions describeOptions() { Map options = new HashMap<>(); @@ -1356,6 +1384,8 @@ public IteratorOptions describeOptions() { options.put(EXCERPT_FIELDS, "excerpt fields"); options.put(EXCERPT_FIELDS_NO_HIT_CALLOUT, "excerpt fields no hit callout"); options.put(EXCERPT_ITERATOR, "excerpt iterator class (default datawave.query.iterator.logic.TermFrequencyExcerptIterator"); + options.put(SUMMARY, "The size of the summary to return with possible options (ONLY) and list of contentNames"); + options.put(SUMMARY_ITERATOR, "summary iterator class (default datawave.query.iterator.logic.DColumnSummaryIterator"); options.put(FI_FIELD_SEEK, "The number of fields traversed by a Field Index data filter or aggregator before a seek is issued"); options.put(FI_NEXT_SEEK, "The number of next calls made by a Field Index data filter or aggregator before a seek is issued"); options.put(EVENT_FIELD_SEEK, "The number of fields traversed by an Event data filter or aggregator before a seek is issued"); @@ -1862,6 +1892,18 @@ public boolean validateOptions(Map options) { } } + if (options.containsKey(SUMMARY)) { + setSummarySize(SummarySize.from(options.get(SUMMARY))); + } + + if (options.containsKey(SUMMARY_ITERATOR)) { + try { + setSummaryIterator((Class>) Class.forName(options.get(SUMMARY_ITERATOR))); + } catch (ClassNotFoundException e) { + throw new RuntimeException("Could not get class for " + options.get(SUMMARY_ITERATOR), e); + } + } + return true; } diff --git 
a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/DColumnSummaryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/DColumnSummaryIterator.java index 39b1ccd3da8..f231c6b9693 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/DColumnSummaryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/DColumnSummaryIterator.java @@ -1,7 +1,18 @@ package datawave.query.iterator.logic; -import datawave.query.Constants; -import datawave.query.table.parser.ContentKeyValueFactory; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getDtUid; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getDtUidFromEventKey; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getSortedCFs; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; + import org.apache.accumulo.core.data.ArrayByteSequence; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; @@ -13,22 +24,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.SortedSet; - -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getDtUid; -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getDtUidFromEventKey; -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.getSortedCFs; +import datawave.query.Constants; +import datawave.query.attributes.SummarySize; +import datawave.query.table.parser.ContentKeyValueFactory; /** - * This iterator is intended to scan the d column for a specified document. The result will be a summary - * for each document scanned. + * This iterator is intended to scan the d column for a specified document. The result will be a summary for each document scanned. */ public class DColumnSummaryIterator implements SortedKeyValueIterator { private static final Logger log = LoggerFactory.getLogger(DColumnSummaryIterator.class); @@ -37,9 +38,21 @@ public class DColumnSummaryIterator implements SortedKeyValueIterator public static final String SUMMARY_SIZE = "summary.size"; - public static final String CONTENT_NAME = "content.name"; + public static final String CONTENT_NAMES = "content.names"; - private final ArrayList contentSummaryOrder = new ArrayList<>(); + public static final String ONLY_SPECIFIED = "only.specified"; + + /** + * A list of content names to potentially create a summary for. 
The closer to the front in the list, the higher the priority to get a summary for that
+     * content
+     */
+    protected final ArrayList<String> contentSummaryOrder = new ArrayList<>();
+
+    /** the size in bytes of the summary to create */
+    protected int summarySize;
+
+    /** if we will only look at the content names specified in the query */
+    protected boolean onlySpecified;

    /** the underlying source */
    protected SortedKeyValueIterator<Key,Value> source;
@@ -59,8 +72,6 @@ public class DColumnSummaryIterator implements SortedKeyValueIterator
    /** the top value */
    protected Value tv;

-    protected int summarySize;
-
    @Override
    public boolean hasTop() {
        return tk != null;
    }

    @Override
    public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {

    @Override
    public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
        this.source = source;
-        this.summarySize = Integer.parseInt(options.get(SUMMARY_SIZE));
-        if(options.containsKey(CONTENT_NAME)) {
-            contentSummaryOrder.remove(options.get(CONTENT_NAME));
-            contentSummaryOrder.add(0, options.get(CONTENT_NAME));
+
+        contentSummaryOrder.addAll(List.of("CONTENT", "CONTENT1"));
+
+        // clamp the requested summary size to the range [1, 1500] bytes, defaulting to 150
+        if (options.containsKey(SUMMARY_SIZE)) {
+            this.summarySize = Math.max(1, Math.min(Integer.parseInt(options.get(SUMMARY_SIZE)), 1500));
+        } else {
+            this.summarySize = 150;
+        }
+
+        // if "ONLY" we will clear the content names list so that we only use the ones passed in
+        if (options.containsKey(ONLY_SPECIFIED)) {
+            onlySpecified = Boolean.parseBoolean(options.get(ONLY_SPECIFIED));
+            if (onlySpecified) {
+                contentSummaryOrder.clear();
+            }
+        } else {
+            onlySpecified = false;
        }
-    }

-    public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env, List<String> contentSummaryOrder) throws IOException {
-        this.contentSummaryOrder.addAll(contentSummaryOrder);
-        init(source, options, env);
+        // add the content names to the front of the list in the order specified
+        // (e.g., content.names=CONTENT2,CONTENT over the defaults [CONTENT, CONTENT1] yields [CONTENT2, CONTENT, CONTENT1])
+        if (options.containsKey(CONTENT_NAMES)) {
+            String[] nameList = SummarySize.contentNamesListFromString(options.get(CONTENT_NAMES));
+            for (int i = nameList.length - 1; i >= 0; i--) {
+                String name = nameList[i];
+                contentSummaryOrder.remove(name);
+                contentSummaryOrder.add(0, name);
+            }
+        }
    }

    @Override
@@ -121,8 +150,7 @@ public void seek(Range range, Collection columnFamilies, boolean i
            startKey = new Key(range.getStartKey().getRow(), Constants.D_COLUMN_FAMILY);
        } else {
            // otherwise start at the first document specified
-            startKey = new Key(range.getStartKey().getRow(), Constants.D_COLUMN_FAMILY,
-                            new Text(this.columnFamilies.first() + Constants.NULL));
+            startKey = new Key(range.getStartKey().getRow(), Constants.D_COLUMN_FAMILY, new Text(this.columnFamilies.first() + Constants.NULL));
        }
    } else {
        // we had a start document specified in the start key, so start there
@@ -206,7 +234,8 @@ public void next() throws IOException {
        Key top = source.getTopKey();

-        final Map<String,byte[]> foundContent = new HashMap<>();
+        // this is where we will save all the content found for this document
+        final Map<String,byte[]> foundContent = new HashMap<>();

        // while we have d keys for the same document
        while (source.hasTop() && dtUid.equals(getDtUidFromDKey(source.getTopKey()))) {
@@ -215,7 +244,8 @@
            // get the content name
            String currentContentName = getContentName(top);

-            if(contentSummaryOrder.contains(currentContentName)) {
+            // if this is one of the contents we want to use, add it to the map
+            if (contentSummaryOrder.stream().anyMatch(currentContentName::equalsIgnoreCase)) {
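+                // (the match is case-insensitive, but the content is stored under the exact name found in the d-column key)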
                foundContent.put(currentContentName, source.getTopValue().get());
            }
@@ -223,8 +253,9 @@
            source.next();
        }

-        String summary = createSummary(contentSummaryOrder, foundContent);
-        if(summary != null) {
+        // create the summary
+        String summary = createSummary(contentSummaryOrder, foundContent, summarySize);
+        if (summary != null) {
            tk = new Key(top.getRow(), new Text(dtUid), new Text(summary), top.getColumnVisibility());
            tv = new Value();
            return;
@@ -235,10 +266,30 @@
        tv = null;
    }

-    private static String createSummary(List<String> contentSummaryOrder, Map<String,byte[]> foundContent) {
-        for(String name : contentSummaryOrder) {
-            if(foundContent.containsKey(name)) {
-                return name + Constants.NULL + new String(ContentKeyValueFactory.decodeAndDecompressContent(foundContent.get(name)));
+    /**
+     * Attempts to create a summary from the found contents.
+     *
+     * @param contentSummaryOrder
+     *            the order in which to check for contents; the first one found will have a summary made from it
+     * @param foundContent
+     *            the map of all the content found for the document
+     * @param summarySize
+     *            the size in bytes of the summary to create
+     * @return the created summary
+     */
+    private static String createSummary(List<String> contentSummaryOrder, Map<String,byte[]> foundContent, int summarySize) {
+        // check each potential content name we could make summaries for
+        for (String name : contentSummaryOrder) {
+            // if we have a content name that matches the list...
+            if (foundContent.containsKey(name)) {
+                // decode and decompress the content
+                String summary = new String(ContentKeyValueFactory.decodeAndDecompressContent(foundContent.get(name)));
+                // if the content is longer than the specified length, truncate it
+                if (summary.length() > summarySize) {
+                    summary = summary.substring(0, summarySize);
+                }
+                // return the content name and summary separated by a null byte
+                return name + Constants.NULL + summary;
            }
        }
        return null;
@@ -288,9 +339,14 @@ private static String getDtUidFromDKey(Key dKey) {
        return getDtUid(dKey.getColumnQualifier().toString());
    }

+    public void setContentNameList(List<String> contentNameList) {
+        contentSummaryOrder.clear();
+        contentSummaryOrder.addAll(contentNameList);
+    }
+
    @Override
    public String toString() {
-        return "DColumnExcerptIterator: " + contentSummaryOrder;
+        return "DColumnSummaryIterator: " + summarySize + ", " + onlySpecified + ", " + contentSummaryOrder;
    }
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctions.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctions.java
index 73fe785a3ec..7bf58108dfe 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctions.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctions.java
@@ -28,6 +28,7 @@ public class QueryFunctions {
    public static final String UNIQUE_FUNCTION = "unique";
    public static final String GROUPBY_FUNCTION = "groupby";
    public static final String EXCERPT_FIELDS_FUNCTION = "excerpt_fields";
+    public static final String SUMMARY_FUNCTION = "summary";
    public static final String LENIENT_FIELDS_FUNCTION = "lenient";
    public static final String STRICT_FIELDS_FUNCTION = "strict";
    public static final String MATCH_REGEX = "matchRegex";
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctionsDescriptor.java
index 718cfa7ff50..cc1251d3d9f 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctionsDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/QueryFunctionsDescriptor.java @@ -261,6 +261,7 @@ private static void verify(String name, int numArgs) { case QueryOptionsFromQueryVisitor.UniqueFunction.UNIQUE_BY_YEAR_FUNCTION: case QueryFunctions.GROUPBY_FUNCTION: case QueryFunctions.EXCERPT_FIELDS_FUNCTION: + case QueryFunctions.SUMMARY_FUNCTION: case QueryFunctions.MATCH_REGEX: case QueryFunctions.INCLUDE_TEXT: case QueryFunctions.NO_EXPANSION: diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/QueryOptionsFromQueryVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/QueryOptionsFromQueryVisitor.java index a9b9bdb8de0..5f96f5b4419 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/QueryOptionsFromQueryVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/QueryOptionsFromQueryVisitor.java @@ -59,7 +59,7 @@ public class QueryOptionsFromQueryVisitor extends RebuildingVisitor { QueryFunctions.UNIQUE_FUNCTION, UniqueFunction.UNIQUE_BY_DAY_FUNCTION, UniqueFunction.UNIQUE_BY_HOUR_FUNCTION, UniqueFunction.UNIQUE_BY_MINUTE_FUNCTION, UniqueFunction.UNIQUE_BY_TENTH_OF_HOUR_FUNCTION, UniqueFunction.UNIQUE_BY_MONTH_FUNCTION, UniqueFunction.UNIQUE_BY_SECOND_FUNCTION, UniqueFunction.UNIQUE_BY_MILLISECOND_FUNCTION, UniqueFunction.UNIQUE_BY_YEAR_FUNCTION, - QueryFunctions.GROUPBY_FUNCTION, QueryFunctions.EXCERPT_FIELDS_FUNCTION, QueryFunctions.NO_EXPANSION, + QueryFunctions.GROUPBY_FUNCTION, QueryFunctions.EXCERPT_FIELDS_FUNCTION, QueryFunctions.SUMMARY_FUNCTION, QueryFunctions.NO_EXPANSION, QueryFunctions.LENIENT_FIELDS_FUNCTION, QueryFunctions.STRICT_FIELDS_FUNCTION, QueryFunctions.SUM, QueryFunctions.MIN, QueryFunctions.MAX, QueryFunctions.AVERAGE, QueryFunctions.COUNT, QueryFunctions.RENAME_FUNCTION); @@ -249,6 +249,12 @@ private Object visit(ASTFunctionNode node, Map optionsMap) { updateFieldsOption(optionsMap, QueryParameters.EXCERPT_FIELDS, optionsList); return null; } + case QueryFunctions.SUMMARY_FUNCTION: { + List options = new ArrayList<>(); + this.visit(node, options); + optionsMap.put(QueryParameters.SUMMARY, JOINER.join(options)); + return null; + } case QueryFunctions.NO_EXPANSION: { List optionsList = new ArrayList<>(); this.visit(node, optionsList); diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/SummarySize.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/SummarySize.java new file mode 100644 index 00000000000..b4357702a42 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/SummarySize.java @@ -0,0 +1,63 @@ +package datawave.query.language.functions.jexl; + +import java.text.MessageFormat; +import java.util.ArrayList; + +import datawave.query.jexl.functions.QueryFunctions; +import datawave.query.language.functions.QueryFunction; +import datawave.webservice.query.exception.BadRequestQueryException; +import datawave.webservice.query.exception.DatawaveErrorCode; + +/** + * Function to specify when summaries should be included for results for any hit documents. This function accepts a string in the format + * {@code size/[only]/[contentName1, contentName2, ....]}. See {@link datawave.query.attributes.SummarySize} for additional documentation on supported + * formatting. 
+ */ +public class SummarySize extends JexlQueryFunction { + + public SummarySize() { + super(QueryFunctions.SUMMARY_FUNCTION, new ArrayList<>()); + } + + @Override + public void validate() throws IllegalArgumentException { + if (this.parameterList.isEmpty()) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, + MessageFormat.format("{0} requires at least one argument", this.name)); + throw new IllegalArgumentException(qe); + } else { + String parameters = String.join(",", parameterList); + try { + datawave.query.attributes.SummarySize.from(parameters); + } catch (Exception e) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, + MessageFormat.format("Unable to parse summary options from arguments for function {0}", this.name)); + throw new IllegalArgumentException(qe); + } + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append(QueryFunctions.QUERY_FUNCTION_NAMESPACE).append(':').append(QueryFunctions.SUMMARY_FUNCTION); + if (parameterList.isEmpty()) { + sb.append("()"); + } else { + char separator = '('; + for (String parm : parameterList) { + sb.append(separator).append(escapeString(parm)); + separator = ','; + } + sb.append(')'); + } + + return sb.toString(); + } + + @Override + public QueryFunction duplicate() { + return new SummarySize(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 8fbebd477e8..a8df9d7b17e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -75,6 +75,7 @@ import datawave.query.Constants; import datawave.query.QueryParameters; import datawave.query.attributes.ExcerptFields; +import datawave.query.attributes.SummarySize; import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.composite.CompositeMetadata; @@ -622,6 +623,8 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c configureExcerpts(config, cfg); + configureSummaries(config, cfg); + addOption(cfg, QueryOptions.LIMIT_FIELDS, config.getLimitFieldsAsString(), false); addOption(cfg, QueryOptions.MATCHING_FIELD_SETS, config.getMatchingFieldSetsAsString(), false); addOption(cfg, QueryOptions.GROUP_FIELDS, config.getGroupFields().toString(), true); @@ -655,6 +658,11 @@ private void configureExcerpts(ShardQueryConfiguration config, IteratorSetting c } } + private void configureSummaries(ShardQueryConfiguration config, IteratorSetting cfg) { + addOption(cfg, QueryOptions.SUMMARY, config.getSummarySize().toString(), true); + addOption(cfg, QueryOptions.SUMMARY_ITERATOR, config.getSummaryIterator().getName(), false); + } + /* * (non-Javadoc) * @@ -1868,6 +1876,11 @@ protected ASTJexlScript upperCaseIdentifiers(MetadataHelper metadataHelper, Shar Sets.newHashSet(excerptFields.getFields()).stream().forEach(s -> excerptFields.replace(s, s.toUpperCase())); } + SummarySize summarySize = config.getSummarySize(); + if (summarySize != null && !summarySize.isEmpty()) { + summarySize.getContentNames().stream().forEach(s -> summarySize.replace(s, s.toUpperCase())); + } + Set userProjection = config.getRenameFields(); if (userProjection != null && !userProjection.isEmpty()) { 
config.setRenameFields(upcase(userProjection)); diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/QueryOptionsSwitch.java b/warehouse/query-core/src/main/java/datawave/query/planner/QueryOptionsSwitch.java index 249b33d2b26..4d32895754a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/QueryOptionsSwitch.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/QueryOptionsSwitch.java @@ -1,7 +1,6 @@ package datawave.query.planner; import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; @@ -14,6 +13,7 @@ import datawave.query.Constants; import datawave.query.QueryParameters; import datawave.query.attributes.ExcerptFields; +import datawave.query.attributes.SummarySize; import datawave.query.attributes.UniqueFields; import datawave.query.common.grouping.GroupFields; import datawave.query.config.ShardQueryConfiguration; @@ -68,6 +68,10 @@ public static void apply(Map optionsMap, ShardQueryConfiguration ExcerptFields excerptFields = ExcerptFields.from(value); config.setExcerptFields(excerptFields); break; + case QueryParameters.SUMMARY: + SummarySize summarySize = SummarySize.from(value); + config.setSummarySize(summarySize); + break; case QueryParameters.NO_EXPANSION_FIELDS: config.setNoExpansionFields(new HashSet<>(Arrays.asList(StringUtils.split(value, Constants.PARAM_VALUE_SEP)))); break; diff --git a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java index cb20caecc80..0fd257dbd55 100644 --- a/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/table/parser/ContentKeyValueFactory.java @@ -54,7 +54,7 @@ public static ContentKeyValue parse(Key key, Value value, Authorizations auths, return c; } - public static byte[] decodeAndDecompressContent(byte[] contents){ + public static byte[] decodeAndDecompressContent(byte[] contents) { try { contents = decompress(Base64.getMimeDecoder().decode(contents)); } catch (IOException e) { diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 3a113c75a66..bda06ab3439 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -63,6 +63,7 @@ import datawave.query.DocumentSerialization; import datawave.query.QueryParameters; import datawave.query.attributes.ExcerptFields; +import datawave.query.attributes.SummarySize; import datawave.query.attributes.UniqueFields; import datawave.query.cardinality.CardinalityConfiguration; import datawave.query.common.grouping.GroupFields; @@ -988,6 +989,14 @@ protected void loadQueryParameters(ShardQueryConfiguration config, Query setting } } + // Get the SUMMARY_SIZE parameter if given + String summarySizeParam = settings.findParameter(QueryParameters.SUMMARY).getParameterValue().trim(); + if (StringUtils.isNotBlank(summarySizeParam)) { + SummarySize summarySize = SummarySize.from(summarySizeParam); + this.setSummarySize(summarySize); + config.setSummarySize(summarySize); + } + // Get the HIT_LIST parameter if given String hitListString = settings.findParameter(QueryParameters.HIT_LIST).getParameterValue().trim(); if 
(StringUtils.isNotBlank(hitListString)) { @@ -1537,6 +1546,26 @@ public void setExcerptIterator(String iteratorClass) { } } + public SummarySize getSummarySize() { + return getConfig().getSummarySize(); + } + + public void setSummarySize(SummarySize summarySize) { + getConfig().setSummarySize(summarySize); + } + + public String getSummaryIterator() { + return getConfig().getSummaryIterator().getName(); + } + + public void setSummaryIterator(String iteratorClass) { + try { + getConfig().setSummaryIterator((Class>) Class.forName(iteratorClass)); + } catch (Exception e) { + throw new DatawaveFatalQueryException("Illegal d column summary iterator class", e); + } + } + public int getFiFieldSeek() { return getConfig().getFiFieldSeek(); } diff --git a/warehouse/query-core/src/main/java/datawave/query/transformer/SummaryTransform.java b/warehouse/query-core/src/main/java/datawave/query/transformer/SummaryTransform.java index 5041f0642cb..ca6fc81f4da 100644 --- a/warehouse/query-core/src/main/java/datawave/query/transformer/SummaryTransform.java +++ b/warehouse/query-core/src/main/java/datawave/query/transformer/SummaryTransform.java @@ -1,36 +1,12 @@ package datawave.query.transformer; -import com.google.common.collect.Iterators; -import com.google.protobuf.InvalidProtocolBufferException; -import datawave.common.util.ArgumentChecker; -import datawave.ingest.protobuf.TermWeight; -import datawave.ingest.protobuf.TermWeightPosition; -import datawave.query.Constants; -import datawave.query.attributes.Attribute; -import datawave.query.attributes.Attributes; -import datawave.query.attributes.Content; -import datawave.query.attributes.Document; -import datawave.query.attributes.ExcerptFields; -import datawave.query.attributes.ValueTuple; -import datawave.query.function.JexlEvaluation; -import datawave.query.iterator.logic.DColumnSummaryIterator; -import datawave.query.iterator.logic.TermFrequencyExcerptIterator; -import datawave.query.postprocessing.tf.PhraseIndexes; -import datawave.query.postprocessing.tf.PhraseOffset; -import org.apache.accumulo.core.data.Key; -import org.apache.accumulo.core.data.PartialKey; -import org.apache.accumulo.core.data.Range; -import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.IteratorEnvironment; -import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static datawave.query.iterator.logic.DColumnSummaryIterator.CONTENT_NAMES; +import static datawave.query.iterator.logic.DColumnSummaryIterator.ONLY_SPECIFIED; +import static datawave.query.iterator.logic.DColumnSummaryIterator.SUMMARY_SIZE; -import javax.annotation.Nullable; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -40,29 +16,52 @@ import java.util.Objects; import java.util.Set; -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.END_OFFSET; -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.FIELD_NAME; -import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.START_OFFSET; +import javax.annotation.Nullable; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.PartialKey; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import 
org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Iterators;
+
+import datawave.common.util.ArgumentChecker;
+import datawave.query.Constants;
+import datawave.query.attributes.Attributes;
+import datawave.query.attributes.Content;
+import datawave.query.attributes.Document;
+import datawave.query.attributes.DocumentKey;
+import datawave.query.attributes.SummarySize;
+import datawave.query.iterator.logic.DColumnSummaryIterator;
+import datawave.query.iterator.logic.TermFrequencyExcerptIterator;

public class SummaryTransform extends DocumentTransform.DefaultDocumentTransform {

    private static final Logger log = LoggerFactory.getLogger(SummaryTransform.class);

+    public static final String SUMMARY_ERROR_MESSAGE = "SOMETHING WENT WRONG GENERATING YOUR SUMMARY!";
+    private static final Summary ERROR_SUMMARY = new Summary(null, SUMMARY_ERROR_MESSAGE);
+    private static final Summary EMPTY_SUMMARY = new Summary(null, "NO CONTENT FOUND TO SUMMARIZE");
+
+    private static final String CONTENT_SUMMARY = "CONTENT_SUMMARY";
+
+    private final DColumnSummaryIterator summaryIterator;
-    private final ExcerptFields excerptFields;
+    private final SummarySize summarySize;
    private final IteratorEnvironment env;
    private final SortedKeyValueIterator<Key,Value> source;

-    private final ArrayList<String> hitTermValues = new ArrayList<>();
-
-    public SummaryTransform(ExcerptFields excerptFields, IteratorEnvironment env, SortedKeyValueIterator<Key,Value> source) {
-        this(excerptFields, env, source, new TermFrequencyExcerptIterator());
+    public SummaryTransform(SummarySize summarySize, IteratorEnvironment env, SortedKeyValueIterator<Key,Value> source) {
+        // default to a DColumnSummaryIterator, since the four-argument constructor casts to that type
+        this(summarySize, env, source, new DColumnSummaryIterator());
    }

-    public SummaryTransform(ExcerptFields excerptFields, IteratorEnvironment env, SortedKeyValueIterator<Key,Value> source,
-                    SortedKeyValueIterator<Key,Value> summaryIterator) {
-        ArgumentChecker.notNull(excerptFields);
-        this.excerptFields = excerptFields;
+    public SummaryTransform(SummarySize summarySize, IteratorEnvironment env, SortedKeyValueIterator<Key,Value> source,
+                    SortedKeyValueIterator<Key,Value> summaryIterator) {
+        ArgumentChecker.notNull(summarySize);
+        this.summarySize = summarySize;
+        this.env = env;
+        this.source = source;
+        this.summaryIterator = (DColumnSummaryIterator) summaryIterator;
@@ -73,18 +72,18 @@ public SummaryTransform(ExcerptFields excerptFields, IteratorEnvironment env, So
    public Entry<Key,Document> apply(@Nullable Entry<Key,Document> entry) {
        if (entry != null) {
            Document document = entry.getValue();
-            // Do not bother adding excerpts to transient documents.
+            // Do not bother adding summaries to transient documents.
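+            // (a document is transient when isToKeep() is false; only kept documents are enriched)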
            if (document.isToKeep()) {
-                PhraseIndexes phraseIndexes = getPhraseIndexes(document);
-                if (!phraseIndexes.isEmpty()) {
+                ArrayList<DocumentKey> documentKeys = getEventIds(document);
+                if (!documentKeys.isEmpty()) {
                    if (log.isTraceEnabled()) {
-                        log.trace("Fetching phrase excerpts {} for document {}", excerptFields, document.getMetadata());
+                        log.trace("Fetching summaries {} for document {}", summarySize, document.getMetadata());
                    }
-                    Set<Excerpt> excerpts = getExcerpts(phraseIndexes);
-                    addExcerptsToDocument(excerpts, document);
+                    Set<Summary> summaries = getExcerpts(documentKeys);
+                    addExcerptsToDocument(summaries, document);
                } else {
                    if (log.isTraceEnabled()) {
-                        log.trace("Phrase indexes were not added to document {}, skipping", document.getMetadata());
+                        log.trace("Document keys were not added to document {}, skipping", document.getMetadata());
                    }
                }
            }
@@ -93,357 +92,137 @@ public Entry apply(@Nullable Entry entry) {
    }

    /**
-     * Retrieve the phrase indexes from the {@value #PHRASE_INDEXES_ATTRIBUTE} attribute in the document.
+     * Retrieve the document keys (event ids) from the document.
     *
     * @param document
     *            the document
-     * @return the phrase indexes
+     * @return a list of the document keys
     */
-    private PhraseIndexes getPhraseIndexes(Document document) {
-        PhraseIndexes phraseIndexes = null;
-        PhraseIndexes allPhraseIndexes = new PhraseIndexes();
-        // first lets find all the phrase indexes that came from phrase functions
-        if (document.containsKey(PHRASE_INDEXES_ATTRIBUTE)) {
-            Content content = (Content) document.get(PHRASE_INDEXES_ATTRIBUTE);
-            phraseIndexes = PhraseIndexes.from(content.getContent());
-            allPhraseIndexes.addAll(phraseIndexes);
-        }
-        // now lets find all the terms for excerpt fields and add them to the list
-        if (document.containsKey(JexlEvaluation.HIT_TERM_FIELD)) {
-            Attributes hitList = (Attributes) document.get(JexlEvaluation.HIT_TERM_FIELD);
-            // for each hit term
-            for (Attribute attr : hitList.getAttributes()) {
-                ValueTuple hitTuple = attributeToHitTuple(attr);
-                // if this is for a requested excerpt field
-                if (excerptFields.containsField(hitTuple.getFieldName())) {
-                    // get the offset, preferring offsets that overlap with existing phrases for this field/eventId
-                    TermWeightPosition pos = getOffset(hitTuple, phraseIndexes);
-                    if (pos != null) {
-                        // add the term as a phrase as defined in the term weight position. Note that this will collapse with any overlapping phrases already in
-                        // the list.
-                        allPhraseIndexes.addIndexTriplet(String.valueOf(hitTuple.getFieldName()), keyToEventId(attr.getMetadata()), pos.getLowOffset(),
-                                        pos.getOffset());
-                    }
-                    // save the hit term for later call-out
-                    Collections.addAll(hitTermValues, ((String) hitTuple.getValue()).split(Constants.SPACE));
-                }
-            }
+    private static ArrayList<DocumentKey> getEventIds(Document document) {
+        ArrayList<DocumentKey> eventIds = new ArrayList<>();
+        if (document.containsKey("RECORD_ID")) {
+            eventIds.add((DocumentKey) document.get("RECORD_ID"));
+        } else {
+            Key key = document.getMetadata();
+            String[] cf = key.getColumnFamily().toString().split(Constants.NULL);
+            eventIds.add(new DocumentKey(key.getRow().toString(), cf[0], cf[1], document.isToKeep()));
        }
-        // return the file set of phrase indexes if any
-        return allPhraseIndexes;
-    }
-
-    /**
-     * Get the term weight position (offset) for the specified hit term. This will return an offset overlapping a phrase in the existing phrase index map first.
-     * Otherwise, the first position will be returned.
- * - * @param hitTuple - * The hit term tuple - * @param phraseIndexes - * The phrase indexes - * @return The TermWeightPosition for the given hit term - */ - private TermWeightPosition getOffset(ValueTuple hitTuple, PhraseIndexes phraseIndexes) { - Key docKey = hitTuple.getSource().getMetadata(); - // if we do not know the source document key, then we cannot find the term offset - if (docKey == null) { - log.warn("Unable to find the source document for a hit term, skipping excerpt generation"); - return null; - } - String fieldName = hitTuple.getFieldName(); - String eventId = keyToEventId(docKey); - - // get the key at which we would find the term frequencies - Key tfKey = new Key(docKey.getRow().toString(), Constants.TERM_FREQUENCY_COLUMN_FAMILY.toString(), - docKey.getColumnFamily().toString() + '\u0000' + hitTuple.getValue() + '\u0000' + hitTuple.getFieldName()); - Range range = new Range(tfKey, tfKey.followingKey(PartialKey.ROW_COLFAM_COLQUAL)); - try { - // seek directly to that key - source.seek(range, Collections.emptyList(), false); - if (source.hasTop()) { - TermWeightPosition pos = null; - - // parse the term frequencies - TermWeight.Info twInfo = TermWeight.Info.parseFrom(source.getTopValue().get()); - - // if we have phrase indexes, then find one that overlaps if any - if (phraseIndexes != null) { - pos = phraseIndexes.getOverlappingPosition(fieldName, eventId, twInfo); - } - - // if no overlapping phrases, then return the first position - if (pos == null) { - TermWeightPosition.Builder position = new TermWeightPosition.Builder(); - position.setTermWeightOffsetInfo(twInfo, 0); - pos = position.build(); - } - - return pos; - } - } catch (InvalidProtocolBufferException e) { - log.error("Value passed to aggregator was not of type TermWeight.Info for {}", tfKey, e); - } catch (IOException e) { - log.error("Failed to scan for term frequencies at {}", tfKey, e); - } - return null; + return eventIds; } /** - * Add the excerpts to the document as part of {@value #HIT_EXCERPT}. + * Add the summaries to the document as part of {@value #CONTENT_SUMMARY}. 
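+     * <p>
+     * For example (illustrative), a document with one generated summary gains a {@code CONTENT_SUMMARY} attribute holding a single {@code Content} value
+     * containing up to {@code summarySize} characters of the decoded d-column text.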
     *
-     * @param excerpts
-     *            the excerpts to add
+     * @param summaries
+     *            the summaries to add
     * @param document
     *            the document
     */
-    private static void addExcerptsToDocument(Set<Excerpt> excerpts, Document document) {
-        Attributes attributesWithoutScores = new Attributes(true);
-        Attributes attributesWithScores = new Attributes(true);
-        Attributes attributesOneBest = new Attributes(true);
-
-        boolean hasScores = false;
-
-        for (Excerpt excerpt : excerpts) {
-            Content contentWithoutScores = new Content(excerpt.getExcerptWithoutScores(), excerpt.getSource(), true);
-            attributesWithoutScores.add(contentWithoutScores);
-
-            String excerptWithScores = excerpt.getExcerptWithScores();
-            if (excerptWithScores.isBlank() || excerptWithScores.equals(TermFrequencyExcerptIterator.NOT_SCORED_MARKER)) {
-                continue;
-            }
-
-            hasScores = true;
+    private static void addExcerptsToDocument(Set<Summary> summaries, Document document) {
+        Attributes summaryAttribute = new Attributes(true);

-            Content contentWithScores = new Content(excerptWithScores, excerpt.getSource(), true);
-            attributesWithScores.add(contentWithScores);
-            Content contentOneBest = new Content(excerpt.getExcerptOneBest(), excerpt.getSource(), true);
-            attributesOneBest.add(contentOneBest);
+        for (Summary summary : summaries) {
+            Content contentSummary = new Content(summary.getSummary(), summary.getSource(), true);
+            summaryAttribute.add(contentSummary);
        }

-        document.put(HIT_EXCERPT, attributesWithoutScores);
-        if (hasScores) {
-            document.put(HIT_EXCERPT_WITH_SCORES, attributesWithScores);
-            document.put(HIT_EXCERPT_ONE_BEST, attributesOneBest);
-        }
+        document.put(CONTENT_SUMMARY, summaryAttribute);
    }

    /**
-     * Get the excerpts.
+     * Get the summaries.
     *
-     * @param phraseIndexes
-     *            the pre-identified phrase offsets
+     * @param documentKeys
+     *            the pre-identified document keys
-     * @return the excerpts
+     * @return the summaries
     */
-    private Set<Excerpt> getExcerpts(PhraseIndexes phraseIndexes) {
-        final PhraseIndexes offsetPhraseIndexes = getOffsetPhraseIndexes(phraseIndexes, excerptFields);
-        if (offsetPhraseIndexes.isEmpty()) {
+    private Set<Summary> getExcerpts(final ArrayList<DocumentKey> documentKeys) {
+        if (documentKeys.isEmpty()) {
            return Collections.emptySet();
        }

-        // Fetch the excerpts.
-        Set<Excerpt> excerpts = new HashSet<>();
-        for (String field : offsetPhraseIndexes.getFields()) {
-            Collection<PhraseOffset> indexes = offsetPhraseIndexes.getPhraseOffsets(field);
-            for (PhraseOffset phraseOffset : indexes) {
-                String eventId = phraseOffset.getEventId();
-                int start = phraseOffset.getStartOffset();
-                int end = phraseOffset.getEndOffset();
-                if (log.isTraceEnabled()) {
-                    log.trace("Fetching excerpt [{},{}] for field {} for document {}", start, end, field, eventId.replace('\u0000', '/'));
-                }
+        // Fetch the summaries.
+        Set<Summary> summaries = new HashSet<>();
+        for (DocumentKey documentKey : documentKeys) {
+            if (log.isTraceEnabled()) {
+                log.trace("Fetching summary for document {}",
+                                documentKey.getShardId() + Constants.NULL + documentKey.getDataType() + Constants.NULL + documentKey.getUid());
+            }

            // Construct the required range for this document.
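            // (row = shardId, column family = dataType\0uid, so the range covers exactly one event; the end key is the key immediately following that pair)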
+ Key startKey = new Key(documentKey.getShardId(), documentKey.getDataType() + Constants.NULL + documentKey.getUid()); + Key endKey = startKey.followingKey(PartialKey.ROW_COLFAM); + Range range = new Range(startKey, true, endKey, false); - Excerpt excerpt = getExcerpt(field, start, end, range, hitTermValues); - // Only retain non-blank excerpts. - if (!excerpt.isEmpty()) { - excerpts.add(excerpt); - } else { - if (log.isTraceEnabled()) { - log.trace("Failed to find excerpt [{},{}] for field {} for document {}", start, end, field, eventId.replace('\u0000', '/')); - } + Summary summary = getSummary(range, summarySize); + // Only retain non-blank summaries. + if (!summary.isEmpty()) { + summaries.add(summary); + } else { + if (log.isTraceEnabled()) { + log.trace("Failed to find summary for document {}", + documentKey.getShardId() + Constants.NULL + documentKey.getDataType() + Constants.NULL + documentKey.getUid()); } } } - return excerpts; + return summaries; } /** - * Get the excerpt for the specified field. + * Get the summary * - * @param field - * the field - * @param start - * the start index of the excerpt - * @param end - * the end index of the excerpt * @param range * the range to use when seeking - * @param hitTermValues - * the term values to match - * @return the excerpt + * @param summarySize + * the object with our summary specifications + * @return the summary */ - private Excerpt getExcerpt(String field, int start, int end, Range range, ArrayList hitTermValues) { - - final Map excerptIteratorOptions = new HashMap<>(); - excerptIteratorOptions.put(FIELD_NAME, field); - // We will attempt to create the excerpt we want up to two times. - // Currently, the only condition that will cause a second attempt is if we detect stop words in the TFs we scan. - // The main difference in the second attempt is that it runs with an expanded range to allow us to remove the - // stop words and still have a correctly sized excerpt - for (int attempt = 0; attempt <= 1; attempt++) { - // if this is the first attempt, set the start and end offsets using the passed in values - if (attempt == 0) { - excerptIteratorOptions.put(START_OFFSET, String.valueOf(start)); - excerptIteratorOptions.put(END_OFFSET, String.valueOf(end)); - } else { - // if this is the second attempt, set up the iterator with a larger range by adding/subtracting - // the start and end offsets by "expandedSize" - int expandedStart = Math.max(start - expandSize, 0); - int expandedEnd = end + expandSize; - excerptIteratorOptions.put(START_OFFSET, String.valueOf(expandedStart)); - excerptIteratorOptions.put(END_OFFSET, String.valueOf(expandedEnd)); - - if (log.isDebugEnabled()) { - log.debug("size of excerpt requested: {}", excerptFields.getOffset(field) * 2); - log.debug("original range is ({},{}) and the expanded range is ({},{})", start, end, expandedStart, expandedEnd); - } - } - - try { - // set all of our options for the iterator - summaryIterator.init(source, excerptIteratorOptions, env); - - // run the iterator - summaryIterator.seek(range, Collections.emptyList(), false); - - // if an excerpt is returned... - if (summaryIterator.hasTop()) { - // the excerpt will be in the column qualifier of the top key - Key topKey = summaryIterator.getTopKey(); - // The column qualifier is expected to be field\0phraseWithScores\0phraseWithoutScores\0oneBestExcerpt. 
- // split the column qualifier on null bytes to get the different parts - // we should have 4 parts after splitting the column qualifier on the null bytes - final String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL); - // if we don't have 4 parts after splitting the column qualifier... - if (parts.length != 4) { - if (attempt == 0) { // if this is the first attempt, try again - continue; - } - - // if this is the second attempt, log an error - if (log.isErrorEnabled()) { - log.error("{} returned top key with incorrectly-formatted column qualifier in key: {} when scanning for excerpt [{},{}] for field {} within range {} : parts= {}", - TermFrequencyExcerptIterator.class.getSimpleName(), topKey, start, end, field, range, Arrays.toString(parts)); - } - break; - } - - // if we have reached the limit of times to try, or we have no stop words removed - if (!parts[1].equals(TermFrequencyExcerptIterator.WORD_SKIPPED_MARKER)) { - // return just the excerpt parts - return new Excerpt(range.getStartKey(), parts[1], parts[2], parts[3]); - } - } else { // If no excerpt was returned on the first attempt, try again. If no excerpt was returned on the second attempt, log an error. - if (attempt == 1 && log.isErrorEnabled()) { - log.error("TermFrequencyExcerptIterator returned with hasTop() false: something went wrong in the iterator (or given bad parameters to run with)"); - log.error("The iterator options were: Field \"{}\" Range= {} StartOffset= {} EndOffset= {} HitTerms= {}", field, range, - excerptIteratorOptions.get(START_OFFSET), excerptIteratorOptions.get(END_OFFSET), hitTermValues); - break; - } - } - } catch (IOException e) { - throw new RuntimeException("Failed to scan for excerpt [" + start + "," + end + "] for field " + field + " within range " + range, e); - } + private Summary getSummary(Range range, SummarySize summarySize) { + // get the options out of the SummarySize object + final Map summaryIteratorOptions = new HashMap<>(); + summaryIteratorOptions.put(SUMMARY_SIZE, String.valueOf(summarySize.getSummarySize())); + if (!summarySize.isEmpty()) { + summaryIteratorOptions.put(CONTENT_NAMES, summarySize.contentNamesListToString()); } - // when working correctly, it should always return from inside the loop so if this is reached something went very wrong - return ERROR_EXCERPT; - } + summaryIteratorOptions.put(ONLY_SPECIFIED, String.valueOf(summarySize.onlyListedContents())); - /** - * Returned a filtered {@link PhraseIndexes} that only contains the fields for which excerpts are desired, with the indexes offset by the specified excerpt - * offset. - * - * @param phraseIndexes - * the original phrase indexes - * @param excerptFields - * the fields that we want excerpts for - * @return the filtered, offset phrase indexes - */ - private static PhraseIndexes getOffsetPhraseIndexes(PhraseIndexes phraseIndexes, ExcerptFields excerptFields) { - PhraseIndexes offsetPhraseIndexes = new PhraseIndexes(); - for (String field : excerptFields.getFields()) { - // Filter out phrases that are not in desired fields. - Collection indexes = phraseIndexes.getPhraseOffsets(field); - if (indexes != null) { - int offset = excerptFields.getOffset(field); - // Ensure the offset is modified to encompass the target excerpt range. - for (PhraseOffset indexPair : indexes) { - String eventId = indexPair.getEventId(); - int start = indexPair.getStartOffset() <= offset ? 
-    /**
-     * Returned a filtered {@link PhraseIndexes} that only contains the fields for which excerpts are desired, with the indexes offset by the specified excerpt
-     * offset.
-     *
-     * @param phraseIndexes
-     *            the original phrase indexes
-     * @param excerptFields
-     *            the fields that we want excerpts for
-     * @return the filtered, offset phrase indexes
-     */
-    private static PhraseIndexes getOffsetPhraseIndexes(PhraseIndexes phraseIndexes, ExcerptFields excerptFields) {
-        PhraseIndexes offsetPhraseIndexes = new PhraseIndexes();
-        for (String field : excerptFields.getFields()) {
-            // Filter out phrases that are not in desired fields.
-            Collection<PhraseOffset> indexes = phraseIndexes.getPhraseOffsets(field);
-            if (indexes != null) {
-                int offset = excerptFields.getOffset(field);
-                // Ensure the offset is modified to encompass the target excerpt range.
-                for (PhraseOffset indexPair : indexes) {
-                    String eventId = indexPair.getEventId();
-                    int start = indexPair.getStartOffset() <= offset ? 0 : indexPair.getStartOffset() - offset;
-                    int end = indexPair.getEndOffset() + offset + 1; // Add 1 here to offset the non-inclusive end of the range that will be used when scanning.
-                    offsetPhraseIndexes.addIndexTriplet(field, eventId, start, end);
-                }
-            }
-        }
-        return offsetPhraseIndexes;
-    }
-
-    /**
-     * Given a hit term attribute, return a ValueTuple representation which will give us the field and value parsed out.
-     *
-     * @param source
-     *            a hit term attribute
-     * @return A ValueTuple representation of the document hit-term attribute
-     */
-    private static ValueTuple attributeToHitTuple(Attribute<?> source) {
-        String hitTuple = String.valueOf(source.getData());
-        int index = hitTuple.indexOf(':');
-        String fieldName = hitTuple.substring(0, index);
-        String value = hitTuple.substring(index + 1);
-        return new ValueTuple(fieldName, value, value, source);
-    }
-
-    /**
-     * Given an event ID, return the document Key
-     *
-     * @param eventId
-     *            eventId string
-     * @return the document Key
-     */
-    private static Key eventIdToKey(String eventId) {
-        if (eventId != null) {
-            int split = eventId.indexOf('\u0000');
-            if (split < 0) {
-                throw new IllegalStateException("Malformed eventId (expected a null separator): " + eventId);
-            }
-            return new Key(eventId.substring(0, split), eventId.substring(split + 1));
-        }
-        return null;
-    }
-
-    /**
-     * Given a document key, return the eventId
-     *
-     * @param docKey
-     *            document key
-     * @return the event id (shard\x00dt\x00uid)
-     */
-    private static String keyToEventId(Key docKey) {
-        return docKey != null ? docKey.getRow().toString() + '\u0000' + docKey.getColumnFamily().toString() : null;
-    }
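The range constructed at the top of this hunk bounds the scan to a single document. A minimal sketch of what it covers, with hypothetical shard, datatype, and uid values:

    Key start = new Key("20130101_0", "test" + Constants.NULL + "abc123");
    Key end = start.followingKey(PartialKey.ROW_COLFAM);
    Range r = new Range(start, true, end, false);
    // followingKey(PartialKey.ROW_COLFAM) increments the column family, so the
    // range spans every key whose row is "20130101_0" and whose column family is
    // exactly "test\x00abc123" -- all columns of that one document, including the
    // "d" (content) column that the summary iterator reads.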
     /**
-     * Add phrase excerpts to the documents from the given iterator.
+     * Add summaries to the documents from the given iterator.
      *
      * @param in
      *            the iterator source
@@ -454,31 +233,19 @@ public Iterator<Entry<Key,Document>> getIterator(final Iterator<Entry<Key,Document>> in) {

+        protected void runTestQuery(String queryString, Date startDate, Date endDate, Map<String,String> extraParms, Collection<String> goodResults)
+                        throws Exception {
+            super.runTestQuery(connector, queryString, startDate, endDate, extraParms, goodResults);
+        }
+    }
+
+    @RunWith(Arquillian.class)
+    public static class DocumentRange extends SummaryTest {
+        protected static AccumuloClient connector = null;
+
+        @BeforeClass
+        public static void setUp() throws Exception {
+
+            QueryTestTableHelper qtth = new QueryTestTableHelper(DocumentRange.class.toString(), log);
+            connector = qtth.client;
+
+            WiseGuysIngest.writeItAll(connector, WiseGuysIngest.WhatKindaRange.DOCUMENT);
+            Authorizations auths = new Authorizations("ALL");
+            PrintUtility.printTable(connector, auths, TableName.SHARD);
+            PrintUtility.printTable(connector, auths, TableName.SHARD_INDEX);
+            PrintUtility.printTable(connector, auths, QueryTestTableHelper.MODEL_TABLE_NAME);
+        }
+
+        @Override
+        protected void runTestQuery(String queryString, Date startDate, Date endDate, Map<String,String> extraParms, Collection<String> goodResults)
+                        throws Exception {
+            super.runTestQuery(connector, queryString, startDate, endDate, extraParms, goodResults);
+        }
+    }
+
+    private static final Logger log = Logger.getLogger(SummaryTest.class);
+
+    protected Authorizations auths = new Authorizations("ALL");
+
+    protected Set<Authorizations> authSet = Set.of(auths);
+
+    @Inject
+    @SpringBean(name = "EventQuery")
+    protected ShardQueryLogic logic;
+
+    protected KryoDocumentDeserializer deserializer;
+
+    private final DateFormat format = new SimpleDateFormat("yyyyMMdd");
+
+    @Deployment
+    public static JavaArchive createDeployment() throws Exception {
+
+        return ShrinkWrap.create(JavaArchive.class)
+                        .addPackages(true, "org.apache.deltaspike", "io.astefanutti.metrics.cdi", "datawave.query", "org.jboss.logging",
+                                        "datawave.webservice.query.result.event")
+                        .deleteClass(DefaultEdgeEventQueryLogic.class).deleteClass(RemoteEdgeDictionary.class)
+                        .deleteClass(datawave.query.metrics.QueryMetricQueryLogic.class)
+                        .addAsManifestResource(new StringAsset(
+                                        "<beans xmlns=\"http://xmlns.jcp.org/xml/ns/javaee\" version=\"1.1\" bean-discovery-mode=\"all\">" + "<alternatives>"
+                                                        + "<stereotype>datawave.query.tables.edge.MockAlternative</stereotype>" + "</alternatives>" + "</beans>"),
+                                        "beans.xml");
+    }
+
+    @AfterClass
+    public static void teardown() {
+        TypeRegistry.reset();
+    }
+
+    @Before
+    public void setup() {
+        TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
+        log.setLevel(Level.TRACE);
+        logic.setFullTableScanEnabled(true);
+        deserializer = new KryoDocumentDeserializer();
+    }
+
+    protected abstract void runTestQuery(String queryString, Date startDate, Date endDate, Map<String,String> extraParms, Collection<String> goodResults)
+                    throws Exception;
+
+    protected void runTestQuery(AccumuloClient connector, String queryString, Date startDate, Date endDate, Map<String,String> extraParms,
+                    Collection<String> goodResults) throws Exception {
+
+        QueryImpl settings = new QueryImpl();
+        settings.setBeginDate(startDate);
+        settings.setEndDate(endDate);
+        settings.setPagesize(Integer.MAX_VALUE);
+        settings.setQueryAuthorizations(auths.serialize());
+        settings.setQuery(queryString);
+        settings.setParameters(extraParms);
+        settings.setId(UUID.randomUUID());
+
+        log.debug("query: " + settings.getQuery());
+        log.debug("logic: " + settings.getQueryLogicName());
+
+        GenericQueryConfiguration config = logic.initialize(connector, settings, authSet);
+        logic.setupQuery(config);
+
+        Set<Document> docs = new HashSet<>();
+        Set<String> unexpectedFields = new HashSet<>();
+        for (Map.Entry<Key,Value> entry : logic) {
+            Document d = deserializer.apply(entry).getValue();
+            log.trace(entry.getKey() + " => " + d);
+            docs.add(d);
+            Map<String,Attribute<? extends Comparable<?>>> dictionary = d.getDictionary();
+
+            log.debug("dictionary:" + dictionary);
+            for (Map.Entry<String,Attribute<? extends Comparable<?>>> dictionaryEntry : dictionary.entrySet()) {
+
+                // skip expected generated fields
+                if (dictionaryEntry.getKey().equals(JexlEvaluation.HIT_TERM_FIELD) || dictionaryEntry.getKey().contains("ORIGINAL_COUNT")
+                                || dictionaryEntry.getKey().equals("RECORD_ID")) {
+                    continue;
+                }
+
+                Attribute<? extends Comparable<?>> attribute = dictionaryEntry.getValue();
+                if (attribute instanceof Attributes) {
+                    for (Attribute<? extends Comparable<?>> attr : ((Attributes) attribute).getAttributes()) {
+                        String toFind = dictionaryEntry.getKey() + ":" + attr;
+                        boolean found = goodResults.remove(toFind);
+                        if (found)
+                            log.debug("removed " + toFind);
+                        else {
+                            unexpectedFields.add(toFind);
+                        }
+                    }
+                } else {
+
+                    String toFind = dictionaryEntry.getKey() + ":" + dictionaryEntry.getValue();
+
+                    boolean found = goodResults.remove(toFind);
+                    if (found)
+                        log.debug("removed " + toFind);
+                    else {
+                        unexpectedFields.add(toFind);
+                    }
+                }
+
+            }
+        }
+
+        assertTrue("unexpected fields returned: " + unexpectedFields, unexpectedFields.isEmpty());
+        assertTrue(goodResults + " was not empty", goodResults.isEmpty());
+
+        assertFalse("No docs were returned!", docs.isEmpty());
+    }
+
+    @Test
+    public void testWithOnly() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(50/ONLY/CONTENT)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of("CONTENT_SUMMARY:You can get much farther with a kind word and a gu: : [] 9223372036854775807 false"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+    @Test
+    public void testWithoutOnly() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(50/CONTENT)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of("CONTENT_SUMMARY:You can get much farther with a kind word and a gu: : [] 9223372036854775807 false"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+    @Test
+    public void testSize() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(50)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of("CONTENT_SUMMARY:You can get much farther with a kind word and a gu: : [] 9223372036854775807 false"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+    @Test
+    public void testOverMaxSize() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(90000)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of(
+                        "CONTENT_SUMMARY:You can get much farther with a kind word and a gun than you can with a kind word alone: : [] 9223372036854775807 false"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+    @Test
+    public void testNegativeSize() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(-50)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of("CONTENT_SUMMARY:Y: : [] 9223372036854775807 false"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+    @Test
+    public void testNoContentFound() throws Exception {
+        Map<String,String> extraParameters = new HashMap<>();
+        extraParameters.put("include.grouping.context", "true");
+        extraParameters.put("return.fields", "CONTENT_SUMMARY");
+        extraParameters.put("query.syntax", "LUCENE");
+
+        String queryString = "QUOTE:(farther) #SUMMARY(50/ONLY/CANTFINDME,ORME)";
+
+        // not sure why the timestamp and delete flag are present
+        Set<String> goodResults = new HashSet<>(Set.of("CONTENT_SUMMARY:NO CONTENT FOUND TO SUMMARIZE"));
+
+        runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults);
+    }
+
+}
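Taken together, testSize, testOverMaxSize, and testNegativeSize pin down the size handling: 50 yields a 50-character summary, 90000 falls back to the full 87-character quote, and -50 yields a single character. That is consistent with clamping along the following lines; this is an inference from the tests, not code taken from the summary iterator:

    // hypothetical sketch of the observed size behavior
    static String summarize(String content, int requestedSize, int maxAllowedSize) {
        int effectiveSize = Math.max(1, Math.min(requestedSize, maxAllowedSize));
        return content.substring(0, Math.min(effectiveSize, content.length()));
    }
    // summarize(quote, 50, max)    -> 50 chars ("...and a gu")
    // summarize(quote, 90000, max) -> the full 87-char quote
    // summarize(quote, -50, max)   -> "Y"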
diff --git a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java
index 9ed431a66d9..115926276e6 100644
--- a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java
@@ -1,13 +1,20 @@
 package datawave.query.util;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.AbstractMap;
+import java.util.Base64;
 import java.util.Date;
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPOutputStream;
 
 import org.apache.accumulo.core.client.AccumuloClient;
 import org.apache.accumulo.core.client.BatchWriter;
 import org.apache.accumulo.core.client.BatchWriterConfig;
 import org.apache.accumulo.core.client.MutationsRejectedException;
+import org.apache.accumulo.core.data.Key;
 import org.apache.accumulo.core.data.Mutation;
 import org.apache.accumulo.core.data.Value;
 import org.apache.accumulo.core.iterators.user.SummingCombiner;
@@ -30,6 +37,7 @@
 import datawave.ingest.mapreduce.handler.shard.content.TermAndZone;
 import datawave.ingest.protobuf.TermWeight;
 import datawave.ingest.protobuf.Uid;
+import datawave.query.Constants;
 import datawave.query.QueryTestTableHelper;
 import datawave.util.TableName;
@@ -761,6 +769,10 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
             addFiTfTokens(bw, range, "QUOTE", "Im gonna make him an offer he cant refuse", corleoneUID);
             addFiTfTokens(bw, range, "QUOTE", "If you can quote the rules then you can obey them", sopranoUID);
             addFiTfTokens(bw, range, "QUOTE", "You can get much farther with a kind word and a gun than you can with a kind word alone", caponeUID);
+
+            addDColumn(datatype, corleoneUID, "CONTENT", "Im gonna make him an offer he cant refuse", bw);
+            addDColumn(datatype, sopranoUID, "CONTENT", "If you can quote the rules then you can obey them", bw);
+            addDColumn(datatype, caponeUID, "CONTENT", "You can get much farther with a kind word and a gun than you can with a kind word alone", bw);
         } finally {
             if (null != bw) {
                 bw.close();
@@ -1101,4 +1113,21 @@ private static void addFiTfTokens(BatchWriter bw, WhatKindaRange range, String f
         }
         bw.addMutation(fi);
     }
+
+    private static void addDColumn(String datatype, String uid, String contentName, String content, BatchWriter bw)
+                    throws IOException, MutationsRejectedException {
+        Mutation d = new Mutation(shard);
+
+        final ByteArrayOutputStream bos = new ByteArrayOutputStream(Math.max(content.getBytes().length / 2, 1024));
+        final OutputStream b64s = Base64.getEncoder().wrap(bos);
+        final GZIPOutputStream gzip = new GZIPOutputStream(b64s);
+        gzip.write(content.getBytes());
+        gzip.close();
+        b64s.close();
+        bos.close();
+        Value value = new Value(bos.toByteArray());
+
+        d.put("d", datatype + "\u0000" + uid + "\u0000" + contentName, columnVisibility, timeStamp, value);
+        bw.addMutation(d);
+    }
 }
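addDColumn stores the d-column value as base64(gzip(content)): the GZIP stream is wrapped in a Base64 encoder before reaching the byte buffer. For reference, a matching decoder sketch; readDColumn is illustrative and not part of the patch, and it additionally assumes java.io.ByteArrayInputStream and java.util.zip.GZIPInputStream imports:

    private static String readDColumn(Value value) throws IOException {
        // invert the Base64-wrapped GZIP stream used in addDColumn
        byte[] gzipped = Base64.getDecoder().decode(value.get());
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(gzipped));
                        ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] buf = new byte[1024];
            int n;
            while ((n = gzip.read(buf)) != -1) {
                bos.write(buf, 0, n);
            }
            return bos.toString();
        }
    }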
diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
index 7c043448b9e..c71910ce174 100644
--- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
+++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml
@@ -46,6 +46,7 @@
+
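For quick reference, the tests above exercise the three accepted forms of the #SUMMARY function. A hypothetical recap, written as the query strings the tests pass in:

    String sizeOnly  = "QUOTE:(farther) #SUMMARY(50)";              // size only
    String withNames = "QUOTE:(farther) #SUMMARY(50/CONTENT)";      // size plus content name(s)
    String onlyNames = "QUOTE:(farther) #SUMMARY(50/ONLY/CONTENT)"; // restrict to only the listed content name(s)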