Skip to content

Commit

Permalink
Add support for storing term vectors in FeatureField (#14034)
Browse files Browse the repository at this point in the history
This update introduces an option to store term vectors generated by the FeatureField.
With this option, term vectors can be used to access all features for each document.
  • Loading branch information
jimczi committed Dec 4, 2024
1 parent c9c631f commit 356a534
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ API Changes

New Features
---------------------
(No changes)
* GITHUB#14034: Add support for storing term vectors in FeatureField. (Jim Ferenczi)

Improvements
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,17 @@
public final class FeatureField extends Field {

private static final FieldType FIELD_TYPE = new FieldType();
private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType();

static {
  // Both shared field types index the feature name as a single untokenized term without norms,
  // keeping term frequencies (which is where the feature value is carried).
  for (FieldType type : new FieldType[] {FIELD_TYPE, FIELD_TYPE_STORE_TERM_VECTORS}) {
    type.setTokenized(false);
    type.setOmitNorms(true);
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  }

  // The only difference between the two types: this one also records term vectors.
  FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true);
}

private float featureValue;
Expand All @@ -123,7 +129,21 @@ public final class FeatureField extends Field {
* @param featureValue The value of the feature, must be a positive, finite, normal float.
*/
public FeatureField(String fieldName, String featureName, float featureValue) {
  // Term vectors are off by default. Delegate to the four-argument constructor so that the
  // field-type selection and the setFeatureValue call live in a single place. No explicit
  // super(...) call may appear here as well: Java permits only one explicit constructor
  // invocation, and it has to be the first statement.
  this(fieldName, featureName, featureValue, false);
}

/**
 * Creates a feature, optionally storing term vectors for it.
 *
 * @param fieldName The name of the field to store the information into. All features may be
 *     stored in the same field.
 * @param featureName The name of the feature, e.g. 'pagerank'. It will be indexed as a term.
 * @param featureValue The value of the feature, must be a positive, finite, normal float.
 * @param storeTermVectors Whether term vectors should be stored.
 */
public FeatureField(
    String fieldName, String featureName, float featureValue, boolean storeTermVectors) {
  super(fieldName, featureName, fieldTypeFor(storeTermVectors));
  setFeatureValue(featureValue);
}

/** Returns the shared field type that matches the requested term-vector setting. */
private static FieldType fieldTypeFor(boolean storeTermVectors) {
  if (storeTermVectors) {
    return FIELD_TYPE_STORE_TERM_VECTORS;
  }
  return FIELD_TYPE;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
*/
package org.apache.lucene.document;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;

import java.io.IOException;
import java.util.List;
import org.apache.lucene.document.Field.Store;
Expand All @@ -38,6 +41,7 @@
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.search.QueryUtils;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

public class TestFeatureField extends LuceneTestCase {
Expand Down Expand Up @@ -87,6 +91,9 @@ public void testBasics() throws Exception {
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);

var fieldInfo = context.reader().getFieldInfos().fieldInfo("features");
assertFalse(fieldInfo.hasTermVectors());

Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
Scorer s = w.scorer(context);
Expand Down Expand Up @@ -445,4 +452,77 @@ public void testBasicsNonScoringCase() throws IOException {
reader.close();
}
}

/**
 * Indexes feature fields created with {@code storeTermVectors = true} and checks that every
 * feature of a document can be read back, with its value, from that document's term vector.
 */
public void testStoreTermVectors() throws Exception {
  Directory directory = newDirectory();
  var writerConfig =
      newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean()));
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, writerConfig);

  // Two features sharing the same field, both opting in to term vectors.
  FeatureField pagerankFeature = new FeatureField("features", "pagerank", 1, true);
  FeatureField urlLenFeature = new FeatureField("features", "urlLen", 1, true);
  Document document = new Document();
  document.add(pagerankFeature);
  document.add(urlLenFeature);

  // Document 0.
  pagerankFeature.setFeatureValue(10);
  urlLenFeature.setFeatureValue(0.5f);
  indexWriter.addDocument(document);

  // Document 1 carries no features at all.
  indexWriter.addDocument(new Document()); // gap

  // Document 2 reuses the same field instances with new values.
  pagerankFeature.setFeatureValue(42);
  urlLenFeature.setFeatureValue(1.5f);
  indexWriter.addDocument(document);

  // Adding the same field without term vectors is expected to be rejected.
  document.clear();
  document.add(new FeatureField("features", "pagerank", 1, false));
  var mismatch = expectThrows(Exception.class, () -> indexWriter.addDocument(document));
  assertThat(mismatch.getMessage(), containsString("store term vector"));

  indexWriter.forceMerge(1);
  DirectoryReader indexReader = indexWriter.getReader();
  indexWriter.close();

  IndexSearcher indexSearcher = LuceneTestCase.newSearcher(indexReader);
  LeafReaderContext leaf = indexSearcher.getIndexReader().leaves().get(0);

  assertTrue(leaf.reader().getFieldInfos().fieldInfo("features").hasTermVectors());

  // Document 0: each term of the vector exposes exactly one posting (doc 0 of the vector),
  // and the feature value is decoded from its frequency.
  var firstDocEnum = leaf.reader().termVectors().get(0, "features").iterator();
  assertThat(firstDocEnum.next(), equalTo(new BytesRef("pagerank")));
  var docs = firstDocEnum.postings(null);
  assertThat(docs.nextDoc(), equalTo(0));
  assertThat(FeatureField.decodeFeatureValue(docs.freq()), equalTo(10f));
  assertThat(docs.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

  assertThat(firstDocEnum.next(), equalTo(new BytesRef("urlLen")));
  docs = firstDocEnum.postings(docs);
  assertThat(docs.nextDoc(), equalTo(0));
  assertThat(FeatureField.decodeFeatureValue(docs.freq()), equalTo(0.5f));
  assertThat(docs.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

  // Document 1 (the gap) has no term vector for the field.
  assertNull(leaf.reader().termVectors().get(1, "features"));

  // Document 2: same terms, updated values.
  var lastDocEnum = leaf.reader().termVectors().get(2, "features").iterator();
  assertThat(lastDocEnum.next(), equalTo(new BytesRef("pagerank")));
  docs = lastDocEnum.postings(docs);
  assertThat(docs.nextDoc(), equalTo(0));
  assertThat(FeatureField.decodeFeatureValue(docs.freq()), equalTo(42f));
  assertThat(docs.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

  assertThat(lastDocEnum.next(), equalTo(new BytesRef("urlLen")));
  docs = lastDocEnum.postings(null);
  assertThat(docs.nextDoc(), equalTo(0));
  assertThat(FeatureField.decodeFeatureValue(docs.freq()), equalTo(1.5f));
  assertThat(docs.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

  IOUtils.close(indexReader, directory);
}
}

0 comments on commit 356a534

Please sign in to comment.