Fix invalid break iterator highlighting on keyword field (elastic#49566)

By default the unified highlighter splits the input into passages using a sentence break iterator. However we don't check if the field is tokenized or not so `keyword` field also applies the break iterator even though they can only match on the entire content. This means that by default we'll split the content of a `keyword` field on sentence break if the requested number of fragments is set to a value different than 0 (default to 5). This commit changes this behavior to ignore the break iterator on non-tokenized fields (keyword) in order to always highlight the entire values. The number of requested fragments control the number of matched values are returned but the boundary_scanner_type is now ignored. Note that this is the behavior in 6x but some refactoring of the Lucene's highlighter exposed this bug in Elasticsearch 7x.
SivagurunathanV · Jan 21, 2020 · 9912f98 · 9912f98
1 parent ad60476
commit 9912f98
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 3 deletions.
diff --git a/...r/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/...r/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
@@ -67,7 +67,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset();
 
         List<Snippet> snippets = new ArrayList<>();
-        int numberOfFragments;
+        int numberOfFragments = field.fieldOptions().numberOfFragments();
         try {
 
             final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(), hitContext);
@@ -89,14 +89,16 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                         "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
                         "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
             }
-            if (field.fieldOptions().numberOfFragments() == 0) {
+            if (numberOfFragments == 0
+                    // non-tokenized fields should not use any break iterator (ignore boundaryScannerType)
+                    || fieldType.tokenized() == false) {
                 // we use a control char to separate values, which is the only char that the custom break iterator
                 // breaks the text on, so we don't lose the distinction between the different values of a field and we
                 // get back a snippet per value
                 CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
                     field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
-                numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
+                numberOfFragments = numberOfFragments == 0 ? fieldValues.size() : numberOfFragments;
             } else {
                 //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
                 BreakIterator bi = getBreakIterator(field);

diff --git a/.../src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/.../src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -120,6 +120,35 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class, MockAnalysisPlugin.class);
     }
 
+    public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOException {
+        XContentBuilder mappings = jsonBuilder();
+        mappings.startObject();
+        mappings.startObject("type")
+            .startObject("properties")
+                .startObject("tags")
+                    .field("type", "keyword")
+                .endObject()
+            .endObject().endObject();
+        mappings.endObject();
+        assertAcked(prepareCreate("test")
+            .addMapping("type", mappings));
+        client().prepareIndex("test").setId("1")
+            .setSource(jsonBuilder().startObject().array("tags", "foo bar", "foo bar", "foo bar", "foo baz").endObject())
+            .get();
+        client().prepareIndex("test").setId("2")
+            .setSource(jsonBuilder().startObject().array("tags", "foo baz", "foo baz", "foo baz", "foo bar").endObject())
+            .get();
+        refresh();
+
+        for (BoundaryScannerType scanner : BoundaryScannerType.values()) {
+            SearchResponse search = client().prepareSearch().setQuery(matchQuery("tags", "foo bar"))
+                .highlighter(new HighlightBuilder().field(new Field("tags")).numOfFragments(2).boundaryScannerType(scanner)).get();
+            assertHighlight(search, 0, "tags", 0, 2, equalTo("<em>foo bar</em>"));
+            assertHighlight(search, 0, "tags", 1, 2, equalTo("<em>foo bar</em>"));
+            assertHighlight(search, 1, "tags", 0, 1, equalTo("<em>foo bar</em>"));
+        }
+    }
+
     public void testHighlightingWithStoredKeyword() throws IOException {
         XContentBuilder mappings = jsonBuilder();
         mappings.startObject();