Add the ability to set the number of hits to track accurately (#36357)

In Lucene 8 searches can skip non-competitive hits if the total hit count is not requested. It is also possible to track the number of hits up to a certain threshold. This is a trade off to speed up searches while still being able to know a lower bound of the total hit count. This change adds the ability to set this threshold directly in the track_total_hits search option. A boolean value (true, false) indicates whether the total hit count should be tracked in the response. When set as an integer this option allows to compute a lower bound of the total hits while preserving the ability to skip non-competitive hits when enough matches have been collected. Relates #33028
elastic · Jan 4, 2019 · e38cf1d · e38cf1d
1 parent ac4aecc
commit e38cf1d
Show file tree

Hide file tree

Showing 36 changed files with 573 additions and 148 deletions.
diff --git a/docs/reference/search/request-body.asciidoc b/docs/reference/search/request-body.asciidoc
@@ -189,6 +189,8 @@ include::request/from-size.asciidoc[]
 
 include::request/sort.asciidoc[]
 
+include::request/track-total-hits.asciidoc[]
+
 include::request/source-filtering.asciidoc[]
 
 include::request/stored-fields.asciidoc[]

diff --git a/docs/reference/search/request/track-total-hits.asciidoc b/docs/reference/search/request/track-total-hits.asciidoc
@@ -0,0 +1,176 @@
+[[search-request-track-total-hits]]
+=== Track total hits
+
+Generally the total hit count can't be computed accurately without visiting all
+matches, which is costly for queries that match lots of documents. The
+`track_total_hits` parameter allows you to control how the total number of hits
+should be tracked. When set to `true` the search response will always track the
+number of hits that match the query accurately (i.e. `total.relation` will always
+be equal to `"eq"` when `track_total_hits` is set to `true`).
+
+[source,js]
+--------------------------------------------------
+GET twitter/_search
+{
+    "track_total_hits": true,
+     "query": {
+        "match" : {
+            "message" : "Elasticsearch"
+        }
+     }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+// CONSOLE
+
+\... returns:
+
+[source,js]
+--------------------------------------------------
+{
+    "_shards": ...
+    "timed_out": false,
+    "took": 100,
+    "hits": {
+        "max_score": 1.0,
+        "total" : {
+            "value": 2048,    <1>
+            "relation": "eq"  <2>
+        },
+        "hits": ...
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/"_shards": \.\.\./"_shards": "$body._shards",/]
+// TESTRESPONSE[s/"took": 100/"took": $body.took/]
+// TESTRESPONSE[s/"max_score": 1\.0/"max_score": $body.hits.max_score/]
+// TESTRESPONSE[s/"value": 2048/"value": $body.hits.total.value/]
+// TESTRESPONSE[s/"hits": \.\.\./"hits": "$body.hits.hits"/]
+
+<1> The total number of hits that match the query.
+<2> The count is accurate (i.e. `"eq"` means equals).
+
+If you don't need to track the total number of hits you can improve query times
+by setting this option to `false`. In that case the search can efficiently skip
+non-competitive hits because it doesn't need to count all matches:
+
+[source,js]
+--------------------------------------------------
+GET twitter/_search
+{
+    "track_total_hits": false,
+     "query": {
+        "match" : {
+            "message" : "Elasticsearch"
+        }
+     }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+\... returns:
+
+[source,js]
+--------------------------------------------------
+{
+    "_shards": ...
+    "timed_out": false,
+    "took": 10,
+    "hits" : { <1>
+        "max_score": 1.0,
+        "hits": ...
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/"_shards": \.\.\./"_shards": "$body._shards",/]
+// TESTRESPONSE[s/"took": 10/"took": $body.took/]
+// TESTRESPONSE[s/"max_score": 1\.0/"max_score": $body.hits.max_score/]
+// TESTRESPONSE[s/"hits": \.\.\./"hits": "$body.hits.hits"/]
+
+<1> The total number of hits is unknown.
+
+Given that it is often enough to have a lower bound of the number of hits,
+such as "there are at least 1000 hits", it is also possible to set
+`track_total_hits` as an integer that represents the number of hits to count
+accurately. The search can efficiently skip non-competitive documents as soon
+as at least `track_total_hits` documents have been collected. This is a good
+trade-off to speed up searches if you don't need the accurate number of hits
+after a certain threshold.
+
+
+For instance the following query will accurately track the total number of
+hits that match the query, up to 100 documents:
+
+[source,js]
+--------------------------------------------------
+GET twitter/_search
+{
+    "track_total_hits": 100,
+     "query": {
+        "match" : {
+            "message" : "Elasticsearch"
+        }
+     }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+The `hits.total.relation` in the response will indicate if the
+value returned in `hits.total.value` is accurate (`eq`) or a lower
+bound of the total (`gte`).
+
+For instance the following response:
+
+[source,js]
+--------------------------------------------------
+{
+    "_shards": ...
+    "timed_out": false,
+    "took": 30,
+    "hits" : {
+        "max_score": 1.0,
+        "total" : {
+            "value": 42,         <1>
+            "relation": "eq"     <2>
+        },
+        "hits": ...
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/"_shards": \.\.\./"_shards": "$body._shards",/]
+// TESTRESPONSE[s/"took": 30/"took": $body.took/]
+// TESTRESPONSE[s/"max_score": 1\.0/"max_score": $body.hits.max_score/]
+// TESTRESPONSE[s/"value": 42/"value": $body.hits.total.value/]
+// TESTRESPONSE[s/"hits": \.\.\./"hits": "$body.hits.hits"/]
+
+<1> 42 documents match the query
+<2> and the count is accurate (`"eq"`)
+
+\... indicates that the number of hits returned in the `total`
+is accurate.
+
+If the total number of hits that match the query is greater than the
+value set in `track_total_hits`, the total hits in the response
+will indicate that the returned value is a lower bound:
+
+[source,js]
+--------------------------------------------------
+{
+    "_shards": ...
+    "hits" : {
+        "max_score": 1.0,
+        "total" : {
+            "value": 100,         <1>
+            "relation": "gte"     <2>
+        },
+        "hits": ...
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE
+// TEST[skip:response is already tested in the previous snippet]
+
+<1> There are at least 100 documents that match the query
+<2> This is a lower bound (`gte`).
diff --git a/docs/reference/search/uri-request.asciidoc b/docs/reference/search/uri-request.asciidoc
@@ -101,10 +101,12 @@ is important).
 |`track_scores` |When sorting, set to `true` in order to still track
 scores and return them as part of each hit.
 
-|`track_total_hits` |Set to `false` in order to disable the tracking
+|`track_total_hits` |Defaults to true. Set to `false` in order to disable the tracking
 of the total number of hits that match the query.
-(see <<index-modules-index-sorting,_Index Sorting_>> for more details).
-Defaults to true.
+It also accepts an integer which in this case represents the number of
+hits to count accurately.
+(See the <<search-request-track-total-hits, request body>> documentation
+for more details).
 
 |`timeout` |A search timeout, bounding the search request to be executed
 within the specified time value and bail with the hits accumulated up to

diff --git a/...stache/src/main/java/org/elasticsearch/script/mustache/RestMultiSearchTemplateAction.java b/...stache/src/main/java/org/elasticsearch/script/mustache/RestMultiSearchTemplateAction.java
@@ -49,7 +49,7 @@ public class RestMultiSearchTemplateAction extends BaseRestHandler {
 
     static {
         final Set<String> responseParams = new HashSet<>(
-            Arrays.asList(RestSearchAction.TYPED_KEYS_PARAM, RestSearchAction.TOTAL_HIT_AS_INT_PARAM)
+            Arrays.asList(RestSearchAction.TYPED_KEYS_PARAM, RestSearchAction.TOTAL_HITS_AS_INT_PARAM)
         );
         RESPONSE_PARAMS = Collections.unmodifiableSet(responseParams);
     }
@@ -103,6 +103,7 @@ public static MultiSearchTemplateRequest parseRequest(RestRequest restRequest, b
                     } else {
                         throw new IllegalArgumentException("Malformed search template");
                     }
+                    RestSearchAction.checkRestTotalHits(restRequest, searchRequest);
                 });
         return multiRequest;
     }

diff --git a/...ng-mustache/src/main/java/org/elasticsearch/script/mustache/RestSearchTemplateAction.java b/...ng-mustache/src/main/java/org/elasticsearch/script/mustache/RestSearchTemplateAction.java
@@ -43,7 +43,7 @@ public class RestSearchTemplateAction extends BaseRestHandler {
     private static final Set<String> RESPONSE_PARAMS;
 
     static {
-        final Set<String> responseParams = new HashSet<>(Arrays.asList(TYPED_KEYS_PARAM, RestSearchAction.TOTAL_HIT_AS_INT_PARAM));
+        final Set<String> responseParams = new HashSet<>(Arrays.asList(TYPED_KEYS_PARAM, RestSearchAction.TOTAL_HITS_AS_INT_PARAM));
         RESPONSE_PARAMS = Collections.unmodifiableSet(responseParams);
     }
 
@@ -77,6 +77,7 @@ public RestChannelConsumer prepareRequest(RestRequest request, NodeClient client
             searchTemplateRequest = SearchTemplateRequest.fromXContent(parser);
         }
         searchTemplateRequest.setRequest(searchRequest);
+        RestSearchAction.checkRestTotalHits(request, searchRequest);
 
         return channel -> client.execute(SearchTemplateAction.INSTANCE, searchTemplateRequest, new RestStatusToXContentListener<>(channel));
     }

diff --git a/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/IndexingIT.java b/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/IndexingIT.java
@@ -30,7 +30,7 @@
 import java.nio.charset.StandardCharsets;
 
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
-import static org.elasticsearch.rest.action.search.RestSearchAction.TOTAL_HIT_AS_INT_PARAM;
+import static org.elasticsearch.rest.action.search.RestSearchAction.TOTAL_HITS_AS_INT_PARAM;
 import static org.hamcrest.Matchers.equalTo;
 
 /**
@@ -158,7 +158,7 @@ private void bulk(String index, String valueSuffix, int count) throws IOExceptio
 
     private void assertCount(String index, int count) throws IOException {
         Request searchTestIndexRequest = new Request("POST", "/" + index + "/_search");
-        searchTestIndexRequest.addParameter(TOTAL_HIT_AS_INT_PARAM, "true");
+        searchTestIndexRequest.addParameter(TOTAL_HITS_AS_INT_PARAM, "true");
         searchTestIndexRequest.addParameter("filter_path", "hits.total");
         Response searchTestIndexResponse = client().performRequest(searchTestIndexRequest);
         assertEquals("{\"hits\":{\"total\":" + count + "}}",

diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/msearch/10_basic.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/msearch/10_basic.yml
@@ -115,11 +115,45 @@ setup:
           - query:
               match: {foo: foo}
 
-  - match:  { responses.0.hits.total.value:     2  }
+  - match:  { responses.0.hits.total.value:     2   }
   - match:  { responses.0.hits.total.relation:  eq  }
-  - match:  { responses.1.hits.total.value:     1  }
+  - match:  { responses.1.hits.total.value:     1   }
   - match:  { responses.1.hits.total.relation:  eq  }
-  - match:  { responses.2.hits.total.value:     1  }
+  - match:  { responses.2.hits.total.value:     1   }
   - match:  { responses.2.hits.total.relation:  eq  }
 
+  - do:
+      msearch:
+        body:
+        - index: index_*
+        - { query: { match: {foo: foo}}, track_total_hits: 1 }
+        - index: index_2
+        - query:
+            match_all: {}
+        - index: index_1
+        - query:
+            match: {foo: foo}
+
+  - match:  { responses.0.hits.total.value:     1    }
+  - match:  { responses.0.hits.total.relation:  gte  }
+  - match:  { responses.1.hits.total.value:     1    }
+  - match:  { responses.1.hits.total.relation:  eq   }
+  - match:  { responses.2.hits.total.value:     1    }
+  - match:  { responses.2.hits.total.relation:  eq   }
+
+  - do:
+      catch: /\[rest_total_hits_as_int\] cannot be used if the tracking of total hits is not accurate, got 10/
+      msearch:
+        rest_total_hits_as_int: true
+        body:
+          - index: index_*
+          - { query: { match_all: {}}, track_total_hits: 10}
+          - index: index_2
+          - query:
+              match_all: {}
+          - index: index_1
+          - query:
+              match: {foo: foo}
+
+