Fulltext performance improvement (#1234)
Fulltext performance improvement

- [x] Performance Improvement
yuzhichang authored May 22, 2024
1 parent 996fb7b commit 3c3b04c
Showing 6 changed files with 42 additions and 16 deletions.
27 changes: 21 additions & 6 deletions docs/references/benchmark.md
@@ -64,7 +64,7 @@ sudo mkdir -p /var/infinity && sudo chown -R $USER /var/infinity
docker run -d --name infinity -v /var/infinity/:/var/infinity --ulimit nofile=500000:500000 --network=host infiniflow/infinity:0.1.0
```

1. Run Benchmark:
4. Run Benchmark:

Drop the file cache before benchmarking query latency; one way to do this is sketched below.
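
A minimal sketch of clearing the Linux page cache (not part of this commit; it assumes a Linux host and must run as root):

```python
import subprocess

# Flush dirty pages to disk first, then ask the kernel to drop clean page,
# dentry, and inode caches. Writing "3" to drop_caches requires root.
subprocess.run(["sync"], check=True)
with open("/proc/sys/vm/drop_caches", "w") as f:
    f.write("3\n")
```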

@@ -96,7 +96,22 @@ options:
--dataset DATASET data set to benchmark, one of: all, gist, sift, geonames, enwiki
```

2. Navigate to the **results** folder to view the results and latency of each query.
The following commands are for the `infinity` engine and the `enwiki` dataset:

```bash
python run.py --generate --engine infinity --dataset enwiki
python run.py --import --engine infinity --dataset enwiki
python run.py --query --engine infinity --dataset enwiki
python run.py --query-express=16 --engine infinity --dataset enwiki
```

The following commands issue a single query so that you can compare results across engines:

```bash
curl -X GET "http://localhost:9200/elasticsearch_enwiki/_search" -H 'Content-Type: application/json' -d'{"size":10,"_source":"doctitle","query": {"match": { "body": "wraysbury istorijos" }}}'
psql -h 0.0.0.0 -p 5432 -c "SELECT doctitle, ROW_ID(), SCORE() FROM infinity_enwiki SEARCH MATCH TEXT ('body', 'wraysbury istorijos', 'topn=10;block_max=true');"
```

## Benchmark Results
### SIFT1M
@@ -130,10 +145,10 @@ options:
> - 33000000 documents
> - 100000 `OR` queries generated from the dataset. All terms are extracted from the dataset, and very rare (occurrence < 100) terms are excluded. The number of terms per query follows the weights `[0.03, 0.15, 0.25, 0.25, 0.15, 0.08, 0.04, 0.03, 0.02]`.
| | Time to insert & build index | Time to import & build index | P95 Latency(ms)| QPS (8 python clients) | Memory | vCPU |
| ----------------- | ---------------------------- | ---------------------------- | ---------------| -----------------------| --------| ----- |
| **Elasticsearch** | 2289 s | N/A | 14.75 | 1174 | 21.0GB | 10.0 |
| **Infinity** | 2321 s | 944 s | 3.51 | 3925 | 9.0GB | 4.2 |
| | Time to insert & build index | Time to import & build index | P95 Latency(ms)| QPS (16 python clients) | Memory | vCPU |
| ----------------- | ---------------------------- | ---------------------------- | ---------------| ------------------------| --------| ----- |
| **Elasticsearch** | 2289 s | N/A | 14.75 | 1340 | 21.0GB | 10.6 |
| **Infinity** | 2321 s | 2890 s | 1.86 | 12328 | 10.0GB | 11.0 |

---

7 changes: 7 additions & 0 deletions python/benchmark/clients/elasticsearch_client.py
@@ -263,6 +263,13 @@ def search(self) -> list[list[Any]]:
        return results

    def check_and_save_results(self, results: List[List[Any]]):
        if "result_path" in self.data:
            result_path = self.data["result_path"]
            with open(result_path, "w") as f:
                for result in results:
                    line = json.dumps(result)
                    f.write(line + "\n")
            logging.info("query_result_path: {0}".format(result_path))
        if "ground_truth_path" in self.data:
            ground_truth_path = self.data["ground_truth_path"]
            _, ext = os.path.splitext(ground_truth_path)
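
Both clients now dump per-query results to the `result_path` configured in the JSON configs further down. As a point of reference (not part of the commit), a small sketch of reading those JSON-lines files back and comparing the two engines' hits, assuming each line is one query's result list whose rows start with a document id and whose last element is the query latency:

```python
import json

def load_results(path: str) -> list[list]:
    """Read one result list per line from a *_result.jsonl file."""
    with open(path) as f:
        return [json.loads(line) for line in f]

# Paths taken from the elasticsearch_enwiki.json / infinity_enwiki.json configs below.
es_results = load_results("datasets/enwiki/elasticsearch_result.jsonl")
inf_results = load_results("datasets/enwiki/infinity_result.jsonl")
for i, (es, inf) in enumerate(zip(es_results, inf_results)):
    es_ids = {row[0] for row in es[:-1]}    # last element is assumed to be the latency
    inf_ids = {row[0] for row in inf[:-1]}
    print(f"query {i}: {len(es_ids & inf_ids)} ids returned by both engines")
```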
20 changes: 11 additions & 9 deletions python/benchmark/clients/infinity_client.py
@@ -238,6 +238,13 @@ def search(self) -> list[list[Any]]:
        return results

    def check_and_save_results(self, results: List[List[Any]]):
        if "result_path" in self.data:
            result_path = self.data["result_path"]
            with open(result_path, "w") as f:
                for result in results:
                    line = json.dumps(result)
                    f.write(line + "\n")
            logging.info("query_result_path: {0}".format(result_path))
        if "ground_truth_path" in self.data:
            ground_truth_path = self.data["ground_truth_path"]
            _, ext = os.path.splitext(ground_truth_path)
@@ -263,17 +270,12 @@ def check_and_save_results(self, results: List[List[Any]]):
                with open(ground_truth_path, "r") as f:
                    for i, line in enumerate(f):
                        expected_result = json.loads(line)
                        exp_ids = set(x[0] for x in expected_result[:-1])
                        result = results[i]
                        ids = set(x[0] for x in result[:-1])
                        precision = (
                            len(
                                ids.intersection(
                                    expected_result["expected_results"][
                                        : self.data["topK"]
                                    ]
                                )
                            )
                            / self.data["topK"]
                        precision = len(ids.intersection(exp_ids)) / self.data["topK"]
                        logging.info(
                            f"expected_ids: {exp_ids}, ids: {ids}, precision: {precision}"
                        )
                        precisions.append(precision)
                        latencies.append(result[-1])
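
A toy illustration (not from the repository) of the corrected precision computation above: the expected ids now come from the first field of each ground-truth row, mirroring how ids are taken from the actual result rows.

```python
# Hypothetical top-3 example; rows are (id, ...) tuples and the last list element
# holds the latency, matching the client code above.
expected_result = [("doc1",), ("doc2",), ("doc3",), 0.8]
result = [("doc2",), ("doc3",), ("doc9",), 1.2]
top_k = 3

exp_ids = set(x[0] for x in expected_result[:-1])
ids = set(x[0] for x in result[:-1])
precision = len(ids.intersection(exp_ids)) / top_k
print(precision)  # two of the three expected ids were retrieved -> 0.666...
```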
1 change: 1 addition & 0 deletions python/benchmark/configs/elasticsearch_enwiki.json
@@ -6,6 +6,7 @@
"data_path": "datasets/enwiki/enwiki.csv",
"insert_batch_size": 8192,
"query_path": "datasets/enwiki/operations.txt",
"result_path": "datasets/enwiki/elasticsearch_result.jsonl",
"mode": "fulltext",
"topK": 10,
"index": {
1 change: 1 addition & 0 deletions python/benchmark/configs/infinity_enwiki.json
@@ -6,6 +6,7 @@
"data_link": "http://192.168.200.183:8000/enwiki-20120502-lines-10.csv",
"insert_batch_size": 8192,
"query_path": "datasets/enwiki/operations.txt",
"result_path": "datasets/enwiki/infinity_result.jsonl",
"query_link": "to_be_set",
"mode": "fulltext",
"topK": 10,
2 changes: 1 addition & 1 deletion src/storage/invertedindex/column_index_reader.cpp
@@ -140,7 +140,7 @@ IndexReader TableIndexReaderCache::GetIndexReader(Txn *txn, TableEntry *self_tab
    std::scoped_lock lock(mutex_);
    assert(cache_ts_ <= first_known_update_ts_);
    assert(first_known_update_ts_ == MAX_TIMESTAMP || first_known_update_ts_ <= last_known_update_ts_);
    if (cache_ts_ != 0 && begin_ts >= cache_ts_ && begin_ts < first_known_update_ts_) [[likely]] {
    if (first_known_update_ts_ != 0 && begin_ts >= cache_ts_ && begin_ts < first_known_update_ts_) [[likely]] {
        // no need to build, use cache
        result.column_index_readers_ = cache_column_readers_;
        result.column2analyzer_ = column2analyzer_;
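
For readability, a schematic model of the cache-reuse condition after this change, written in Python with hypothetical names mirroring the C++ members (not the actual implementation): a transaction may reuse the cached index readers only while its begin timestamp lies in the window covered by the cached snapshot.

```python
def can_use_cached_reader(begin_ts: int, cache_ts: int, first_known_update_ts: int) -> bool:
    # Before this commit the guard was `cache_ts != 0`; it is now `first_known_update_ts != 0`,
    # presumably so that a snapshot cached at timestamp 0 can still be reused while no
    # index update has been observed.
    return first_known_update_ts != 0 and cache_ts <= begin_ts < first_known_update_ts
```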