Add vectorsearch training workload (opensearch-project#333)

* Add vectorsearch training workload Signed-off-by: Finn Roblin <finnrobl@amazon.com> * Addressed Vijay feedback and ignores error if model DNE Signed-off-by: Finn Roblin <finnrobl@amazon.com> * Added documentation to VS readme Signed-off-by: Finn Roblin <finnrobl@amazon.com> --------- Signed-off-by: Finn Roblin <finnrobl@amazon.com>
vpehkone · Jul 18, 2024 · 29d9715 · 29d9715
1 parent 392670d
commit 29d9715
Show file tree

Hide file tree

Showing 10 changed files with 444 additions and 2 deletions.
diff --git a/vectorsearch/README.md b/vectorsearch/README.md
@@ -189,6 +189,148 @@ either in-memory, or an external OpenSearch cluster.
 |                                                     error rate |       prod-queries |           0 |      % |
 
 
+---------------------------------
+[INFO] SUCCESS (took 119 seconds)
+---------------------------------
+
+```
+
+### Train Test
+
+This procedure benchmarks approximate k-NN search algorithms that require a training step. For example, the FAISS IVF method requires a training step to cluster vectors; search can then be performed against a smaller number of cluster centroids instead of the entire dataset.
+
+#### Parameters
+
+This workload allows the following parameters to be specified using `--workload-params`:
+
+| Name                                    | Description                                                                                  |
+|-----------------------------------------|----------------------------------------------------------------------------------------------|
+| target_index_name                       | Name of index to add vectors to                                                              |
+| target_field_name                       | Name of field to add vectors to                                                              |
+| target_index_body                       | Path to target index definition                                                              |
+| target_index_primary_shards             | Target index primary shards                                                                  |
+| target_index_replica_shards             | Target index replica shards                                                                  |
+| target_index_dimension                  | Dimension of target index                                                                    |
+| target_index_space_type                 | Target index space type                                                                      |
+| target_index_bulk_size                  | Target index bulk size                                                                       |
+| target_index_bulk_index_data_set_format | Format of vector data set                                                                    |
+| target_index_bulk_index_data_set_path   | Path to vector data set                                                                      |
+| target_index_bulk_index_data_set_corpus | Corpus name to vector data set                                                               |
+| target_index_bulk_indexing_clients      | Clients to be used for bulk ingestion (must be a divisor of data set size)                   |
+| target_index_max_num_segments           | Number of segments to merge target index down to before beginning search                     |
+| target_index_force_merge_timeout        | Timeout for force merge requests in seconds                                                  |
+| train_index_name                        | Name of index for training                                                                   |
+| train_field_name                        | Name of field for training                                                                   |
+| train_method_engine                     | Engine for training (e.g. "faiss")                                                           |
+| train_index_body                        | Path to train index definition                                                               |
+| train_index_primary_shards              | Train index primary shards                                                                   |
+| train_index_replica_shards              | Train index replica shards                                                                   |
+| train_index_bulk_size                   | Bulk size for train index                                                                    |
+| train_index_bulk_index_data_set_format  | Format of training data set                                                                  |
+| train_index_bulk_index_data_set_path    | Path to training data set                                                                    |
+| train_index_bulk_indexing_clients       | Clients to be used for bulk indexing                                                         |
+| train_index_num_vectors                 | Number of vectors in the training index                                                      |
+| train_model_id                          | ID of the training model                                                                     |
+| train_operation_retries                 | Number of retries for querying training operation to see if complete                         |
+| train_operation_poll_period             | Poll period for querying training operation in seconds                                       |
+| train_search_size                       | Number of results per [scroll query](http://opensearch.org/docs/latest/api-reference/scroll/)|
+| hnsw_ef_search                          | HNSW ef search parameter                                                                     |
+| hnsw_ef_construction                    | HNSW ef construction parameter                                                               |
+| id_field_name                           | Name of field that will be used to identify documents in an index                            |
+| hnsw_m                                  | HNSW m parameter                                                                             |
+| query_k                                 | The number of neighbors to return for the search                                             |
+| query_data_set_format                   | Format of vector data set for queries                                                        |
+| query_data_set_path                     | Path to vector data set for queries                                                          |
+| query_count                             | Number of queries for search operation                                                       |
+| query_body                              | JSON properties that will be merged with search body                                         |
+| search_clients                          | Number of clients to use for running queries                                                 |
+
+#### Sample Output
+
+The output of a sample test run is provided below. Metrics are captured in the result's data store as usual, and this can be configured to be 
+either in-memory, or an external OpenSearch cluster.
+
+```
+------------------------------------------------------
+    _______             __   _____
+   / ____(_)___  ____ _/ /  / ___/_________  ________
+  / /_  / / __ \/ __ `/ /   \__ \/ ___/ __ \/ ___/ _ \
+ / __/ / / / / / /_/ / /   ___/ / /__/ /_/ / /  /  __/
+/_/   /_/_/ /_/\__,_/_/   /____/\___/\____/_/   \___/
+------------------------------------------------------
+            
+|                                                         Metric |               Task |       Value |   Unit |
+|---------------------------------------------------------------:|-------------------:|------------:|-------:|
+|                     Cumulative indexing time of primary shards |                    |  0.00946667 |    min |
+|             Min cumulative indexing time across primary shards |                    |           0 |    min |
+|          Median cumulative indexing time across primary shards |                    |  0.00298333 |    min |
+|             Max cumulative indexing time across primary shards |                    |  0.00336667 |    min |
+|            Cumulative indexing throttle time of primary shards |                    |           0 |    min |
+|    Min cumulative indexing throttle time across primary shards |                    |           0 |    min |
+| Median cumulative indexing throttle time across primary shards |                    |           0 |    min |
+|    Max cumulative indexing throttle time across primary shards |                    |           0 |    min |
+|                        Cumulative merge time of primary shards |                    |           0 |    min |
+|                       Cumulative merge count of primary shards |                    |           0 |        |
+|                Min cumulative merge time across primary shards |                    |           0 |    min |
+|             Median cumulative merge time across primary shards |                    |           0 |    min |
+|                Max cumulative merge time across primary shards |                    |           0 |    min |
+|               Cumulative merge throttle time of primary shards |                    |           0 |    min |
+|       Min cumulative merge throttle time across primary shards |                    |           0 |    min |
+|    Median cumulative merge throttle time across primary shards |                    |           0 |    min |
+|       Max cumulative merge throttle time across primary shards |                    |           0 |    min |
+|                      Cumulative refresh time of primary shards |                    |  0.00861667 |    min |
+|                     Cumulative refresh count of primary shards |                    |          33 |        |
+|              Min cumulative refresh time across primary shards |                    |           0 |    min |
+|           Median cumulative refresh time across primary shards |                    |  0.00268333 |    min |
+|              Max cumulative refresh time across primary shards |                    |  0.00291667 |    min |
+|                        Cumulative flush time of primary shards |                    | 0.000183333 |    min |
+|                       Cumulative flush count of primary shards |                    |           2 |        |
+|                Min cumulative flush time across primary shards |                    |           0 |    min |
+|             Median cumulative flush time across primary shards |                    |           0 |    min |
+|                Max cumulative flush time across primary shards |                    | 0.000183333 |    min |
+|                                        Total Young Gen GC time |                    |       0.075 |      s |
+|                                       Total Young Gen GC count |                    |          17 |        |
+|                                          Total Old Gen GC time |                    |           0 |      s |
+|                                         Total Old Gen GC count |                    |           0 |        |
+|                                                     Store size |                    |  0.00869293 |     GB |
+|                                                  Translog size |                    | 2.56114e-07 |     GB |
+|                                         Heap used for segments |                    |           0 |     MB |
+|                                       Heap used for doc values |                    |           0 |     MB |
+|                                            Heap used for terms |                    |           0 |     MB |
+|                                            Heap used for norms |                    |           0 |     MB |
+|                                           Heap used for points |                    |           0 |     MB |
+|                                    Heap used for stored fields |                    |           0 |     MB |
+|                                                  Segment count |                    |           9 |        |
+|                                                 Min Throughput | custom-vector-bulk |       25527 | docs/s |
+|                                                Mean Throughput | custom-vector-bulk |       25527 | docs/s |
+|                                              Median Throughput | custom-vector-bulk |       25527 | docs/s |
+|                                                 Max Throughput | custom-vector-bulk |       25527 | docs/s |
+|                                        50th percentile latency | custom-vector-bulk |     36.3095 |     ms |
+|                                        90th percentile latency | custom-vector-bulk |     52.2662 |     ms |
+|                                       100th percentile latency | custom-vector-bulk |     68.6513 |     ms |
+|                                   50th percentile service time | custom-vector-bulk |     36.3095 |     ms |
+|                                   90th percentile service time | custom-vector-bulk |     52.2662 |     ms |
+|                                  100th percentile service time | custom-vector-bulk |     68.6513 |     ms |
+|                                                     error rate | custom-vector-bulk |           0 |      % |
+|                                                 Min Throughput |       prod-queries |      211.26 |  ops/s |
+|                                                Mean Throughput |       prod-queries |      213.85 |  ops/s |
+|                                              Median Throughput |       prod-queries |      213.48 |  ops/s |
+|                                                 Max Throughput |       prod-queries |      216.49 |  ops/s |
+|                                        50th percentile latency |       prod-queries |     3.43393 |     ms |
+|                                        90th percentile latency |       prod-queries |     4.01881 |     ms |
+|                                        99th percentile latency |       prod-queries |     5.56238 |     ms |
+|                                      99.9th percentile latency |       prod-queries |     9.95666 |     ms |
+|                                     99.99th percentile latency |       prod-queries |     39.7922 |     ms |
+|                                       100th percentile latency |       prod-queries |      62.415 |     ms |
+|                                   50th percentile service time |       prod-queries |     3.43405 |     ms |
+|                                   90th percentile service time |       prod-queries |      4.0191 |     ms |
+|                                   99th percentile service time |       prod-queries |     5.56316 |     ms |
+|                                 99.9th percentile service time |       prod-queries |     9.95666 |     ms |
+|                                99.99th percentile service time |       prod-queries |     39.7922 |     ms |
+|                                  100th percentile service time |       prod-queries |      62.415 |     ms |
+|                                                     error rate |       prod-queries |           0 |      % |
+
+
 ---------------------------------
 [INFO] SUCCESS (took 119 seconds)
 ---------------------------------

diff --git a/vectorsearch/indices/train-index.json b/vectorsearch/indices/train-index.json
@@ -0,0 +1,21 @@
+{
+    "settings": {
+      "index": {
+        {%- if train_index_primary_shards is defined and train_index_primary_shards %}
+        "number_of_shards": {{ train_index_primary_shards }}
+        {%- endif %}
+        {%- if train_index_replica_shards is defined %}
+        {#- Emit the separating comma only when number_of_shards was written above; otherwise the object would start with a comma and be invalid JSON. -#}{% if train_index_primary_shards is defined and train_index_primary_shards %},{% endif %}"number_of_replicas": {{ train_index_replica_shards }}
+        {%- endif %}
+      }
+    },
+    "mappings": {
+      "properties": {
+        "{{ train_field_name }}": {
+          "type": "knn_vector",
+          "dimension": {{ target_index_dimension }}
+        }
+      }
+    }
+}
+
diff --git a/vectorsearch/operations/default.json b/vectorsearch/operations/default.json
@@ -18,4 +18,14 @@
     "operation-type": "refresh",
     "retries": 100,
     "index": "{{ target_index_name | default('target_index') }}"
-}
+},
+{
+    "name": "refresh-train-index",
+    "operation-type": "refresh",
+    "retries": 100,
+    "index": "{{ train_index_name | default('train_index') }}"
+},
+{
+    "name": "delete-model",
+    "operation-type": "delete-knn-model"   
+}
diff --git a/vectorsearch/params/train/train-faiss-sift-128-l2-pq.json b/vectorsearch/params/train/train-faiss-sift-128-l2-pq.json
@@ -0,0 +1,48 @@
+{
+    "target_index_name": "target_index",
+    "target_field_name": "target_field",
+    "target_index_body": "indices/faiss-index.json",
+    "target_index_primary_shards": 1,
+    "target_index_dimension": 128,
+    "target_index_space_type": "l2",
+    "target_index_bulk_size": 100,
+    "target_index_bulk_index_data_set_format": "hdf5",
+    "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+    "target_index_bulk_indexing_clients": 10,
+
+    "train_index_name": "train_index",
+    "train_field_name": "train_field",
+    "train_method_engine": "faiss",
+    "train_index_body": "indices/train-index.json",
+    "train_index_primary_shards": 1,
+    "train_index_replica_shards": 1, 
+
+    "train_index_bulk_size": 100,
+    "train_index_bulk_index_data_set_format": "hdf5",
+    "train_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+    "train_index_bulk_indexing_clients": 10,
+    "train_index_num_vectors": 1000,
+
+    "train_model_id": "test-model",
+    "train_operation_retries": 100,
+    "train_operation_poll_period": 0.5,
+    "train_search_size": 10000,
+
+    "encoder": "pq",
+    "faiss_encoder_code_size": 2,
+    "faiss_encoder_m": 4,
+
+    "target_index_max_num_segments": 1,
+    "target_index_force_merge_timeout": 300,
+    "hnsw_ef_search": 100,
+    "hnsw_ef_construction": 100,
+    "query_k": 100,
+    "query_body": {
+         "docvalue_fields" : ["_id"],
+         "stored_fields" : "_none_"
+    },
+
+    "query_data_set_format": "hdf5",
+    "query_data_set_path":"/tmp/sift-128-euclidean.hdf5",
+    "query_count": 100
+  }
diff --git a/vectorsearch/params/train/train-faiss-sift-128-l2-sq.json b/vectorsearch/params/train/train-faiss-sift-128-l2-sq.json
@@ -0,0 +1,48 @@
+{
+    "target_index_name": "target_index",
+    "target_field_name": "target_field",
+    "target_index_body": "indices/faiss-index.json",
+    "target_index_primary_shards": 1,
+    "target_index_dimension": 128,
+    "target_index_space_type": "l2",
+    "target_index_bulk_size": 100,
+    "target_index_bulk_index_data_set_format": "hdf5",
+    "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+    "target_index_bulk_indexing_clients": 10,
+
+    "train_index_name": "train_index",
+    "train_field_name": "train_field",
+    "train_method_engine": "faiss",
+    "train_index_body": "indices/train-index.json",
+    "train_index_primary_shards": 1,
+    "train_index_replica_shards": 1, 
+
+    "train_index_bulk_size": 100,
+    "train_index_bulk_index_data_set_format": "hdf5",
+    "train_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+    "train_index_bulk_indexing_clients": 10,
+    "train_index_num_vectors": 1000,
+
+    "train_model_id": "test-model",
+    "train_operation_retries": 100,
+    "train_operation_poll_period": 0.5,
+    "train_search_size": 10000,
+
+    "encoder": "sq",
+    "faiss_encoder_type": "fp16",
+    "faiss_encoder_clip": false,
+
+    "target_index_max_num_segments": 1,
+    "target_index_force_merge_timeout": 300,
+    "hnsw_ef_search": 100,
+    "hnsw_ef_construction": 100,
+    "query_k": 100,
+    "query_body": {
+         "docvalue_fields" : ["_id"],
+         "stored_fields" : "_none_"
+    },
+
+    "query_data_set_format": "hdf5",
+    "query_data_set_path":"/tmp/sift-128-euclidean.hdf5",
+    "query_count": 100
+  }