diff --git a/vectorsearch/README.md b/vectorsearch/README.md index 221c6f9f..89672c3e 100644 --- a/vectorsearch/README.md +++ b/vectorsearch/README.md @@ -189,6 +189,148 @@ either in-memory, or an external OpenSearch cluster. | error rate | prod-queries | 0 | % | +--------------------------------- +[INFO] SUCCESS (took 119 seconds) +--------------------------------- + +``` + +### Train Test + +This procedure benchmarks approximate k-NN search algorithms that require a training step. For example, the FAISS IVF method requires a training step to cluster vectors. Then search can be performed against a smaller number of cluster centroids instead of the entire dataset. + +#### Parameters + +This workload allows the following parameters to be specified using `--workload-params`: + +| Name | Description | +|-----------------------------------------|----------------------------------------------------------------------------------------------| +| target_index_name | Name of index to add vectors to | +| target_field_name | Name of field to add vectors to | +| target_index_body | Path to target index definition | +| target_index_primary_shards | Target index primary shards | +| target_index_replica_shards | Target index replica shards | +| target_index_dimension | Dimension of target index | +| target_index_space_type | Target index space type | +| target_index_bulk_size | Target index bulk size | +| target_index_bulk_index_data_set_format | Format of vector data set | +| target_index_bulk_index_data_set_path | Path to vector data set | +| target_index_bulk_index_data_set_corpus | Corpus name of vector data set | +| target_index_bulk_indexing_clients | Clients to be used for bulk ingestion (must be divisor of data set size) | +| target_index_max_num_segments | Number of segments to merge target index down to before beginning search | +| target_index_force_merge_timeout | Timeout of force merge requests in seconds | +| train_index_name | Name of index for training | +| 
train_field_name | Name of field for training | +| train_method_engine | Engine for training (e.g. "faiss") | +| train_index_body | Path to train index definition | +| train_index_primary_shards | Train index primary shards | +| train_index_replica_shards | Train index replica shards | +| train_index_bulk_size | Bulk size for train index | +| train_index_bulk_index_data_set_format | Format of training data set | +| train_index_bulk_index_data_set_path | Path to training data set | +| train_index_bulk_indexing_clients | Clients to be used for bulk indexing | +| train_index_num_vectors | Number of vectors in the training index | +| train_model_id | ID of the training model | +| train_operation_retries | Number of retries for querying training operation to see if it is complete | +| train_operation_poll_period | Poll period for querying training operation in seconds | +| train_search_size | Number of results per [scroll query](https://opensearch.org/docs/latest/api-reference/scroll/)| +| hnsw_ef_search | HNSW ef search parameter | +| hnsw_ef_construction | HNSW ef construction parameter | +| id_field_name | Name of field that will be used to identify documents in an index | +| hnsw_m | HNSW m parameter | +| query_k | The number of neighbors to return for the search | +| query_data_set_format | Format of vector data set for queries | +| query_data_set_path | Path to vector data set for queries | +| query_count | Number of queries for search operation | +| query_body | JSON properties that will be merged with search body | +| search_clients | Number of clients to use for running queries | + +#### Sample Output + +The output of a sample test run is provided below. Metrics are captured in the result's data store as usual, and this can be configured to be +either in-memory, or an external OpenSearch cluster. 
+ +``` +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 0.00946667 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 0.00298333 | min | +| Max cumulative indexing time across primary shards | | 0.00336667 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 0 | min | +| Cumulative merge count of primary shards | | 0 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0 | min | +| Max cumulative merge time across primary shards | | 0 | min | +| Cumulative merge throttle time of primary shards | | 0 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0 | min | +| Cumulative refresh time of primary shards | | 0.00861667 | min | +| Cumulative refresh count of primary shards | | 33 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 0.00268333 | min | +| Max cumulative refresh time across 
primary shards | | 0.00291667 | min | +| Cumulative flush time of primary shards | | 0.000183333 | min | +| Cumulative flush count of primary shards | | 2 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0 | min | +| Max cumulative flush time across primary shards | | 0.000183333 | min | +| Total Young Gen GC time | | 0.075 | s | +| Total Young Gen GC count | | 17 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 0.00869293 | GB | +| Translog size | | 2.56114e-07 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 9 | | +| Min Throughput | custom-vector-bulk | 25527 | docs/s | +| Mean Throughput | custom-vector-bulk | 25527 | docs/s | +| Median Throughput | custom-vector-bulk | 25527 | docs/s | +| Max Throughput | custom-vector-bulk | 25527 | docs/s | +| 50th percentile latency | custom-vector-bulk | 36.3095 | ms | +| 90th percentile latency | custom-vector-bulk | 52.2662 | ms | +| 100th percentile latency | custom-vector-bulk | 68.6513 | ms | +| 50th percentile service time | custom-vector-bulk | 36.3095 | ms | +| 90th percentile service time | custom-vector-bulk | 52.2662 | ms | +| 100th percentile service time | custom-vector-bulk | 68.6513 | ms | +| error rate | custom-vector-bulk | 0 | % | +| Min Throughput | prod-queries | 211.26 | ops/s | +| Mean Throughput | prod-queries | 213.85 | ops/s | +| Median Throughput | prod-queries | 213.48 | ops/s | +| Max Throughput | prod-queries | 216.49 | ops/s | +| 50th percentile latency | prod-queries | 3.43393 | ms | +| 90th percentile latency | prod-queries | 4.01881 | ms | +| 99th percentile latency | prod-queries | 5.56238 | ms | +| 99.9th percentile latency | prod-queries | 9.95666 | ms | +| 
99.99th percentile latency | prod-queries | 39.7922 | ms | +| 100th percentile latency | prod-queries | 62.415 | ms | +| 50th percentile service time | prod-queries | 3.43405 | ms | +| 90th percentile service time | prod-queries | 4.0191 | ms | +| 99th percentile service time | prod-queries | 5.56316 | ms | +| 99.9th percentile service time | prod-queries | 9.95666 | ms | +| 99.99th percentile service time | prod-queries | 39.7922 | ms | +| 100th percentile service time | prod-queries | 62.415 | ms | +| error rate | prod-queries | 0 | % | + + --------------------------------- [INFO] SUCCESS (took 119 seconds) --------------------------------- diff --git a/vectorsearch/indices/train-index.json b/vectorsearch/indices/train-index.json new file mode 100644 index 00000000..ea499daf --- /dev/null +++ b/vectorsearch/indices/train-index.json @@ -0,0 +1,21 @@ +{ + "settings": { + "index": { + {%- if train_index_primary_shards is defined and train_index_primary_shards %} + "number_of_shards": {{ train_index_primary_shards }} + {%- endif %} + {%- if train_index_replica_shards is defined %} + ,"number_of_replicas": {{ train_index_replica_shards }} + {%- endif %} + } + }, + "mappings": { + "properties": { + "{{ train_field_name }}": { + "type": "knn_vector", + "dimension": {{ target_index_dimension }} + } + } + } + } + \ No newline at end of file diff --git a/vectorsearch/operations/default.json b/vectorsearch/operations/default.json index 88b9ca44..ffa70b65 100644 --- a/vectorsearch/operations/default.json +++ b/vectorsearch/operations/default.json @@ -18,4 +18,14 @@ "operation-type": "refresh", "retries": 100, "index": "{{ target_index_name | default('target_index') }}" -} +}, +{ + "name": "refresh-train-index", + "operation-type": "refresh", + "retries": 100, + "index": "{{ train_index_name | default('train_index') }}" +}, +{ + "name": "delete-model", + "operation-type": "delete-knn-model" +} \ No newline at end of file diff --git 
a/vectorsearch/params/train/train-faiss-sift-128-l2-pq.json b/vectorsearch/params/train/train-faiss-sift-128-l2-pq.json new file mode 100644 index 00000000..150bd639 --- /dev/null +++ b/vectorsearch/params/train/train-faiss-sift-128-l2-pq.json @@ -0,0 +1,48 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_primary_shards": 1, + "target_index_dimension": 128, + "target_index_space_type": "l2", + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "target_index_bulk_indexing_clients": 10, + + "train_index_name": "train_index", + "train_field_name": "train_field", + "train_method_engine": "faiss", + "train_index_body": "indices/train-index.json", + "train_index_primary_shards": 1, + "train_index_replica_shards": 1, + + "train_index_bulk_size": 100, + "train_index_bulk_index_data_set_format": "hdf5", + "train_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "train_index_bulk_indexing_clients": 10, + "train_index_num_vectors": 1000, + + "train_model_id": "test-model", + "train_operation_retries": 100, + "train_operation_poll_period": 0.5, + "train_search_size": 10000, + + "encoder": "pq", + "faiss_encoder_code_size": 2, + "faiss_encoder_m": 4, + + "target_index_max_num_segments": 1, + "target_index_force_merge_timeout": 300, + "hnsw_ef_search": 100, + "hnsw_ef_construction": 100, + "query_k": 100, + "query_body": { + "docvalue_fields" : ["_id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_path":"/tmp/sift-128-euclidean.hdf5", + "query_count": 100 + } diff --git a/vectorsearch/params/train/train-faiss-sift-128-l2-sq.json b/vectorsearch/params/train/train-faiss-sift-128-l2-sq.json new file mode 100644 index 00000000..1decd64f --- /dev/null +++ b/vectorsearch/params/train/train-faiss-sift-128-l2-sq.json 
@@ -0,0 +1,48 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_primary_shards": 1, + "target_index_dimension": 128, + "target_index_space_type": "l2", + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "target_index_bulk_indexing_clients": 10, + + "train_index_name": "train_index", + "train_field_name": "train_field", + "train_method_engine": "faiss", + "train_index_body": "indices/train-index.json", + "train_index_primary_shards": 1, + "train_index_replica_shards": 1, + + "train_index_bulk_size": 100, + "train_index_bulk_index_data_set_format": "hdf5", + "train_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "train_index_bulk_indexing_clients": 10, + "train_index_num_vectors": 1000, + + "train_model_id": "test-model", + "train_operation_retries": 100, + "train_operation_poll_period": 0.5, + "train_search_size": 10000, + + "encoder": "sq", + "faiss_encoder_type": "fp16", + "faiss_encoder_clip": false, + + "target_index_max_num_segments": 1, + "target_index_force_merge_timeout": 300, + "hnsw_ef_search": 100, + "hnsw_ef_construction": 100, + "query_k": 100, + "query_body": { + "docvalue_fields" : ["_id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_path":"/tmp/sift-128-euclidean.hdf5", + "query_count": 100 + } diff --git a/vectorsearch/params/train/train-faiss-sift-128-l2.json b/vectorsearch/params/train/train-faiss-sift-128-l2.json new file mode 100644 index 00000000..b7b39010 --- /dev/null +++ b/vectorsearch/params/train/train-faiss-sift-128-l2.json @@ -0,0 +1,44 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_primary_shards": 1, + "target_index_dimension": 128, + "target_index_space_type": 
"l2", + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "target_index_bulk_indexing_clients": 10, + + "train_index_name": "train_index", + "train_field_name": "train_field", + "train_method_engine": "faiss", + "train_index_body": "indices/train-index.json", + "train_index_primary_shards": 1, + "train_index_replica_shards": 1, + + "train_index_bulk_size": 100, + "train_index_bulk_index_data_set_format": "hdf5", + "train_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", + "train_index_bulk_indexing_clients": 10, + "train_index_num_vectors": 1000, + + "train_model_id": "test-model", + "train_operation_retries": 100, + "train_operation_poll_period": 0.5, + "train_search_size": 10000, + + "target_index_max_num_segments": 1, + "target_index_force_merge_timeout": 300, + "hnsw_ef_search": 100, + "hnsw_ef_construction": 100, + "query_k": 100, + "query_body": { + "docvalue_fields" : ["_id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_path":"/tmp/sift-128-euclidean.hdf5", + "query_count": 100 + } diff --git a/vectorsearch/test_procedures/common/train-index-only-schedule.json b/vectorsearch/test_procedures/common/train-index-only-schedule.json new file mode 100644 index 00000000..bbdb828a --- /dev/null +++ b/vectorsearch/test_procedures/common/train-index-only-schedule.json @@ -0,0 +1,34 @@ +{ + "operation": { + "name": "delete-train-index", + "operation-type": "delete-index", + "only-if-exists": true, + "index": "{{ train_index_name | default('train_index') }}" + } +}, +{ + "operation": { + "name": "create-train-index", + "operation-type": "create-index", + "index": "{{ train_index_name | default('train_index') }}" + } +}, +{ + "operation": { + "name": "custom-vector-bulk-train", + "operation-type": "bulk-vector-data-set", + "index": "{{ train_index_name | default('train_index') }}", + "field": "{{ 
train_field_name | default('train_field') }}", + "bulk_size": {{ train_index_bulk_size | default(500)}}, + "data_set_format": "{{ train_index_bulk_index_data_set_format | default('hdf5') }}", + "data_set_path": "{{ train_index_bulk_index_data_set_path }}", + "data_set_corpus": "{{ train_index_bulk_index_data_set_corpus }}", + "num_vectors": {{ train_index_num_vectors | default(-1) }}, + "id-field-name": "_id" + }, + "clients": {{ train_index_bulk_indexing_clients | default(1)}} +}, +{ + "name" : "refresh-train-index", + "operation" : "refresh-train-index" +} diff --git a/vectorsearch/test_procedures/common/train-model-schedule.json b/vectorsearch/test_procedures/common/train-model-schedule.json new file mode 100644 index 00000000..11b74174 --- /dev/null +++ b/vectorsearch/test_procedures/common/train-model-schedule.json @@ -0,0 +1,75 @@ +{ + "operation": { + "operation-type": "delete-knn-model", + "name": "delete-model", + "model_id": "{{ train_model_id }}", + "ignore-if-model-does-not-exist": true + } +}, +{ + "operation": { + "name": "train-knn-model", + "operation-type": "train-knn-model", + "body": { + "training_index": "{{ train_index_name | default('train_index') }}", + "training_field": "{{ train_field_name | default('train_field') }}", + "search_size": "{{ train_search_size | default(10000) }}", + "dimension": {{ target_index_dimension }}, + {%- if train_max_vector_count is defined and train_max_vector_count %} + "max_training_vector_count": "{{ train_max_vector_count }}", + {%- endif %} + "method": { + "name": "{{ train_method_name | default('ivf') }}", + "engine": "{{ train_method_engine | default('faiss') }}", + "space_type": "{{ target_index_space_type | default('l2') }}", + "parameters": { + {%- if faiss_nlist is defined and faiss_nlist %} + "nlist": {{ faiss_nlist }} + {%- endif %} + + {%- if faiss_nprobes is defined and faiss_nprobes %} + {%- if faiss_nlist is defined and faiss_nlist %} + , + {%- endif %} + "nprobes": {{ faiss_nprobes}} + {%- endif 
%} + + {%- if encoder is defined and encoder %} + {%- if faiss_nprobes is defined and faiss_nprobes %} + , + {%- endif %} + "encoder": { + "name": "{{ encoder }}", + "parameters": { + {%- if faiss_encoder_code_size is defined and faiss_encoder_code_size %} + "code_size": {{ faiss_encoder_code_size }} + {%- endif %} + + {%- if faiss_encoder_m is defined and faiss_encoder_m %} + {%- if faiss_encoder_code_size is defined and faiss_encoder_code_size %} + , + {%- endif %} + "m": {{ faiss_encoder_m }} + {%- endif %} + + {%- if faiss_encoder_type is defined and faiss_encoder_type %} + "type": "{{ faiss_encoder_type }}" + {%- endif %} + + {%- if faiss_encoder_clip is defined and faiss_encoder_clip %} + {%- if faiss_encoder_type is defined and faiss_encoder_type %} + , + {%- endif %} + "clip": "{{ faiss_encoder_clip }}" + {%- endif %} + } + } + {%- endif %} + } + } + }, + "model_id": "{{ train_model_id | default('train_model') }}", + "retries": {{ train_operation_retries | default(1000) }}, + "poll_period": {{ train_operation_poll_period | default(0.5) }} + } +} \ No newline at end of file diff --git a/vectorsearch/test_procedures/default.json b/vectorsearch/test_procedures/default.json index 21999078..3a492c2c 100644 --- a/vectorsearch/test_procedures/default.json +++ b/vectorsearch/test_procedures/default.json @@ -49,4 +49,15 @@ {{ benchmark.collect(parts="aoss/index-only-schedule.json") }}, {{ benchmark.collect(parts="aoss/search-only-schedule.json") }} ] -} +}, +{ + "name": "train-test", + "description": "Index vector search that requires training.", + "default": false, + "schedule": [ + {{ benchmark.collect(parts="common/train-index-only-schedule.json") }}, + {{ benchmark.collect(parts="common/index-only-schedule.json") }}, + {{ benchmark.collect(parts="common/train-model-schedule.json") }}, + {{ benchmark.collect(parts="common/search-only-schedule.json") }} + ] +} \ No newline at end of file diff --git a/vectorsearch/workload.json b/vectorsearch/workload.json index 
157ce142..c5f2940c 100644 --- a/vectorsearch/workload.json +++ b/vectorsearch/workload.json @@ -6,23 +6,30 @@ { "name": "{{ target_index_name }}", "body": "{{ target_index_body }}" + }, + { + "name": "{{ train_index_name }}", + "body": "{{ train_index_body }}" } ], "corpora": [ { "name": "cohere", "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings", + "target-index": "{{ target_index_name }}", "documents": [ { "source-file": "documents-1k.hdf5.bz2", "source-format": "hdf5", "document-count": 1000 + } ] }, { "name": "cohere-100k", "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings", + "target-index": "{{ target_index_name }}", "documents": [ { "source-file": "documents-100k.hdf5.bz2", @@ -34,6 +41,7 @@ { "name": "cohere-1m", "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings", + "target-index": "{{ target_index_name }}", "documents": [ { "source-file": "documents-1m.hdf5.bz2", @@ -45,6 +53,7 @@ { "name": "cohere-10m", "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings", + "target-index": "{{ target_index_name }}", "documents": [ { "source-file": "documents-10m.hdf5.bz2",