From de99b1d676d7de875ef1aeff2cd4eee80c945506 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 25 Mar 2024 21:03:05 +0000 Subject: [PATCH] Add new procedure for AOSS (#243) AOSS doesn't allow users to pass a value for "_id". Hence, create a new procedure and param to support a user-defined field "id" as the unique field that identifies a document in an index. Signed-off-by: Vijayan Balasubramanian (cherry picked from commit ddf643b4e6f8aa767e46720984acda59f261f416) Signed-off-by: github-actions[bot] --- vectorsearch/README.md | 10 +++++-- vectorsearch/indices/faiss-index.json | 5 ++++ vectorsearch/indices/lucene-index.json | 5 ++++ vectorsearch/indices/nmslib-index.json | 5 ++++ .../aoss/10million/faiss-cohere-768-dp.json | 26 ++++++++++++++++ .../aoss/10million/nmslib-cohere-768-dp.json | 28 +++++++++++++++++ .../aoss/1million/faiss-cohere-768-dp.json | 26 ++++++++++++++++ .../aoss/1million/nmslib-cohere-768-dp.json | 28 +++++++++++++++++ .../aoss/index-only-schedule.json | 30 +++++++++++++++++++ .../aoss/search-only-schedule.json | 20 +++++++++++++ vectorsearch/test_procedures/default.json | 9 ++++++ 11 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json create mode 100644 vectorsearch/test_procedures/aoss/index-only-schedule.json create mode 100644 vectorsearch/test_procedures/aoss/search-only-schedule.json diff --git a/vectorsearch/README.md b/vectorsearch/README.md index c8d57128..0842e690 100644 --- a/vectorsearch/README.md +++ b/vectorsearch/README.md @@ -50,16 +50,21 @@ for more details. This procedure is used to index only vector search index which requires no training. 
This will be useful if you are interested in benchmarking only indexing operation. -## Force Merge Index +### Force Merge Index This procedure is used to optimize vector search indices by performing force merge on an index, up to given maximum segments. For a large dataset, force merge is a costly operation. Hence, it is better to have separate procedure to trigger force merge occasionally based on user's requirement. -## Search +### Search This procedure is used to benchmark previously indexed vector search index. This will be useful if you want to benchmark large vector search index without indexing everytime since load time is substantial for a large dataset. This also contains warmup operation to avoid cold start problem during vector search. +### No Train Test AOSS + +This is similar to the No Train Test procedure, except that it targets an Amazon OpenSearch Serverless (AOSS) Vector Search Collection. This procedure +does not contain operations such as refresh and warmup, since they are not supported by Vector Search Collections. 
+ #### Parameters @@ -84,6 +89,7 @@ This workload allows the following parameters to be specified using `--workload- | target_index_force_merge_timeout | Timeout for of force merge requests in seconds | | hnsw_ef_search | HNSW ef search parameter | | hnsw_ef_construction | HNSW ef construction parameter | +| id_field_name | Name of field that will be used to identify documents in an index | | hnsw_m | HNSW m parameter | | query_k | The number of neighbors to return for the search | | query_data_set_format | Format of vector data set for queries | diff --git a/vectorsearch/indices/faiss-index.json b/vectorsearch/indices/faiss-index.json index 0f093d20..41632a1e 100644 --- a/vectorsearch/indices/faiss-index.json +++ b/vectorsearch/indices/faiss-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", "dimension": {{ target_index_dimension }}, diff --git a/vectorsearch/indices/lucene-index.json b/vectorsearch/indices/lucene-index.json index 041cc416..5d26f96f 100644 --- a/vectorsearch/indices/lucene-index.json +++ b/vectorsearch/indices/lucene-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", "dimension": {{ target_index_dimension }}, diff --git a/vectorsearch/indices/nmslib-index.json b/vectorsearch/indices/nmslib-index.json index d115e9f7..feccfe6e 100644 --- a/vectorsearch/indices/nmslib-index.json +++ b/vectorsearch/indices/nmslib-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", 
"dimension": {{ target_index_dimension }}, diff --git a/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json b/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json new file mode 100644 index 00000000..f55c0044 --- /dev/null +++ b/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json @@ -0,0 +1,26 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-10m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-10m", + "query_count": 10000 + } diff --git a/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json b/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json new file mode 100644 index 00000000..168fbc38 --- /dev/null +++ b/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json @@ -0,0 +1,28 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/nmslib-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-10m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-10m", + "neighbors_data_set_corpus": 
"cohere-10m", + "neighbors_data_set_format": "hdf5", + "query_count": 10000 + } diff --git a/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json b/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json new file mode 100644 index 00000000..773c080a --- /dev/null +++ b/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json @@ -0,0 +1,26 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-1m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-1m", + "query_count": 10000 + } diff --git a/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json b/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json new file mode 100644 index 00000000..fe790b68 --- /dev/null +++ b/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json @@ -0,0 +1,28 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/nmslib-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-1m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus":"cohere-1m", + 
"neighbors_data_set_corpus":"cohere-1m", + "neighbors_data_set_format":"hdf5", + "query_count": 10000 + } diff --git a/vectorsearch/test_procedures/aoss/index-only-schedule.json b/vectorsearch/test_procedures/aoss/index-only-schedule.json new file mode 100644 index 00000000..88afc12f --- /dev/null +++ b/vectorsearch/test_procedures/aoss/index-only-schedule.json @@ -0,0 +1,30 @@ +{ + "operation": { + "name": "delete-target-index", + "operation-type": "delete-index", + "only-if-exists": true, + "index": "{{ target_index_name | default('target_index') }}" + } +}, +{ + "operation": { + "name": "create-target-index", + "operation-type": "create-index", + "index": "{{ target_index_name | default('target_index') }}" + } +}, +{ + "operation": { + "name": "custom-vector-bulk", + "operation-type": "bulk-vector-data-set", + "index": "{{ target_index_name | default('target_index') }}", + "field": "{{ target_field_name | default('target_field') }}", + "bulk_size": {{ target_index_bulk_size | default(500)}}, + "data_set_format": "{{ target_index_bulk_index_data_set_format | default('hdf5') }}", + "data_set_path": "{{ target_index_bulk_index_data_set_path }}", + "data_set_corpus": "{{ target_index_bulk_index_data_set_corpus }}", + "num_vectors": {{ target_index_num_vectors | default(-1) }}, + "id-field-name": "{{ id_field_name }}" + }, + "clients": {{ target_index_bulk_indexing_clients | default(1)}} +} diff --git a/vectorsearch/test_procedures/aoss/search-only-schedule.json b/vectorsearch/test_procedures/aoss/search-only-schedule.json new file mode 100644 index 00000000..6c5107d0 --- /dev/null +++ b/vectorsearch/test_procedures/aoss/search-only-schedule.json @@ -0,0 +1,20 @@ +{ + "operation": { + "name": "prod-queries", + "operation-type": "vector-search", + "index": "{{ target_index_name | default('target_index') }}", + "detailed-results": true, + "k": {{ query_k | default(100) }}, + "field" : "{{ target_field_name | default('target_field') }}", + "data_set_format" : "{{ 
query_data_set_format | default('hdf5') }}", + "data_set_path" : "{{ query_data_set_path }}", + "data_set_corpus" : "{{ query_data_set_corpus }}", + "neighbors_data_set_path" : "{{ neighbors_data_set_path }}", + "neighbors_data_set_corpus" : "{{ neighbors_data_set_corpus }}", + "neighbors_data_set_format" : "{{ neighbors_data_set_format | default('hdf5') }}", + "num_vectors" : {{ query_count | default(-1) }}, + "id-field-name": "{{ id_field_name }}", + "body": {{ query_body | default ({}) | tojson }} + }, + "clients": {{ search_clients | default(1)}} +} diff --git a/vectorsearch/test_procedures/default.json b/vectorsearch/test_procedures/default.json index db400ea7..2d99dccb 100644 --- a/vectorsearch/test_procedures/default.json +++ b/vectorsearch/test_procedures/default.json @@ -31,4 +31,13 @@ "schedule": [ {{ benchmark.collect(parts="common/force-merge-schedule.json") }} ] +}, +{ + "name": "no-train-test-aoss", + "description": "Index vector search which does not use an algorithm that requires training.", + "default": false, + "schedule": [ + {{ benchmark.collect(parts="aoss/index-only-schedule.json") }}, + {{ benchmark.collect(parts="aoss/search-only-schedule.json") }} + ] }