From c367a7a95ced8598bccc2bb981598db3cdff38ff Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Mon, 25 Mar 2024 14:02:36 -0700 Subject: [PATCH] Add new procedure for AOSS (#243) AOSS doesn't allow users to pass a value to "_id". Hence, create a new procedure and param to support a user-defined field "id" as the unique field to identify a document in an index. Signed-off-by: Vijayan Balasubramanian --- vectorsearch/README.md | 10 +++++-- vectorsearch/indices/faiss-index.json | 5 ++++ vectorsearch/indices/lucene-index.json | 5 ++++ vectorsearch/indices/nmslib-index.json | 5 ++++ .../aoss/10million/faiss-cohere-768-dp.json | 26 ++++++++++++++++ .../aoss/10million/nmslib-cohere-768-dp.json | 28 +++++++++++++++++ .../aoss/1million/faiss-cohere-768-dp.json | 26 ++++++++++++++++ .../aoss/1million/nmslib-cohere-768-dp.json | 28 +++++++++++++++++ .../aoss/index-only-schedule.json | 30 +++++++++++++++++++ .../aoss/search-only-schedule.json | 20 +++++++++++++ vectorsearch/test_procedures/default.json | 9 ++++++ 11 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json create mode 100644 vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json create mode 100644 vectorsearch/test_procedures/aoss/index-only-schedule.json create mode 100644 vectorsearch/test_procedures/aoss/search-only-schedule.json diff --git a/vectorsearch/README.md b/vectorsearch/README.md index c8d57128..0842e690 100644 --- a/vectorsearch/README.md +++ b/vectorsearch/README.md @@ -50,16 +50,21 @@ for more details. This procedure is used to index only vector search index which requires no training. This will be useful if you are interested in benchmarking only indexing operation. 
-## Force Merge Index +### Force Merge Index This procedure is used to optimize vector search indices by performing force merge on an index, up to given maximum segments. For a large dataset, force merge is a costly operation. Hence, it is better to have separate procedure to trigger force merge occasionally based on user's requirement. -## Search +### Search This procedure is used to benchmark previously indexed vector search index. This will be useful if you want to benchmark large vector search index without indexing everytime since load time is substantial for a large dataset. This also contains warmup operation to avoid cold start problem during vector search. +### No Train Test AOSS + +This is similar to the no train test, except that it targets an Amazon OpenSearch Serverless Vector Search Collection. This procedure +does not contain operations like refresh and warm up since they are not supported by Vector Search Collections. + #### Parameters @@ -84,6 +89,7 @@ This workload allows the following parameters to be specified using `--workload- | target_index_force_merge_timeout | Timeout for of force merge requests in seconds | | hnsw_ef_search | HNSW ef search parameter | | hnsw_ef_construction | HNSW ef construction parameter | +| id_field_name | Name of the field used to uniquely identify documents in an index | | hnsw_m | HNSW m parameter | | query_k | The number of neighbors to return for the search | | query_data_set_format | Format of vector data set for queries | diff --git a/vectorsearch/indices/faiss-index.json b/vectorsearch/indices/faiss-index.json index 0f093d20..41632a1e 100644 --- a/vectorsearch/indices/faiss-index.json +++ b/vectorsearch/indices/faiss-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", "dimension": {{ target_index_dimension }}, diff --git 
a/vectorsearch/indices/lucene-index.json b/vectorsearch/indices/lucene-index.json index 041cc416..5d26f96f 100644 --- a/vectorsearch/indices/lucene-index.json +++ b/vectorsearch/indices/lucene-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", "dimension": {{ target_index_dimension }}, diff --git a/vectorsearch/indices/nmslib-index.json b/vectorsearch/indices/nmslib-index.json index d115e9f7..feccfe6e 100644 --- a/vectorsearch/indices/nmslib-index.json +++ b/vectorsearch/indices/nmslib-index.json @@ -16,6 +16,11 @@ "mappings": { "dynamic": "strict", "properties": { + {% if id_field_name is defined and id_field_name != "_id" %} + "{{id_field_name}}": { + "type": "keyword" + }, + {%- endif %} "target_field": { "type": "knn_vector", "dimension": {{ target_index_dimension }}, diff --git a/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json b/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json new file mode 100644 index 00000000..f55c0044 --- /dev/null +++ b/vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json @@ -0,0 +1,26 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-10m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-10m", + "query_count": 10000 + } diff --git 
a/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json b/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json new file mode 100644 index 00000000..168fbc38 --- /dev/null +++ b/vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json @@ -0,0 +1,28 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/nmslib-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-10m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-10m", + "neighbors_data_set_corpus": "cohere-10m", + "neighbors_data_set_format": "hdf5", + "query_count": 10000 + } diff --git a/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json b/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json new file mode 100644 index 00000000..773c080a --- /dev/null +++ b/vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json @@ -0,0 +1,26 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/faiss-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-1m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus": "cohere-1m", + "query_count": 
10000 + } diff --git a/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json b/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json new file mode 100644 index 00000000..fe790b68 --- /dev/null +++ b/vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json @@ -0,0 +1,28 @@ +{ + "target_index_name": "target_index", + "target_field_name": "target_field", + "target_index_body": "indices/nmslib-index.json", + "target_index_dimension": 768, + "target_index_space_type": "innerproduct", + "id_field_name": "id", + + "target_index_bulk_size": 100, + "target_index_bulk_index_data_set_format": "hdf5", + "target_index_bulk_index_data_set_corpus": "cohere-1m", + "target_index_bulk_indexing_clients": 10, + + "hnsw_ef_search": 256, + "hnsw_ef_construction": 256, + + "query_k": 100, + "query_body": { + "docvalue_fields" : ["id"], + "stored_fields" : "_none_" + }, + + "query_data_set_format": "hdf5", + "query_data_set_corpus":"cohere-1m", + "neighbors_data_set_corpus":"cohere-1m", + "neighbors_data_set_format":"hdf5", + "query_count": 10000 + } diff --git a/vectorsearch/test_procedures/aoss/index-only-schedule.json b/vectorsearch/test_procedures/aoss/index-only-schedule.json new file mode 100644 index 00000000..88afc12f --- /dev/null +++ b/vectorsearch/test_procedures/aoss/index-only-schedule.json @@ -0,0 +1,30 @@ +{ + "operation": { + "name": "delete-target-index", + "operation-type": "delete-index", + "only-if-exists": true, + "index": "{{ target_index_name | default('target_index') }}" + } +}, +{ + "operation": { + "name": "create-target-index", + "operation-type": "create-index", + "index": "{{ target_index_name | default('target_index') }}" + } +}, +{ + "operation": { + "name": "custom-vector-bulk", + "operation-type": "bulk-vector-data-set", + "index": "{{ target_index_name | default('target_index') }}", + "field": "{{ target_field_name | default('target_field') }}", + "bulk_size": {{ target_index_bulk_size | default(500)}}, + "data_set_format": "{{ 
target_index_bulk_index_data_set_format | default('hdf5') }}", + "data_set_path": "{{ target_index_bulk_index_data_set_path }}", + "data_set_corpus": "{{ target_index_bulk_index_data_set_corpus }}", + "num_vectors": {{ target_index_num_vectors | default(-1) }}, + "id-field-name": "{{ id_field_name }}" + }, + "clients": {{ target_index_bulk_indexing_clients | default(1)}} +} diff --git a/vectorsearch/test_procedures/aoss/search-only-schedule.json b/vectorsearch/test_procedures/aoss/search-only-schedule.json new file mode 100644 index 00000000..6c5107d0 --- /dev/null +++ b/vectorsearch/test_procedures/aoss/search-only-schedule.json @@ -0,0 +1,20 @@ +{ + "operation": { + "name": "prod-queries", + "operation-type": "vector-search", + "index": "{{ target_index_name | default('target_index') }}", + "detailed-results": true, + "k": {{ query_k | default(100) }}, + "field" : "{{ target_field_name | default('target_field') }}", + "data_set_format" : "{{ query_data_set_format | default('hdf5') }}", + "data_set_path" : "{{ query_data_set_path }}", + "data_set_corpus" : "{{ query_data_set_corpus }}", + "neighbors_data_set_path" : "{{ neighbors_data_set_path }}", + "neighbors_data_set_corpus" : "{{ neighbors_data_set_corpus }}", + "neighbors_data_set_format" : "{{ neighbors_data_set_format | default('hdf5') }}", + "num_vectors" : {{ query_count | default(-1) }}, + "id-field-name": "{{ id_field_name }}", + "body": {{ query_body | default ({}) | tojson }} + }, + "clients": {{ search_clients | default(1)}} +} diff --git a/vectorsearch/test_procedures/default.json b/vectorsearch/test_procedures/default.json index db42455b..122ce2ad 100644 --- a/vectorsearch/test_procedures/default.json +++ b/vectorsearch/test_procedures/default.json @@ -40,4 +40,13 @@ "schedule": [ {{ benchmark.collect(parts="common/force-merge-schedule.json") }} ] +}, +{ + "name": "no-train-test-aoss", + "description": "Index vector search which does not use an algorithm that requires training.", + "default": 
false, + "schedule": [ + {{ benchmark.collect(parts="aoss/index-only-schedule.json") }}, + {{ benchmark.collect(parts="aoss/search-only-schedule.json") }} + ] }