Skip to content

Commit

Permalink
Add new procedure for AOSS (opensearch-project#243)
Browse files Browse the repository at this point in the history
AOSS doesn't allow users to pass value to "_id".
Hence, create a new procedure and param to support user defined
field "id" as unique field to identify document from index.

Signed-off-by: Vijayan Balasubramanian <balasvij@amazon.com>
  • Loading branch information
VijayanB authored Mar 25, 2024
1 parent d234efa commit ddf643b
Show file tree
Hide file tree
Showing 11 changed files with 190 additions and 2 deletions.
10 changes: 8 additions & 2 deletions vectorsearch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,21 @@ for more details.
This procedure is used to index only vector search index which requires no training. This will be useful if
you are interested in benchmarking only indexing operation.

## Force Merge Index
### Force Merge Index
This procedure is used to optimize vector search indices by performing force merge on an index, up to given maximum segments.
For a large dataset, force merge is a costly operation. Hence, it is better to have separate procedure to trigger
force merge occasionally based on user's requirement.

## Search
### Search
This procedure is used to benchmark previously indexed vector search index. This will be useful if you want
to benchmark large vector search index without indexing every time since load time is substantial for a large dataset.
This also contains warmup operation to avoid cold start problem during vector search.

### No Train Test AOSS

This is similar to the No Train Test procedure, except that it targets Amazon OpenSearch Serverless (AOSS) Vector Search Collections. This procedure
does not contain operations like refresh and warm up, since they are not supported by Vector Search Collections.



#### Parameters
Expand All @@ -84,6 +89,7 @@ This workload allows the following parameters to be specified using `--workload-
| target_index_force_merge_timeout | Timeout for force merge requests in seconds |
| hnsw_ef_search | HNSW ef search parameter |
| hnsw_ef_construction | HNSW ef construction parameter |
| id_field_name | Name of field that will be used to identify documents in an index |
| hnsw_m | HNSW m parameter |
| query_k | The number of neighbors to return for the search |
| query_data_set_format | Format of vector data set for queries |
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/faiss-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/lucene-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/nmslib-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
26 changes: 26 additions & 0 deletions vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
  "target_index_name": "target_index",
  "target_field_name": "target_field",
  "target_index_body": "indices/faiss-index.json",
  "target_index_dimension": 768,
  "target_index_space_type": "innerproduct",
  "id_field_name": "id",

  "target_index_bulk_size": 100,
  "target_index_bulk_index_data_set_format": "hdf5",
  "target_index_bulk_index_data_set_corpus": "cohere-10m",
  "target_index_bulk_indexing_clients": 10,

  "hnsw_ef_search": 256,
  "hnsw_ef_construction": 256,

  "query_k": 100,
  "query_body": {
    "docvalue_fields": ["id"],
    "stored_fields": "_none_"
  },

  "query_data_set_format": "hdf5",
  "query_data_set_corpus": "cohere-10m",
  "neighbors_data_set_corpus": "cohere-10m",
  "neighbors_data_set_format": "hdf5",
  "query_count": 10000
}
28 changes: 28 additions & 0 deletions vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/nmslib-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-10m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus": "cohere-10m",
"neighbors_data_set_corpus": "cohere-10m",
"neighbors_data_set_format": "hdf5",
"query_count": 10000
}
26 changes: 26 additions & 0 deletions vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
  "target_index_name": "target_index",
  "target_field_name": "target_field",
  "target_index_body": "indices/faiss-index.json",
  "target_index_dimension": 768,
  "target_index_space_type": "innerproduct",
  "id_field_name": "id",

  "target_index_bulk_size": 100,
  "target_index_bulk_index_data_set_format": "hdf5",
  "target_index_bulk_index_data_set_corpus": "cohere-1m",
  "target_index_bulk_indexing_clients": 10,

  "hnsw_ef_search": 256,
  "hnsw_ef_construction": 256,

  "query_k": 100,
  "query_body": {
    "docvalue_fields": ["id"],
    "stored_fields": "_none_"
  },

  "query_data_set_format": "hdf5",
  "query_data_set_corpus": "cohere-1m",
  "neighbors_data_set_corpus": "cohere-1m",
  "neighbors_data_set_format": "hdf5",
  "query_count": 10000
}
28 changes: 28 additions & 0 deletions vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/nmslib-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-1m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus":"cohere-1m",
"neighbors_data_set_corpus":"cohere-1m",
"neighbors_data_set_format":"hdf5",
"query_count": 10000
}
30 changes: 30 additions & 0 deletions vectorsearch/test_procedures/aoss/index-only-schedule.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"operation": {
"name": "delete-target-index",
"operation-type": "delete-index",
"only-if-exists": true,
"index": "{{ target_index_name | default('target_index') }}"
}
},
{
"operation": {
"name": "create-target-index",
"operation-type": "create-index",
"index": "{{ target_index_name | default('target_index') }}"
}
},
{
"operation": {
"name": "custom-vector-bulk",
"operation-type": "bulk-vector-data-set",
"index": "{{ target_index_name | default('target_index') }}",
"field": "{{ target_field_name | default('target_field') }}",
"bulk_size": {{ target_index_bulk_size | default(500)}},
"data_set_format": "{{ target_index_bulk_index_data_set_format | default('hdf5') }}",
"data_set_path": "{{ target_index_bulk_index_data_set_path }}",
"data_set_corpus": "{{ target_index_bulk_index_data_set_corpus }}",
"num_vectors": {{ target_index_num_vectors | default(-1) }},
"id-field-name": "{{ id_field_name }}"
},
"clients": {{ target_index_bulk_indexing_clients | default(1)}}
}
20 changes: 20 additions & 0 deletions vectorsearch/test_procedures/aoss/search-only-schedule.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"operation": {
"name": "prod-queries",
"operation-type": "vector-search",
"index": "{{ target_index_name | default('target_index') }}",
"detailed-results": true,
"k": {{ query_k | default(100) }},
"field" : "{{ target_field_name | default('target_field') }}",
"data_set_format" : "{{ query_data_set_format | default('hdf5') }}",
"data_set_path" : "{{ query_data_set_path }}",
"data_set_corpus" : "{{ query_data_set_corpus }}",
"neighbors_data_set_path" : "{{ neighbors_data_set_path }}",
"neighbors_data_set_corpus" : "{{ neighbors_data_set_corpus }}",
"neighbors_data_set_format" : "{{ neighbors_data_set_format | default('hdf5') }}",
"num_vectors" : {{ query_count | default(-1) }},
"id-field-name": "{{ id_field_name }}",
"body": {{ query_body | default ({}) | tojson }}
},
"clients": {{ search_clients | default(1)}}
}
9 changes: 9 additions & 0 deletions vectorsearch/test_procedures/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,13 @@
"schedule": [
{{ benchmark.collect(parts="common/force-merge-schedule.json") }}
]
},
{
  "name": "no-train-test-aoss",
  "description": "Index and search a vector search index that requires no training, targeted at Amazon OpenSearch Serverless (AOSS) vector search collections; skips unsupported operations such as refresh and warmup.",
  "default": false,
  "schedule": [
    {{ benchmark.collect(parts="aoss/index-only-schedule.json") }},
    {{ benchmark.collect(parts="aoss/search-only-schedule.json") }}
  ]
}

0 comments on commit ddf643b

Please sign in to comment.