Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 3] Add new procedure for AOSS #257

Merged
merged 1 commit into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions vectorsearch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,21 @@ for more details.
This procedure only indexes a vector search index that requires no training. This is useful if
you are interested in benchmarking only the indexing operation.

## Force Merge Index
### Force Merge Index
This procedure is used to optimize vector search indices by performing a force merge on an index, up to a given maximum number of segments.
For a large dataset, force merge is a costly operation. Hence, it is better to have a separate procedure to trigger
force merge occasionally, based on the user's requirements.

## Search
### Search
This procedure is used to benchmark a previously indexed vector search index. This is useful if you want
to benchmark a large vector search index without indexing every time, since load time is substantial for a large dataset.
This also contains a warmup operation to avoid the cold start problem during vector search.

### No Train Test AOSS

This is similar to No Train Test, except that it is targeted at an Amazon OpenSearch Serverless Vector Search Collection. This procedure
does not contain operations like refresh and warm up, since they are not supported by Vector Search Collections.



#### Parameters
Expand All @@ -84,6 +89,7 @@ This workload allows the following parameters to be specified using `--workload-
| target_index_force_merge_timeout | Timeout for force merge requests in seconds |
| hnsw_ef_search | HNSW ef search parameter |
| hnsw_ef_construction | HNSW ef construction parameter |
| id_field_name | Name of field that will be used to identify documents in an index |
| hnsw_m | HNSW m parameter |
| query_k | The number of neighbors to return for the search |
| query_data_set_format | Format of vector data set for queries |
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/faiss-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/lucene-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
5 changes: 5 additions & 0 deletions vectorsearch/indices/nmslib-index.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mappings": {
"dynamic": "strict",
"properties": {
{% if id_field_name is defined and id_field_name != "_id" %}
"{{id_field_name}}": {
"type": "keyword"
},
{%- endif %}
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
Expand Down
26 changes: 26 additions & 0 deletions vectorsearch/params/aoss/10million/faiss-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/faiss-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-10m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus": "cohere-10m",
"query_count": 10000
}
28 changes: 28 additions & 0 deletions vectorsearch/params/aoss/10million/nmslib-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/nmslib-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-10m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus": "cohere-10m",
"neighbors_data_set_corpus": "cohere-10m",
"neighbors_data_set_format": "hdf5",
"query_count": 10000
}
26 changes: 26 additions & 0 deletions vectorsearch/params/aoss/1million/faiss-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/faiss-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-1m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus": "cohere-1m",
"query_count": 10000
}
28 changes: 28 additions & 0 deletions vectorsearch/params/aoss/1million/nmslib-cohere-768-dp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/nmslib-index.json",
"target_index_dimension": 768,
"target_index_space_type": "innerproduct",
"id_field_name": "id",

"target_index_bulk_size": 100,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_corpus": "cohere-1m",
"target_index_bulk_indexing_clients": 10,

"hnsw_ef_search": 256,
"hnsw_ef_construction": 256,

"query_k": 100,
"query_body": {
"docvalue_fields" : ["id"],
"stored_fields" : "_none_"
},

"query_data_set_format": "hdf5",
"query_data_set_corpus":"cohere-1m",
"neighbors_data_set_corpus":"cohere-1m",
"neighbors_data_set_format":"hdf5",
"query_count": 10000
}
30 changes: 30 additions & 0 deletions vectorsearch/test_procedures/aoss/index-only-schedule.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"operation": {
"name": "delete-target-index",
"operation-type": "delete-index",
"only-if-exists": true,
"index": "{{ target_index_name | default('target_index') }}"
}
},
{
"operation": {
"name": "create-target-index",
"operation-type": "create-index",
"index": "{{ target_index_name | default('target_index') }}"
}
},
{
"operation": {
"name": "custom-vector-bulk",
"operation-type": "bulk-vector-data-set",
"index": "{{ target_index_name | default('target_index') }}",
"field": "{{ target_field_name | default('target_field') }}",
"bulk_size": {{ target_index_bulk_size | default(500)}},
"data_set_format": "{{ target_index_bulk_index_data_set_format | default('hdf5') }}",
"data_set_path": "{{ target_index_bulk_index_data_set_path }}",
"data_set_corpus": "{{ target_index_bulk_index_data_set_corpus }}",
"num_vectors": {{ target_index_num_vectors | default(-1) }},
"id-field-name": "{{ id_field_name }}"
},
"clients": {{ target_index_bulk_indexing_clients | default(1)}}
}
20 changes: 20 additions & 0 deletions vectorsearch/test_procedures/aoss/search-only-schedule.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"operation": {
"name": "prod-queries",
"operation-type": "vector-search",
"index": "{{ target_index_name | default('target_index') }}",
"detailed-results": true,
"k": {{ query_k | default(100) }},
"field" : "{{ target_field_name | default('target_field') }}",
"data_set_format" : "{{ query_data_set_format | default('hdf5') }}",
"data_set_path" : "{{ query_data_set_path }}",
"data_set_corpus" : "{{ query_data_set_corpus }}",
"neighbors_data_set_path" : "{{ neighbors_data_set_path }}",
"neighbors_data_set_corpus" : "{{ neighbors_data_set_corpus }}",
"neighbors_data_set_format" : "{{ neighbors_data_set_format | default('hdf5') }}",
"num_vectors" : {{ query_count | default(-1) }},
"id-field-name": "{{ id_field_name }}",
"body": {{ query_body | default ({}) | tojson }}
},
"clients": {{ search_clients | default(1)}}
}
9 changes: 9 additions & 0 deletions vectorsearch/test_procedures/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,13 @@
"schedule": [
{{ benchmark.collect(parts="common/force-merge-schedule.json") }}
]
},
{
"name": "no-train-test-aoss",
"description": "Index vector search which does not use an algorithm that requires training.",
"default": false,
"schedule": [
{{ benchmark.collect(parts="aoss/index-only-schedule.json") }},
{{ benchmark.collect(parts="aoss/search-only-schedule.json") }}
]
}