Skip to content

Commit

Permalink
Separate vector search recall functions
Browse files Browse the repository at this point in the history
Signed-off-by: Junqiu Lei <junqiu@amazon.com>
  • Loading branch information
junqiu-lei committed Jul 23, 2024
1 parent b35e63a commit 0f2ad7e
Showing 1 changed file with 68 additions and 26 deletions.
94 changes: 68 additions & 26 deletions osbenchmark/worker_coordinator/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,17 +1230,6 @@ async def _vector_search_query_with_recall(opensearch, params):
Perform vector search and report recall@k , recall@r and time taken to perform recall in ms as
meta object.
"""
result = {
"weight": 1,
"unit": "ops",
"success": True,
"recall@k": 0,
"recall@1": 0,
"recall@min_score": 0,
"recall@min_score_1": 0,
"recall@max_distance": 0,
"recall@max_distance_1": 0,
}

def _is_empty_search_results(content):
if content is None:
Expand All @@ -1263,14 +1252,42 @@ def _get_field_value(content, field_name):
return _get_field_value(content["_source"], field_name)
return None

def calculate_recall(predictions, neighbors, top_1_recall=False):
def calculate_topk_search_recall(predictions, neighbors, top_k):
"""
Calculates the recall by comparing top_k neighbors with predictions.
recall = Sum of matched neighbors from predictions / total number of neighbors from ground truth
Args:
predictions: list containing ids of results returned by OpenSearch.
neighbors: list containing ids of the actual neighbors for a set of queries
top_1_recall: boolean to calculate recall@1
top_k: number of top results to check from the neighbors and should be greater than zero
Returns:
Recall between predictions and top k neighbors from ground truth
"""
correct = 0.0
if neighbors is None:
self.logger.info("No neighbors are provided for recall calculation")
return 0.0
min_num_of_results = min(top_k, len(neighbors))
truth_set = neighbors[:min_num_of_results]
for j in range(min_num_of_results):
if j >= len(predictions):
self.logger.info("No more neighbors in prediction to compare against ground truth.\n"
"Total neighbors in prediction: [%d].\n"
"Total neighbors in ground truth: [%d]", len(predictions), min_num_of_results)
break
if predictions[j] in truth_set:
correct += 1.0

return correct / min_num_of_results

def calculate_radial_search_recall(predictions, neighbors, enable_top_1_recall=False):
"""
Calculates the recall by comparing max_distance/min_score threshold neighbors with predictions.
recall = Sum of matched neighbors from predictions / total number of neighbors from ground truth
Args:
predictions: list containing ids of results returned by OpenSearch.
neighbors: list containing ids of the actual neighbors for a set of queries
enable_top_1_recall: boolean to calculate recall@1
Returns:
Recall between predictions and top k neighbors from ground truth
"""
Expand All @@ -1287,7 +1304,7 @@ def calculate_recall(predictions, neighbors, top_1_recall=False):
self.logger.info("No neighbors are provided for recall calculation")
return 1

if top_1_recall:
if enable_top_1_recall:
min_num_of_results = 1

for j in range(min_num_of_results):
Expand All @@ -1304,6 +1321,28 @@ def calculate_recall(predictions, neighbors, top_1_recall=False):
doc_type = params.get("type")
response = await self._raw_search(opensearch, doc_type, index, body, request_params, headers=headers)

result = {
"weight": 1,
"unit": "ops",
"success": True,
}

if params.get("k"):
result.update({
"recall@k": 0,
"recall@1": 0
})
elif params.get("max_distance"):
result.update({
"recall@max_distance": 0,
"recall@max_distance_1": 0
})
elif params.get("min_score"):
result.update({
"recall@min_score": 0,
"recall@min_score_1": 0
})

recall_processing_start = time.perf_counter()
if detailed_results:
props = parse(response, ["hits.total", "hits.total.value", "hits.total.relation", "timed_out", "took"])
Expand Down Expand Up @@ -1331,19 +1370,22 @@ def calculate_recall(predictions, neighbors, top_1_recall=False):
continue
candidates.append(field_value)
neighbors_dataset = params["neighbors"]
recall_threshold = calculate_recall(candidates, neighbors_dataset)
recall_top_1 = calculate_recall(candidates, neighbors_dataset, True)
max_distance = params.get("max_distance")
min_score = params.get("min_score")
if min_score:
result.update({"recall@min_score": recall_threshold})
result.update({"recall@min_score_1": recall_top_1})
elif max_distance:
result.update({"recall@max_distance": recall_threshold})
result.update({"recall@max_distance_1": recall_top_1})
else:
result.update({"recall@k": recall_threshold})

if params.get("k"):
num_neighbors = params.get("k", 1)
recall_top_k = calculate_topk_search_recall(candidates, neighbors_dataset, num_neighbors)
recall_top_1 = calculate_topk_search_recall(candidates, neighbors_dataset, 1)
result.update({"recall@k": recall_top_k})
result.update({"recall@1": recall_top_1})
else:
recall_threshold = calculate_radial_search_recall(candidates, neighbors_dataset)
recall_top_1 = calculate_radial_search_recall(candidates, neighbors_dataset, True)
if params.get("min_score"):
result.update({"recall@min_score": recall_threshold})
result.update({"recall@min_score_1": recall_top_1})
elif params.get("max_distance"):
result.update({"recall@max_distance": recall_threshold})
result.update({"recall@max_distance_1": recall_top_1})

recall_processing_end = time.perf_counter()
recall_processing_time = convert.seconds_to_ms(recall_processing_end - recall_processing_start)
Expand Down

0 comments on commit 0f2ad7e

Please sign in to comment.