diff --git a/haystack/components/evaluators/document_map.py b/haystack/components/evaluators/document_map.py
index f18a0e675b..212edb86c1 100644
--- a/haystack/components/evaluators/document_map.py
+++ b/haystack/components/evaluators/document_map.py
@@ -43,6 +43,7 @@ class DocumentMAPEvaluator:
     ```
     """

+    # Refer to https://www.pinecone.io/learn/offline-evaluation/ for the algorithm.
     @component.output_types(score=float, individual_scores=List[float])
     def run(
         self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
@@ -68,25 +69,21 @@ def run(
         individual_scores = []

         for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
-            score = 0.0
-            for ground_document in ground_truth:
-                if ground_document.content is None:
-                    continue
-
-                average_precision = 0.0
-                relevant_documents = 0
+            average_precision = 0.0
+            average_precision_numerator = 0.0
+            relevant_documents = 0

-                for rank, retrieved_document in enumerate(retrieved):
-                    if retrieved_document.content is None:
-                        continue
-
-                    if ground_document.content in retrieved_document.content:
-                        relevant_documents += 1
-                        average_precision += relevant_documents / (rank + 1)
-                if relevant_documents > 0:
-                    score = average_precision / relevant_documents
-            individual_scores.append(score)
+            ground_truth_contents = [doc.content for doc in ground_truth if doc.content is not None]
+            for rank, retrieved_document in enumerate(retrieved):
+                if retrieved_document.content is None:
+                    continue

-        score = sum(individual_scores) / len(retrieved_documents)
+                if retrieved_document.content in ground_truth_contents:
+                    relevant_documents += 1
+                    average_precision_numerator += relevant_documents / (rank + 1)
+            if relevant_documents > 0:
+                average_precision = average_precision_numerator / relevant_documents
+            individual_scores.append(average_precision)
+        score = sum(individual_scores) / len(ground_truth_documents)

         return {"score": score, "individual_scores": individual_scores}
diff --git a/haystack/components/evaluators/document_mrr.py b/haystack/components/evaluators/document_mrr.py
index 64a98cc177..81caf0a418 100644
--- a/haystack/components/evaluators/document_mrr.py
+++ b/haystack/components/evaluators/document_mrr.py
@@ -41,6 +41,7 @@ class DocumentMRREvaluator:
     ```
     """

+    # Refer to https://www.pinecone.io/learn/offline-evaluation/ for the algorithm.
     @component.output_types(score=float, individual_scores=List[float])
     def run(
         self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
@@ -66,20 +67,17 @@ def run(
         individual_scores = []

         for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
-            score = 0.0
-            for ground_document in ground_truth:
-                if ground_document.content is None:
-                    continue
-
-                for rank, retrieved_document in enumerate(retrieved):
-                    if retrieved_document.content is None:
-                        continue
+            reciprocal_rank = 0.0

-                    if ground_document.content in retrieved_document.content:
-                        score = 1 / (rank + 1)
-                        break
-            individual_scores.append(score)
+            ground_truth_contents = [doc.content for doc in ground_truth if doc.content is not None]
+            for rank, retrieved_document in enumerate(retrieved):
+                if retrieved_document.content is None:
+                    continue
+                if retrieved_document.content in ground_truth_contents:
+                    reciprocal_rank = 1 / (rank + 1)
+                    break
+            individual_scores.append(reciprocal_rank)

-        score = sum(individual_scores) / len(retrieved_documents)
+        score = sum(individual_scores) / len(ground_truth_documents)

         return {"score": score, "individual_scores": individual_scores}
diff --git a/releasenotes/notes/fix-issue-7758-d35b687ca226a707.yaml b/releasenotes/notes/fix-issue-7758-d35b687ca226a707.yaml
new file mode 100644
index 0000000000..b1accdd2ac
--- /dev/null
+++ b/releasenotes/notes/fix-issue-7758-d35b687ca226a707.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fixed the calculation for MRR and MAP scores.
diff --git a/test/components/evaluators/test_document_map.py b/test/components/evaluators/test_document_map.py
index 9c481e3d6c..7c7b26c089 100644
--- a/test/components/evaluators/test_document_map.py
+++ b/test/components/evaluators/test_document_map.py
@@ -62,7 +62,17 @@ def test_run_with_complex_data():
             ],
         ],
     )
-    assert result == {"individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0], "score": 0.7222222222222222}
+    assert result == {
+        "individual_scores": [
+            1.0,
+            pytest.approx(0.8333333333333333),
+            1.0,
+            pytest.approx(0.5833333333333333),
+            0.0,
+            pytest.approx(0.8055555555555555),
+        ],
+        "score": pytest.approx(0.7037037037037037),
+    }


 def test_run_with_different_lengths():
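For reference, a minimal usage sketch of the two fixed evaluators on a toy query. The imports and the run() signature come from the diff above; the document contents and the printed scores are illustrative only and assume exact-content matching under the post-fix logic.

# Toy query: one relevant document, retrieved at rank 2.
from haystack import Document
from haystack.components.evaluators import DocumentMAPEvaluator, DocumentMRREvaluator

ground_truth = [[Document(content="Berlin")]]
retrieved = [[Document(content="Paris"), Document(content="Berlin")]]

map_result = DocumentMAPEvaluator().run(
    ground_truth_documents=ground_truth, retrieved_documents=retrieved
)
mrr_result = DocumentMRREvaluator().run(
    ground_truth_documents=ground_truth, retrieved_documents=retrieved
)

# With the corrected logic, the single relevant document sits at rank 2,
# so both the average precision and the reciprocal rank are 1/2.
print(map_result)  # {'score': 0.5, 'individual_scores': [0.5]}
print(mrr_result)  # {'score': 0.5, 'individual_scores': [0.5]}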