Add sem_score metric for generation evaluation (#155)
* add all-mpnet-base-v2 embedding model as default

* add all-mpnet-base-v2 to local model docs

* add sem score metric and test code for it

* add cast_metrics for processing metrics list or dict from yaml file.

* add cast_metrics at evaluate for new List[Dict] input type

* edit metrics type and add new metric sem_score to full.yaml

* add documentation about sem_score

* add api specification for new files

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
vkehfdl1 and jeffrey authored Feb 10, 2024
1 parent 671f3cb commit 882624e
Showing 15 changed files with 194 additions and 37 deletions.
1 change: 1 addition & 0 deletions autorag/__init__.py
@@ -24,6 +24,7 @@
# you can use your own model in this way.
'huggingface_baai_bge_small': HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
'huggingface_cointegrated_rubert_tiny2': HuggingFaceEmbedding(model_name="cointegrated/rubert-tiny2"),
'huggingface_all_mpnet_base_v2': HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
}

generator_models = {
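Below is a minimal usage sketch of the newly registered key (illustrative only, not part of this commit; it assumes `autorag` is installed and the `sentence-transformers/all-mpnet-base-v2` weights can be downloaded):

```python
# Hypothetical usage sketch: fetch the newly registered embedding model from
# the registry and embed a sentence with it.
from autorag import embedding_models

embed_model = embedding_models['huggingface_all_mpnet_base_v2']
vector = embed_model.get_text_embedding("AutoRAG evaluates RAG pipelines.")
print(len(vector))  # all-mpnet-base-v2 produces 768-dimensional vectors
```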
23 changes: 13 additions & 10 deletions autorag/evaluate/generation.py
@@ -1,16 +1,17 @@
import functools
import warnings
from typing import List, Callable
from typing import List, Callable, Union, Dict

import pandas as pd

from autorag.evaluate.metric.generation import bleu, meteor, rouge
from autorag.evaluate.metric.generation import bleu, meteor, rouge, sem_score
from autorag.evaluate.util import cast_metrics

GENERATION_METRIC_FUNC_DICT = {func.__name__: func for func in
[bleu, meteor, rouge]}
[bleu, meteor, rouge, sem_score]}


def evaluate_generation(generation_gt: List[List[str]], metrics: List[str]):
def evaluate_generation(generation_gt: List[List[str]], metrics: Union[List[str], List[Dict]]):
def decorator_evaluate_generation(func: Callable):
@functools.wraps(func)
def wrapper(*args, **kwargs) -> pd.DataFrame:
@@ -27,13 +27,15 @@ def wrapper(*args, **kwargs) -> pd.DataFrame:
raise ValueError("Input func must return string list as generated answer at the first return value.")

metric_scores = {}
for metric in metrics:
if metric not in GENERATION_METRIC_FUNC_DICT:
warnings.warn(f"metric {metric} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()}"
f"{metric} will be ignored.")
metric_names, metric_params = cast_metrics(metrics)

for metric_name, metric_param in zip(metric_names, metric_params):
if metric_name not in GENERATION_METRIC_FUNC_DICT:
warnings.warn(f"metric {metric_name} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()}"
f"{metric_name} will be ignored.")
else:
metric_scores[metric] = GENERATION_METRIC_FUNC_DICT[metric](
generation_gt=generation_gt, generations=generated_str)
metric_scores[metric_name] = GENERATION_METRIC_FUNC_DICT[metric_name](
generation_gt=generation_gt, generations=generated_str, **metric_param)

metric_result_df = pd.DataFrame(metric_scores)
execution_result_df = pd.DataFrame({
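For context, a hedged sketch of how the decorated evaluation can be driven with the new `List[Dict]` metric format, mirroring the prompt_maker usage later in this diff; the dummy data is illustrative, and the `openai` entry assumes OpenAI credentials are configured:

```python
# Illustrative only: wrap a function that returns generated strings and let the
# decorator compute bleu and sem_score. The sem_score entry resolves 'openai'
# to the registered OpenAIEmbedding instance via cast_metrics.
import pandas as pd
from autorag.evaluate.generation import evaluate_generation

generation_gt = [['The dog bit the man.'], ['It was not unexpected.']]

@evaluate_generation(generation_gt=generation_gt,
                     metrics=[{'metric_name': 'bleu'},
                              {'metric_name': 'sem_score', 'embedding_model': 'openai'}])
def run_generator(df: pd.DataFrame):
    # The wrapped function must return the generated strings as its first value.
    return df['generated_texts'].tolist()

result_df = run_generator(pd.DataFrame({
    'generated_texts': ['The dog bit the man.', 'It was quite expected.']}))
# result_df is expected to hold one row per generation with a column per metric.
```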
36 changes: 33 additions & 3 deletions autorag/evaluate/metric/generation.py
@@ -1,26 +1,31 @@
import functools
from typing import List
from typing import List, Optional

import evaluate
import pandas as pd
import sacrebleu
from llama_index.core.embeddings.base import BaseEmbedding

from autorag import embedding_models
from autorag.evaluate.metric.util import calculate_cosine_similarity


def generation_metric(func):
@functools.wraps(func)
def wrapper(generation_gt: List[List[str]], generations: List[str]) -> List[float]:
def wrapper(generation_gt: List[List[str]], generations: List[str], **kwargs) -> List[float]:
"""
Compute generation metric.
:param generation_gt: A list of ground truths.
Must be a 2-d list of strings,
because each generation can have multiple ground truths.
:param generations: A list of generations produced by the LLM.
:param kwargs: Additional keyword arguments passed to the metric function.
:return: A list of computed metric scores.
"""
# make generation_gt and generations to pd dataframe
df = pd.DataFrame({'gt': generation_gt, 'pred': generations})
df[func.__name__] = df.swifter.apply(lambda x: func(x['gt'], x['pred']), axis=1)
df[func.__name__] = df.swifter.apply(lambda x: func(x['gt'], x['pred'], **kwargs), axis=1)
return df[func.__name__].tolist()

return wrapper
@@ -82,3 +87,28 @@ def rouge(generation_gt: List[List[str]], generations: List[str]) -> List[float]
"""
rouge_instance = evaluate.load("rouge")
return huggingface_evaluate(rouge_instance, 'rougeL', generation_gt, generations)


@generation_metric
def sem_score(generation_gt: List[str], pred: str, embedding_model: Optional[BaseEmbedding] = None) -> float:
"""
Compute sem score between generation gt and pred with cosine similarity.
:param generation_gt: A list of ground truths.
Must be a list of strings.
The score is the maximum cosine similarity between generation_gt and pred.
:param pred: Model prediction.
:param embedding_model: Embedding model used to compute cosine similarity.
Defaults to the all-mpnet-base-v2 embedding model,
which is the embedding model used in the SemScore paper.
:return: Sem score between generation_gt and pred.
"""
if embedding_model is None:
embedding_model = embedding_models['huggingface_all_mpnet_base_v2']

gt_embeddings = embedding_model.get_text_embedding_batch(generation_gt)
pred_embedding = embedding_model.get_text_embedding(pred)

# calculate cosine similarity
similarity_scores: List[float] = list(map(lambda x: calculate_cosine_similarity(x, pred_embedding), gt_embeddings))
return max(similarity_scores)
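
A hedged, standalone example of calling the new metric directly; the ground-truth pair is borrowed from the tests at the end of this diff, and the prediction is illustrative:

```python
# sem_score is wrapped by @generation_metric, so it takes the 2-d ground-truth
# list plus the flat generation list; with no embedding_model it falls back to
# all-mpnet-base-v2 (weights are downloaded on first use).
from autorag.evaluate.metric.generation import sem_score

generation_gt = [['The dog had bit the man.', 'The man had bitten the dog.']]
generations = ['The dog bit the man.']

scores = sem_score(generation_gt=generation_gt, generations=generations)
print(scores)  # one max-cosine-similarity score per generation; identical text would score 1.0
```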
9 changes: 9 additions & 0 deletions autorag/evaluate/metric/util.py
@@ -0,0 +1,9 @@
import numpy as np


def calculate_cosine_similarity(a, b):
dot_product = np.dot(a, b)
norm_a = np.linalg.norm(a)
norm_b = np.linalg.norm(b)
similarity = dot_product / (norm_a * norm_b)
return similarity
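
A quick, illustrative sanity check of this helper with hand-checkable vectors:

```python
# dot = 1, |a| = 1, |b| = sqrt(2)  ->  similarity = 1/sqrt(2) ≈ 0.7071
import numpy as np
from autorag.evaluate.metric.util import calculate_cosine_similarity

a = np.array([1.0, 0.0])
b = np.array([1.0, 1.0])
print(round(calculate_cosine_similarity(a, b), 4))  # 0.7071
```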
31 changes: 31 additions & 0 deletions autorag/evaluate/util.py
@@ -0,0 +1,31 @@
from typing import Union, List, Dict, Tuple, Any

from autorag import embedding_models


def cast_metrics(metrics: Union[List[str], List[Dict]]) -> Tuple[List[str], List[Dict[str, Any]]]:
"""
Turn metrics into a list of metric names and a list of parameter dictionaries.
:param metrics: A list of strings or dictionaries.
:return: A list of metric names and a list of metric parameter dictionaries.
"""
if not isinstance(metrics, list):
raise ValueError("metrics must be a list of string or dictionary.")
if isinstance(metrics[0], str):
return metrics, [{} for _ in metrics]
elif isinstance(metrics[0], dict):
# pop 'metric_name' key from dictionary
metric_names = list(map(lambda x: x.pop('metric_name'), metrics))
metric_params = [dict(map(lambda x, y: cast_embedding_model(x, y), metric.keys(), metric.values())) for metric
in metrics]
return metric_names, metric_params
else:
raise ValueError("metrics must be a list of string or dictionary.")


def cast_embedding_model(key, value):
if key == 'embedding_model':
return key, embedding_models[value]
else:
return key, value
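A short sketch of the transformation `cast_metrics` performs, mirroring the tests at the end of this diff; note that `pop('metric_name')` mutates the dictionaries passed in, so a metrics list should not be reused after casting:

```python
# Illustrative call; 'openai' resolves to the registered OpenAIEmbedding
# instance, so OpenAI credentials are assumed for that entry.
from autorag.evaluate.util import cast_metrics

names, params = cast_metrics([{'metric_name': 'bleu'},
                              {'metric_name': 'sem_score', 'embedding_model': 'openai'}])
# names  -> ['bleu', 'sem_score']
# params -> [{}, {'embedding_model': <OpenAIEmbedding instance>}]
```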
6 changes: 3 additions & 3 deletions autorag/nodes/promptmaker/run.py
@@ -1,7 +1,7 @@
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Optional
from typing import List, Callable, Dict, Optional, Union

import pandas as pd

@@ -144,7 +144,7 @@ def evaluate_one_prompt_maker_node(generator_funcs: List[Callable],
generator_params: List[Dict],
prompts: List[str],
generation_gt: List[List[str]],
metrics: List[str],
metrics: Union[List[str], List[Dict]],
project_dir) -> pd.DataFrame:
input_df = pd.DataFrame({'prompts': prompts})
generator_results = list(map(lambda x: x[0](project_dir=project_dir, previous_result=input_df, **x[1]),
@@ -158,7 +158,7 @@ def evaluate_one_prompt_maker_node(generator_funcs: List[Callable],

def evaluate_generator_result(result_df: pd.DataFrame,
generation_gt: List[List[str]],
metrics: List[str]) -> pd.DataFrame:
metrics: Union[List[str], List[Dict]]) -> pd.DataFrame:
@evaluate_generation(generation_gt=generation_gt, metrics=metrics)
def evaluate(df):
return df['generated_texts'].tolist()
8 changes: 8 additions & 0 deletions docs/source/api_spec/autorag.evaluate.metric.rst
@@ -28,6 +28,14 @@ autorag.evaluate.metric.retrieval\_contents module
:undoc-members:
:show-inheritance:

autorag.evaluate.metric.util module
-----------------------------------

.. automodule:: autorag.evaluate.metric.util
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

8 changes: 8 additions & 0 deletions docs/source/api_spec/autorag.evaluate.rst
@@ -36,6 +36,14 @@ autorag.evaluate.retrieval\_contents module
:undoc-members:
:show-inheritance:

autorag.evaluate.util module
----------------------------

.. automodule:: autorag.evaluate.util
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

22 changes: 11 additions & 11 deletions docs/source/local_model.md
@@ -99,15 +99,16 @@ Modules that using embedding model can take `embedding_model` parameter to speci
By default, we support OpenAI embedding models and some local models.
To change the embedding model, set the `embedding_model` parameter to one of the following values:

| Embedding Model Type | embedding_model parameter |
|:-----------------------------------------------------------------------------:|:-------------------------------------:|
| Default openai embedding | openai |
| openai babbage embedding | openai_babbage |
| openai ada embedding | openai_ada |
| openai davinci embedding | openai_davinci |
| openai curie embedding | openai_curie |
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | huggingface_baai_bge_small |
| [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2) | huggingface_cointegrated_rubert_tiny2 |
| Embedding Model Type | embedding_model parameter |
|:---------------------------------------------------------------------------------------------------------:|:-------------------------------------:|
| Default openai embedding | openai |
| openai babbage embedding | openai_babbage |
| openai ada embedding | openai_ada |
| openai davinci embedding | openai_davinci |
| openai curie embedding | openai_curie |
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | huggingface_baai_bge_small |
| [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2) | huggingface_cointegrated_rubert_tiny2 |
| [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | huggingface_all_mpnet_base_v2 |

For example, if you want to use the OpenAI curie embedding model, you can set the `embedding_model` parameter to `openai_curie`.

@@ -130,7 +131,7 @@ Because the embedding model is initialized at the beginning of the AutoRAG progr

You can add more embedding models to AutoRAG.
You can add one by simply adding a new key and value to `autorag.embedding_models`.
For example,
if you want to add the [KoSimCSE](https://huggingface.co/BM-K/KoSimCSE-roberta-multitask) model for Korean embeddings,
execute the following code.

@@ -143,7 +144,6 @@ autorag.generator_models['kosimcse'] = HuggingFaceEmbedding("BM-K/KoSimCSE-rober

Then you can use `kosimcse` in your config YAML file.


```{caution}
When you add a new embedding model, you should add an instance of the `BaseEmbedding` class from LlamaIndex.
```
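A self-contained sketch of the registration step above; the import path is an assumption that may vary across llama_index versions, and the surrounding prose points at `autorag.embedding_models`, so that registry is used here:

```python
# Hedged sketch: register a custom HuggingFace embedding model under a key of
# your choice, then reference that key (here 'kosimcse') in a config YAML file.
import autorag
from llama_index.embeddings import HuggingFaceEmbedding  # path may differ by version

autorag.embedding_models['kosimcse'] = HuggingFaceEmbedding(
    model_name="BM-K/KoSimCSE-roberta-multitask")
```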
18 changes: 16 additions & 2 deletions docs/source/nodes/generator/generator.md
@@ -13,11 +13,21 @@ This document serves as a guide for configuring parameters, strategies, and the

### **Strategy Parameters**
1. **Metrics**:
- **Types**: `bleu`, `meteor`, `rouge`
- **Types**: `bleu`, `meteor`, `rouge`, `sem_score`
```{admonition} Purpose
These metrics are used to evaluate the performance of language models by comparing model-generated text to ground truth texts.
We are planning to add more metrics to evaluate generation performance.
```

```{admonition} sem_score
Sem_score is a metric that evaluates the semantic similarity between the ground truth and the LLM generation.
It is quite simple, but effective for evaluating LLM systems.
Since it uses an embedding model, you can specify the embedding model name in the config YAML file.
Since AutoRAG v0.0.6, the strategy section accepts a list of dictionaries for metrics.
You can check out this feature in the example config YAML file below.
```


2. **Speed Threshold**:
- **Description**: This optional parameter can be applied to all nodes to ensure that the processing time for a method does not exceed a predefined threshold.
@@ -28,7 +38,11 @@ This document serves as a guide for configuring parameters, strategies, and the
nodes:
- node_type: generator
strategy:
metrics: [bleu, meteor, rouge]
metrics:
- metric_name: bleu
- metric_name: meteor
- metric_name: sem_score
embedding_model: openai
speed_threshold: 10
modules:
- module_type: llama_index_llm
2 changes: 1 addition & 1 deletion docs/source/nodes/prompt_maker/prompt_maker.md
@@ -30,7 +30,7 @@ node_lines:
nodes:
- node_type: prompt_maker
strategy:
metrics: [bleu, meteor, rouge]
metrics: [bleu, meteor, rouge, sem_score]
speed_threshold: 10
generator_modules:
- module_type: llama_index_llm
7 changes: 6 additions & 1 deletion sample_config/full.yaml
@@ -69,7 +69,12 @@
"Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
- node_type: generator
strategy:
metrics: [bleu, meteor, rouge]
metrics:
- metric_name: bleu
- metric_name: meteor
- metric_name: rouge
- metric_name: sem_score
embedding_model: openai
speed_threshold: 10
modules:
- module_type: llama_index_llm
15 changes: 12 additions & 3 deletions tests/autorag/evaluate/metric/test_generation_metric.py
@@ -1,6 +1,7 @@
import pytest
from llama_index import OpenAIEmbedding

from autorag.evaluate.metric.generation import bleu, meteor, rouge
from autorag.evaluate.metric.generation import bleu, meteor, rouge, sem_score

generation_gts = [
['The dog had bit the man.', 'The man had bitten the dog.'],
@@ -16,8 +17,8 @@
]


def base_test_generation_metrics(func, solution):
scores = func(generation_gt=generation_gts, generations=generations)
def base_test_generation_metrics(func, solution, **kwargs):
scores = func(generation_gt=generation_gts, generations=generations, **kwargs)
assert len(scores) == len(generation_gts)
assert all(isinstance(score, float) for score in scores)
assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
@@ -34,3 +35,11 @@ def test_meteor():

def test_rouge():
base_test_generation_metrics(rouge, [0.909, 0.35714, 1.0])


def test_sem_score():
base_test_generation_metrics(sem_score, [0.8798, 0.7952, 1.0])


def test_sem_score_other_model():
base_test_generation_metrics(sem_score, [0.9888, 0.9394, 1.0], embedding_model=OpenAIEmbedding())
26 changes: 26 additions & 0 deletions tests/autorag/evaluate/test_evaluate_util.py
@@ -0,0 +1,26 @@
from llama_index import OpenAIEmbedding

from autorag.evaluate.util import cast_metrics


def test_cast_metrics():
metric1 = ['bleu', 'meteor', 'rouge']
metric_names, metric_params = cast_metrics(metric1)
assert metric_names == ['bleu', 'meteor', 'rouge']
assert metric_params == [{}, {}, {}]

metric2 = [{'metric_name': 'bleu'}, {'metric_name': 'meteor'}, {'metric_name': 'rouge'}]
metric_names, metric_params = cast_metrics(metric2)
assert metric_names == ['bleu', 'meteor', 'rouge']
assert metric_params == [{}, {}, {}]

metric3 = [{'metric_name': 'bleu'}, {'metric_name': 'sem_score', 'embedding_model': 'openai'}]
metric_names, metric_params = cast_metrics(metric3)
assert metric_names == ['bleu', 'sem_score']
assert metric_params == [{}, {'embedding_model': OpenAIEmbedding()}]

metric4 = [{'metric_name': 'bleu', 'extra_param': 'extra'},
{'metric_name': 'sem_score', 'embedding_model': 'openai', 'extra_param': 'extra'}]
metric_names, metric_params = cast_metrics(metric4)
assert metric_names == ['bleu', 'sem_score']
assert metric_params == [{'extra_param': 'extra'}, {'embedding_model': OpenAIEmbedding(), 'extra_param': 'extra'}]