Add sem_score metric for generation evaluation (#155)
* add all-mpnet-base-v2 embedding model as default

* add all-mpnet-base-v2 to local model docs

* add sem score metric and test code for it

* add cast_metrics for processing metrics list or dict from yaml file.

* add cast_metrics at evaluate for new List[Dict] input type

* edit metrics type and add new metric sem_score to full.yaml

* add documentation about sem_score

* add api specification for new files

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
vkehfdl1 and jeffrey authored Feb 10, 2024
1 parent 671f3cb commit 882624e
Showing 15 changed files with 194 additions and 37 deletions.
1 change: 1 addition & 0 deletions autorag/__init__.py
@@ -24,6 +24,7 @@
# you can use your own model in this way.
'huggingface_baai_bge_small': HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
'huggingface_cointegrated_rubert_tiny2': HuggingFaceEmbedding(model_name="cointegrated/rubert-tiny2"),
'huggingface_all_mpnet_base_v2': HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
}

generator_models = {
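Below is a minimal usage sketch of the newly registered key (illustrative only, not part of this commit; it assumes `autorag` is installed and the `sentence-transformers/all-mpnet-base-v2` weights can be downloaded):

```python
# Hypothetical usage sketch: fetch the newly registered embedding model from
# the registry and embed a sentence with it.
from autorag import embedding_models

embed_model = embedding_models['huggingface_all_mpnet_base_v2']
vector = embed_model.get_text_embedding("AutoRAG evaluates RAG pipelines.")
print(len(vector))  # all-mpnet-base-v2 produces 768-dimensional vectors
```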
23 changes: 13 additions & 10 deletions autorag/evaluate/generation.py
@@ -1,16 +1,17 @@
import functools
import warnings
from typing import List, Callable
from typing import List, Callable, Union, Dict

import pandas as pd

from autorag.evaluate.metric.generation import bleu, meteor, rouge
from autorag.evaluate.metric.generation import bleu, meteor, rouge, sem_score
from autorag.evaluate.util import cast_metrics

GENERATION_METRIC_FUNC_DICT = {func.__name__: func for func in
[bleu, meteor, rouge]}
[bleu, meteor, rouge, sem_score]}


def evaluate_generation(generation_gt: List[List[str]], metrics: List[str]):
def evaluate_generation(generation_gt: List[List[str]], metrics: Union[List[str], List[Dict]]):
def decorator_evaluate_generation(func: Callable):
@functools.wraps(func)
def wrapper(*args, **kwargs) -> pd.DataFrame:
@@ -27,13 +27,15 @@ def wrapper(*args, **kwargs) -> pd.DataFrame:
raise ValueError("Input func must return string list as generated answer at the first return value.")

metric_scores = {}
for metric in metrics:
if metric not in GENERATION_METRIC_FUNC_DICT:
warnings.warn(f"metric {metric} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()}"
f"{metric} will be ignored.")
metric_names, metric_params = cast_metrics(metrics)

for metric_name, metric_param in zip(metric_names, metric_params):
if metric_name not in GENERATION_METRIC_FUNC_DICT:
warnings.warn(f"metric {metric_name} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()}"
f"{metric_name} will be ignored.")
else:
metric_scores[metric] = GENERATION_METRIC_FUNC_DICT[metric](
generation_gt=generation_gt, generations=generated_str)
metric_scores[metric_name] = GENERATION_METRIC_FUNC_DICT[metric_name](
generation_gt=generation_gt, generations=generated_str, **metric_param)

metric_result_df = pd.DataFrame(metric_scores)
execution_result_df = pd.DataFrame({
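For context, a hedged sketch of how the decorated evaluation can be driven with the new `List[Dict]` metric format, mirroring the prompt_maker usage later in this diff; the dummy data is illustrative, and the `openai` entry assumes OpenAI credentials are configured:

```python
# Illustrative only: wrap a function that returns generated strings and let the
# decorator compute bleu and sem_score. The sem_score entry resolves 'openai'
# to the registered OpenAIEmbedding instance via cast_metrics.
import pandas as pd
from autorag.evaluate.generation import evaluate_generation

generation_gt = [['The dog bit the man.'], ['It was not unexpected.']]

@evaluate_generation(generation_gt=generation_gt,
                     metrics=[{'metric_name': 'bleu'},
                              {'metric_name': 'sem_score', 'embedding_model': 'openai'}])
def run_generator(df: pd.DataFrame):
    # The wrapped function must return the generated strings as its first value.
    return df['generated_texts'].tolist()

result_df = run_generator(pd.DataFrame({
    'generated_texts': ['The dog bit the man.', 'It was quite expected.']}))
# result_df is expected to hold one row per generation with a column per metric.
```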
36 changes: 33 additions & 3 deletions autorag/evaluate/metric/generation.py
@@ -1,26 +1,31 @@
import functools
from typing import List
from typing import List, Optional

import evaluate
import pandas as pd
import sacrebleu
from llama_index.core.embeddings.base import BaseEmbedding

from autorag import embedding_models
from autorag.evaluate.metric.util import calculate_cosine_similarity


def generation_metric(func):
@functools.wraps(func)
def wrapper(generation_gt: List[List[str]], generations: List[str]) -> List[float]:
def wrapper(generation_gt: List[List[str]], generations: List[str], **kwargs) -> List[float]:
"""
Compute generation metric.
:param generation_gt: A list of ground truths.
Must be a 2-d list of strings,
because each generation can have multiple ground truths.
:param generations: A list of generations produced by the LLM.
:param kwargs: Additional keyword arguments passed to the metric function.
:return: A list of computed metric scores.
"""
# make generation_gt and generations to pd dataframe
df = pd.DataFrame({'gt': generation_gt, 'pred': generations})
df[func.__name__] = df.swifter.apply(lambda x: func(x['gt'], x['pred']), axis=1)
df[func.__name__] = df.swifter.apply(lambda x: func(x['gt'], x['pred'], **kwargs), axis=1)
return df[func.__name__].tolist()

return wrapper
@@ -82,3 +87,28 @@ def rouge(generation_gt: List[List[str]], generations: List[str]) -> List[float]
"""
rouge_instance = evaluate.load("rouge")
return huggingface_evaluate(rouge_instance, 'rougeL', generation_gt, generations)


@generation_metric
def sem_score(generation_gt: List[str], pred: str, embedding_model: Optional[BaseEmbedding] = None) -> float:
"""
Compute sem score between generation gt and pred with cosine similarity.
:param generation_gt: A list of ground truths.
Must be a list of strings.
The score is the maximum cosine similarity between generation_gt and pred.
:param pred: Model prediction.
:param embedding_model: Embedding model used to compute cosine similarity.
Defaults to the all-mpnet-base-v2 embedding model,
which is the embedding model used in the SemScore paper.
:return: Sem score between generation_gt and pred.
"""
if embedding_model is None:
embedding_model = embedding_models['huggingface_all_mpnet_base_v2']

gt_embeddings = embedding_model.get_text_embedding_batch(generation_gt)
pred_embedding = embedding_model.get_text_embedding(pred)

# calculate cosine similarity
similarity_scores: List[float] = list(map(lambda x: calculate_cosine_similarity(x, pred_embedding), gt_embeddings))
return max(similarity_scores)
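
A hedged, standalone example of calling the new metric directly; the ground-truth pair is borrowed from the tests at the end of this diff, and the prediction is illustrative:

```python
# sem_score is wrapped by @generation_metric, so it takes the 2-d ground-truth
# list plus the flat generation list; with no embedding_model it falls back to
# all-mpnet-base-v2 (weights are downloaded on first use).
from autorag.evaluate.metric.generation import sem_score

generation_gt = [['The dog had bit the man.', 'The man had bitten the dog.']]
generations = ['The dog bit the man.']

scores = sem_score(generation_gt=generation_gt, generations=generations)
print(scores)  # one max-cosine-similarity score per generation; identical text would score 1.0
```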
9 changes: 9 additions & 0 deletions autorag/evaluate/metric/util.py
@@ -0,0 +1,9 @@
import numpy as np


def calculate_cosine_similarity(a, b):
dot_product = np.dot(a, b)
norm_a = np.linalg.norm(a)
norm_b = np.linalg.norm(b)
similarity = dot_product / (norm_a * norm_b)
return similarity
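
A quick, illustrative sanity check of this helper with hand-checkable vectors:

```python
# dot = 1, |a| = 1, |b| = sqrt(2)  ->  similarity = 1/sqrt(2) ≈ 0.7071
import numpy as np
from autorag.evaluate.metric.util import calculate_cosine_similarity

a = np.array([1.0, 0.0])
b = np.array([1.0, 1.0])
print(round(calculate_cosine_similarity(a, b), 4))  # 0.7071
```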
31 changes: 31 additions & 0 deletions autorag/evaluate/util.py
@@ -0,0 +1,31 @@
from typing import Union, List, Dict, Tuple, Any

from autorag import embedding_models


def cast_metrics(metrics: Union[List[str], List[Dict]]) -> Tuple[List[str], List[Dict[str, Any]]]:
"""
Turn metrics into a list of metric names and a list of parameter dictionaries.
:param metrics: A list of strings or dictionaries.
:return: A list of metric names and a list of metric parameter dictionaries.
"""
if not isinstance(metrics, list):
raise ValueError("metrics must be a list of string or dictionary.")
if isinstance(metrics[0], str):
return metrics, [{} for _ in metrics]
elif isinstance(metrics[0], dict):
# pop 'metric_name' key from dictionary
metric_names = list(map(lambda x: x.pop('metric_name'), metrics))
metric_params = [dict(map(lambda x, y: cast_embedding_model(x, y), metric.keys(), metric.values())) for metric
in metrics]
return metric_names, metric_params
else:
raise ValueError("metrics must be a list of string or dictionary.")


def cast_embedding_model(key, value):
if key == 'embedding_model':
return key, embedding_models[value]
else:
return key, value
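A short sketch of the transformation `cast_metrics` performs, mirroring the tests at the end of this diff; note that `pop('metric_name')` mutates the dictionaries passed in, so a metrics list should not be reused after casting:

```python
# Illustrative call; 'openai' resolves to the registered OpenAIEmbedding
# instance, so OpenAI credentials are assumed for that entry.
from autorag.evaluate.util import cast_metrics

names, params = cast_metrics([{'metric_name': 'bleu'},
                              {'metric_name': 'sem_score', 'embedding_model': 'openai'}])
# names  -> ['bleu', 'sem_score']
# params -> [{}, {'embedding_model': <OpenAIEmbedding instance>}]
```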
6 changes: 3 additions & 3 deletions autorag/nodes/promptmaker/run.py
@@ -1,7 +1,7 @@
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Optional
from typing import List, Callable, Dict, Optional, Union

import pandas as pd

@@ -144,7 +144,7 @@ def evaluate_one_prompt_maker_node(generator_funcs: List[Callable],
generator_params: List[Dict],
prompts: List[str],
generation_gt: List[List[str]],
metrics: List[str],
metrics: Union[List[str], List[Dict]],
project_dir) -> pd.DataFrame:
input_df = pd.DataFrame({'prompts': prompts})
generator_results = list(map(lambda x: x[0](project_dir=project_dir, previous_result=input_df, **x[1]),
@@ -158,7 +158,7 @@ def evaluate_one_prompt_maker_node(generator_funcs: List[Callable],

def evaluate_generator_result(result_df: pd.DataFrame,
generation_gt: List[List[str]],
metrics: List[str]) -> pd.DataFrame:
metrics: Union[List[str], List[Dict]]) -> pd.DataFrame:
@evaluate_generation(generation_gt=generation_gt, metrics=metrics)
def evaluate(df):
return df['generated_texts'].tolist()
8 changes: 8 additions & 0 deletions docs/source/api_spec/autorag.evaluate.metric.rst
@@ -28,6 +28,14 @@ autorag.evaluate.metric.retrieval\_contents module
:undoc-members:
:show-inheritance:

autorag.evaluate.metric.util module
-----------------------------------

.. automodule:: autorag.evaluate.metric.util
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

8 changes: 8 additions & 0 deletions docs/source/api_spec/autorag.evaluate.rst
@@ -36,6 +36,14 @@ autorag.evaluate.retrieval\_contents module
:undoc-members:
:show-inheritance:

autorag.evaluate.util module
----------------------------

.. automodule:: autorag.evaluate.util
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

22 changes: 11 additions & 11 deletions docs/source/local_model.md
@@ -99,15 +99,16 @@ Modules that using embedding model can take `embedding_model` parameter to speci
By default, we support OpenAI embedding models and some local models.
To change the embedding model, set the `embedding_model` parameter to one of the following values:

| Embedding Model Type | embedding_model parameter |
|:-----------------------------------------------------------------------------:|:-------------------------------------:|
| Default openai embedding | openai |
| openai babbage embedding | openai_babbage |
| openai ada embedding | openai_ada |
| openai davinci embedding | openai_davinci |
| openai curie embedding | openai_curie |
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | huggingface_baai_bge_small |
| [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2) | huggingface_cointegrated_rubert_tiny2 |
| Embedding Model Type | embedding_model parameter |
|:---------------------------------------------------------------------------------------------------------:|:-------------------------------------:|
| Default openai embedding | openai |
| openai babbage embedding | openai_babbage |
| openai ada embedding | openai_ada |
| openai davinci embedding | openai_davinci |
| openai curie embedding | openai_curie |
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | huggingface_baai_bge_small |
| [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2) | huggingface_cointegrated_rubert_tiny2 |
| [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | huggingface_all_mpnet_base_v2 |

For example, if you want to use the OpenAI curie embedding model, you can set the `embedding_model` parameter to `openai_curie`.

@@ -130,7 +131,7 @@ Because the embedding model is initialized at the beginning of the AutoRAG progr

You can add more embedding models to AutoRAG.
You can add one by simply adding a new key and value to `autorag.embedding_models`.
For example,
if you want to add the [KoSimCSE](https://huggingface.co/BM-K/KoSimCSE-roberta-multitask) model for Korean embeddings,
execute the following code.

@@ -143,7 +144,6 @@ autorag.generator_models['kosimcse'] = HuggingFaceEmbedding("BM-K/KoSimCSE-rober

Then you can use `kosimcse` in your config YAML file.


```{caution}
When you add a new embedding model, you should add an instance of the `BaseEmbedding` class from LlamaIndex.
```
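A self-contained sketch of the registration step above; the import path is an assumption that may vary across llama_index versions, and the surrounding prose points at `autorag.embedding_models`, so that registry is used here:

```python
# Hedged sketch: register a custom HuggingFace embedding model under a key of
# your choice, then reference that key (here 'kosimcse') in a config YAML file.
import autorag
from llama_index.embeddings import HuggingFaceEmbedding  # path may differ by version

autorag.embedding_models['kosimcse'] = HuggingFaceEmbedding(
    model_name="BM-K/KoSimCSE-roberta-multitask")
```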
18 changes: 16 additions & 2 deletions docs/source/nodes/generator/generator.md
@@ -13,11 +13,21 @@ This document serves as a guide for configuring parameters, strategies, and the

### **Strategy Parameters**
1. **Metrics**:
- **Types**: `bleu`, `meteor`, `rouge`
- **Types**: `bleu`, `meteor`, `rouge`, `sem_score`
```{admonition} Purpose
These metrics are used to evaluate the performance of language models by comparing model-generated text to ground truth texts.
We are planning to add more metrics to evaluate generation performance.
```

```{admonition} sem_score
Sem_score is a metric that evaluates the semantic similarity between the ground truth and the LLM generation.
It is quite simple, but effective for evaluating LLM systems.
Since it uses an embedding model, you can specify the embedding model name in the config YAML file.
Since AutoRAG v0.0.6, the strategy section accepts a list of dictionaries for metrics.
You can check out this feature in the example config YAML file below.
```


2. **Speed Threshold**:
- **Description**: This optional parameter can be applied to all nodes to ensure that the processing time for a method does not exceed a predefined threshold.
@@ -28,7 +38,11 @@ This document serves as a guide for configuring parameters, strategies, and the
nodes:
- node_type: generator
strategy:
metrics: [bleu, meteor, rouge]
metrics:
- metric_name: bleu
- metric_name: meteor
- metric_name: sem_score
embedding_model: openai
speed_threshold: 10
modules:
- module_type: llama_index_llm
2 changes: 1 addition & 1 deletion docs/source/nodes/prompt_maker/prompt_maker.md
@@ -30,7 +30,7 @@ node_lines:
nodes:
- node_type: prompt_maker
strategy:
metrics: [bleu, meteor, rouge]
metrics: [bleu, meteor, rouge, sem_score]
speed_threshold: 10
generator_modules:
- module_type: llama_index_llm
7 changes: 6 additions & 1 deletion sample_config/full.yaml
@@ -69,7 +69,12 @@
"Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
- node_type: generator
strategy:
metrics: [bleu, meteor, rouge]
metrics:
- metric_name: bleu
- metric_name: meteor
- metric_name: rouge
- metric_name: sem_score
embedding_model: openai
speed_threshold: 10
modules:
- module_type: llama_index_llm
15 changes: 12 additions & 3 deletions tests/autorag/evaluate/metric/test_generation_metric.py
@@ -1,6 +1,7 @@
import pytest
from llama_index import OpenAIEmbedding

from autorag.evaluate.metric.generation import bleu, meteor, rouge
from autorag.evaluate.metric.generation import bleu, meteor, rouge, sem_score

generation_gts = [
['The dog had bit the man.', 'The man had bitten the dog.'],
@@ -16,8 +17,8 @@
]


def base_test_generation_metrics(func, solution):
scores = func(generation_gt=generation_gts, generations=generations)
def base_test_generation_metrics(func, solution, **kwargs):
scores = func(generation_gt=generation_gts, generations=generations, **kwargs)
assert len(scores) == len(generation_gts)
assert all(isinstance(score, float) for score in scores)
assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
@@ -34,3 +35,11 @@ def test_meteor():

def test_rouge():
base_test_generation_metrics(rouge, [0.909, 0.35714, 1.0])


def test_sem_score():
base_test_generation_metrics(sem_score, [0.8798, 0.7952, 1.0])


def test_sem_score_other_model():
base_test_generation_metrics(sem_score, [0.9888, 0.9394, 1.0], embedding_model=OpenAIEmbedding())
26 changes: 26 additions & 0 deletions tests/autorag/evaluate/test_evaluate_util.py
@@ -0,0 +1,26 @@
from llama_index import OpenAIEmbedding

from autorag.evaluate.util import cast_metrics


def test_cast_metrics():
metric1 = ['bleu', 'meteor', 'rouge']
metric_names, metric_params = cast_metrics(metric1)
assert metric_names == ['bleu', 'meteor', 'rouge']
assert metric_params == [{}, {}, {}]

metric2 = [{'metric_name': 'bleu'}, {'metric_name': 'meteor'}, {'metric_name': 'rouge'}]
metric_names, metric_params = cast_metrics(metric2)
assert metric_names == ['bleu', 'meteor', 'rouge']
assert metric_params == [{}, {}, {}]

metric3 = [{'metric_name': 'bleu'}, {'metric_name': 'sem_score', 'embedding_model': 'openai'}]
metric_names, metric_params = cast_metrics(metric3)
assert metric_names == ['bleu', 'sem_score']
assert metric_params == [{}, {'embedding_model': OpenAIEmbedding()}]

metric4 = [{'metric_name': 'bleu', 'extra_param': 'extra'},
{'metric_name': 'sem_score', 'embedding_model': 'openai', 'extra_param': 'extra'}]
metric_names, metric_params = cast_metrics(metric4)
assert metric_names == ['bleu', 'sem_score']
assert metric_params == [{'extra_param': 'extra'}, {'embedding_model': OpenAIEmbedding(), 'extra_param': 'extra'}]