Commit: code for LLMasEvaluator

Xiaoyu-SZ committed May 28, 2024
1 parent 8fd5877 commit c1dac89
Showing 14 changed files with 3,994 additions and 1 deletion.
99 changes: 98 additions & 1 deletion README.md
@@ -1 +1,98 @@
# LLMasEvaluator

The official code for *Large Language Model as Evaluator for Explainable Recommendation*.

## Installation

Please check that `torch.cuda.is_available()` returns `True` on your local machine.
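
A quick way to verify (assuming PyTorch is already installed):

```python
import torch

# LLMasEvaluator runs LLM inference on GPU, so this should print True.
print(torch.cuda.is_available())
```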

Besides, to use LLMasEvaluator with vLLM (as in the commands below), you need to manually install vLLM following the vLLM documentation.

If your CUDA version is 12.1:
```bash
pip install vllm
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
```
If your CUDA version is 11.8:
```bash
# Replace `cp39` with your Python version (e.g., `cp38`, `cp39`, `cp311`).
pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp39-cp39-manylinux1_x86_64.whl
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
```

Then, install the dependencies by running the following command:

```bash
pip install -r requirements.txt
```

## Usage

### Generate Annotations via APIs

To generate annotations via APIs, first set your API key in `llm.py`.
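
For example, if `llm.py` uses an OpenAI-style client, the key might be set like this (a hypothetical sketch; the actual variable name in `llm.py` may differ):

```python
import os
import openai

# Hypothetical sketch: read the key from the environment instead of hard-coding it.
openai.api_key = os.environ["OPENAI_API_KEY"]
```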

Then set your parameters in `annot.py` or `annot_single.py`: `annot.py` generates annotations for all aspects at once, while `annot_single.py` generates annotations for one aspect at a time.

For example:
```python
MODEL_NAME = 'gpt-3.5-turbo'
CONTAIN_USER_PROFILE = False
CONTAIN_SHOT = 'None' # All or Type or None
PERSONALIZED = '_personalized' if CONTAIN_USER_PROFILE else ''
TEMPEARTURE = 0
```

Then run `annot.py` or `annot_single.py`.

### Generate Annotations via Local LLM

Please make sure vllm and xformers are installed on your local machine.

Then set your parameters in `annot_vllm.py` or `annot_vllm_single.py`: `annot_vllm.py` generates annotations for all aspects at once, while `annot_vllm_single.py` generates annotations for one aspect at a time.

Then run `annot_vllm.py` or `annot_vllm_single.py`.

### Calculate the metrics

Run `corr.py`; it calculates the correlations between the annotations and the ground truth for all files in `./data/`.

The output contains the Pearson, Spearman, and Kendall correlations, each computed at the dataset, user, and item level.
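
As a rough illustration of the dataset-level computation (a minimal sketch, not the repository's `corr.py`; the input file name is hypothetical):

```python
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau

# Minimal sketch of dataset-level correlations for one aspect.
df = pd.read_csv("data/example_results.csv", sep="\t")  # hypothetical file
sub = df[df["metric"] == "persuasiveness"].dropna(subset=["user_value", "llm_value"])
for name, fn in [("Pearson", pearsonr), ("Spearman", spearmanr), ("Kendall", kendalltau)]:
    stat, _ = fn(sub["user_value"], sub["llm_value"])
    print(f"{name}: {stat:.3f}")
```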

## Output Format

The output is in DataFrame format. The columns are:
`user`,`movie_id`,`movie_title`,`explanation_text`,`explanation_type`,`metric`,`user_value`,`llm_value`

The `llm_value` column is the value predicted by the LLM; the other columns come from the dataset.
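
The results file written by `annot.py` is tab-separated despite its `.csv` extension, so it can be loaded like this (the file name follows the `FILE_NAME` pattern in `annot.py`):

```python
import pandas as pd

# Tab-separated despite the .csv extension.
df = pd.read_csv("output/gpt-4_T1_None_correlation_results.csv", sep="\t")
print(df[["metric", "user_value", "llm_value"]].head())
```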

## Dataset Information

The dataset is from the paper "User Perception of Recommendation Explanation: Are Your Explanations What Users Need?".

If you use this dataset, please cite the paper:

```bibtex
@article{UserPerceptionTois2023,
author = {Hongyu Lu and
Weizhi Ma and
Yifan Wang and
Min Zhang and
Xiang Wang and
Yiqun Liu and
Tat{-}Seng Chua and
Shaoping Ma},
title = {User Perception of Recommendation Explanation: Are Your Explanations
What Users Need?},
journal = {{ACM} Trans. Inf. Syst.},
volume = {41},
number = {2},
pages = {48:1--48:31},
year = {2023},
url = {https://doi.org/10.1145/3565480},
doi = {10.1145/3565480},
timestamp = {Sat, 27 May 2023 15:23:45 +0200},
biburl = {https://dblp.org/rec/journals/tois/LuMWZWLCM23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
174 changes: 174 additions & 0 deletions annot.py
@@ -0,0 +1,174 @@
import pandas as pd
from prompt import *
from llm import *
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from bs4 import BeautifulSoup

MODEL_NAME = 'gpt-4'
CONTAIN_USER_PROFILE = False
CONTAIN_SHOT = 'None' # All or Type or None
PERSONALIZED = '_personalized' if CONTAIN_USER_PROFILE else ''
TEMPEARTURE = 1
FILE_NAME = f'output/{MODEL_NAME}_T{str(TEMPEARTURE)}_{CONTAIN_SHOT}{PERSONALIZED}_correlation_results.csv'
LOG_FILE_NAME = f'output/{MODEL_NAME}_T{str(TEMPEARTURE)}_{CONTAIN_SHOT}{PERSONALIZED}_user_{MODEL_NAME}_log_1.txt'


def form_few_shot_case(data):
    # Format one rated interaction as a few-shot example for the prompt.
    case = f'''
Movie:{data['movie_title']}
Explanation:{data['explanation']}
-Persuasiveness : {int(data['persuasiveness'])}
-Transparency : {int(data['transparency'])}
-Accuracy : {int(data['interest_accuracy'])}
-Satisfactory : {int(data['satisfaction'])}'''
    return case


def extract_plain_text(html_text):
    # Strip HTML tags from an explanation, keeping only the visible text.
    soup = BeautifulSoup(html_text, 'html.parser')
    return soup.get_text()

def read_explanation_data():
    df = pd.read_pickle("data/df_explanation.pkl")

    # Select the columns needed for evaluation.
    selected_columns = ['user_id', 'movie_id', 'movie_title', 'explanation_type', 'explanation',
                        'persuasiveness', 'transparency', 'satisfaction', 'interest_accuracy']

    result_df = df[selected_columns].copy()
    # Missing ratings default to the scale midpoint (3).
    for metric in ['persuasiveness', 'transparency', 'interest_accuracy', 'satisfaction']:
        result_df[metric] = result_df[metric].fillna(3)

    # Print the first row as a sanity check.
    pd.set_option('display.max_columns', None)
    print(result_df.head(1))
    result_df.to_csv('data/df_explanation_selected.csv', index=False)
    return result_df

def read_user_data():
    user_dict = {}
    user_df = pd.read_csv('data/user_demo.csv')
    for index, row in user_df.iterrows():
        user_id = str(row["user_id"])
        user_profile = generate_user_profile(row)
        user_dict[user_id] = user_profile
    return user_dict

def read_data():
    # Group interactions by user; explanation HTML is converted to plain text.
    result_dict = {}
    explanation_df = read_explanation_data()
    for index, row in explanation_df.iterrows():
        user_id = row['user_id']
        explanation_type = row['explanation_type']
        row['explanation'] = extract_plain_text(row['explanation'])

        if user_id not in result_dict:
            result_dict[user_id] = []

        result_dict[user_id].append(row.to_dict())
    return result_dict

def evaluate_llm(result_dict, user_dict):
    llm_data = {
        'user': [],
        'movie_id': [],
        'movie_title': [],
        'explanation_text': [],
        'explanation_type': [],
        'metric': [],
        'user_value': [],
        'llm_value': [],
        'corr': [],
    }
    result_list = list(result_dict.items())

    logger = open(LOG_FILE_NAME, 'w')

    with open(FILE_NAME, 'w') as f:

        f.write("user\tmovie_id\tmovie_title\texplanation_text\texplanation_type\tmetric\tuser_value\tllm_value\tcorr\n")
        for result in tqdm(result_list):
            user = result[0]
            first_movie = result[1][0]['movie_title']
            # NOTE: only the first interaction of each user is kept here.
            interactions = result[1][:1]
            user_values = {
                'persuasiveness': [None] * len(interactions),
                'transparency': [None] * len(interactions),
                'accuracy': [None] * len(interactions),
                'satisfactory': [None] * len(interactions)
            }
            llm_values = {
                'persuasiveness': [None] * len(interactions),
                'transparency': [None] * len(interactions),
                'accuracy': [None] * len(interactions),
                'satisfactory': [None] * len(interactions),
            }
            case = {}

            def process_data(data, index):
                # The first movie of each user serves as the few-shot example
                # when few-shot prompting is enabled.
                if (data['movie_title'] == first_movie) and (CONTAIN_SHOT != 'None'):
                    case[data['explanation_type']] = form_few_shot_case(data)
                else:
                    user_values['persuasiveness'][index] = float(data['persuasiveness'])
                    user_values['transparency'][index] = float(data['transparency'])
                    user_values['accuracy'][index] = float(data['interest_accuracy'])
                    user_values['satisfactory'][index] = float(data['satisfaction'])
                    user_profile = {'contain': CONTAIN_USER_PROFILE, 'prompt': user_dict[user]}

                    if CONTAIN_SHOT == 'Type':
                        present_case = case.get(data['explanation_type'], '')
                    elif CONTAIN_SHOT == 'All':
                        # Concatenate all collected few-shot cases.
                        present_case = '\n'.join(str(value) for value in case.values())
                    elif CONTAIN_SHOT == 'None':
                        present_case = ''
                    cases = {'type': CONTAIN_SHOT, 'prompt': present_case}

                    llm_prompt = prompt_evaluate_in_likert_personalize_few_shot(data, user_profile, cases)
                    # logger.write(llm_prompt+'\n\n')
                    llm_answer, llm_result = call_llm_for_evaluate(llm_prompt, MODEL_NAME, mode="multi", temperature=float(TEMPEARTURE))
                    # logger.write(llm_answer+'\n\n')
                    llm_values['persuasiveness'][index] = float(llm_result[0])
                    llm_values['transparency'][index] = float(llm_result[1])
                    llm_values['accuracy'][index] = float(llm_result[2])
                    llm_values['satisfactory'][index] = float(llm_result[3])
                    # print(llm_values)

            Parallel(n_jobs=5, backend="threading")(
                delayed(process_data)(data, index) for index, data in tqdm(enumerate(interactions))
            )

            for index, data in tqdm(enumerate(interactions)):
                for metric in ['persuasiveness', 'transparency', 'accuracy', 'satisfactory']:
                    row = [
                        user,
                        data['movie_id'],
                        data['movie_title'],
                        data['explanation'],
                        data['explanation_type'],
                        metric,
                        user_values[metric][index],
                        llm_values[metric][index],
                        # Running correlation over the interactions processed so far
                        # (undefined for fewer than two points).
                        np.corrcoef(llm_values[metric][:index + 1], user_values[metric][:index + 1])[0, 1]
                    ]
                    f.write('\t'.join(map(str, row)) + '\n')
                logger.flush()
                f.flush()


    # Dataset-level correlation per metric over the full results file.
    df = pd.read_csv(FILE_NAME, sep='\t')
    df['user_value'] = df['user_value'].fillna(3)
    for metric in ['persuasiveness', 'transparency', 'accuracy', 'satisfactory']:
        user_values = df[df['metric'] == metric]['user_value']
        llm_values = df[df['metric'] == metric]['llm_value']
        correlation_coefficient = np.corrcoef(user_values, llm_values)[0, 1]
        print(f"Correlation coefficient for {metric}: {correlation_coefficient}")
        logger.write(f"Correlation coefficient for {metric}: {correlation_coefficient}\n")

    logger.close()


if __name__ == '__main__':
    result_dict = read_data()
    user_dict = read_user_data()
    evaluate_llm(result_dict, user_dict)