add third-party data

Xiaoyu-SZ · May 28, 2024 · 6f2b8d3 · 6f2b8d3
1 parent c1dac89
commit 6f2b8d3
Show file tree

Hide file tree

Showing 3 changed files with 2,566 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -38,7 +38,7 @@ For example:
 ```python
 MODEL_NAME = 'gpt-3.5-turbo'
 CONTAIN_USER_PROFILE = False
-CONTAIN_SHOT = 'None' # All or Type or None
+CONTAIN_SHOT = 'None' # Type or None
 PERSONALIZED = '_personalized' if CONTAIN_USER_PROFILE else ''
 TEMPEARTURE = 0
 ```
@@ -55,7 +55,7 @@ Then run `annot_vllm.py` or `annot_vllm_single.py`
 
 ### Calculate the metrics
 
-Run `corr.py`, it will calculate the correlations between the annotations and the ground truth for all files in `./data/`
+Run `corr.py`; it calculates the correlations between the annotations and the ground truth for the files in `./output/`.
 
 The output contains the Pearson, Spearman and Kendall correlations, each computed at the Dataset-Level, User-Level and Item-Level.
 
@@ -67,10 +67,11 @@ The output is in DataFrame format. The columns are:
 The `llm_value` is the value predicted by LLM, and the others are from the data.
 
 ## Dataset Information
+The data of real user labels and self-explanations is from the paper "User Perception of Recommendation Explanation: Are Your Explanations What Users Need?"; see `./output/df_explanation.pkl`.
 
-The dataset is from the paper "User Perception of Recommendation Explanation: Are Your Explanations What Users Need?".
+We additionally collect third-party annotations for the explanatory texts; see `./output/third_party_annotation.csv`.
 
-If you use this dataset, please cite the paper:
+If you use the data in `./output/df_explanation.pkl`, please cite the paper:
 
 ```bibtex
 @article{UserPerceptionTois2023,

diff --git a/corr.py b/corr.py
@@ -24,56 +24,44 @@ def calculate_correlation(data_df, metric, METRIC):
     print(FILE_NAME)
     df = pd.read_csv(FILE_NAME, sep='\t')
     df['user_value'].fillna(3, inplace=True)
+    df['llm_value'].fillna(0, inplace=True)
     results = {}
     for metric in ['persuasiveness', 'transparency', 'accuracy', 'satisfactory']:
         data_df = df[df['metric'] == metric]
         user_values = data_df['user_value']
         llm_values = data_df['llm_value']
         index = len(user_values)
         print(index)
-        pearsonr_correlation = np.corrcoef(
-            user_values[:index], llm_values[:index])[0, 1]
-        pearson_correlation = pearsonr(
-            user_values[:index], llm_values[:index])[0]
-        spearmanr_correlation = spearmanr(
-            user_values[:index], llm_values[:index])[0]
-        kendalltau_correlation = kendalltau(
-            user_values[:index], llm_values[:index])[0]
+        pearson_correlation = pearsonr(user_values[:index], llm_values[:index])[0]
+        spearmanr_correlation = spearmanr(user_values[:index], llm_values[:index])[0]
+        kendalltau_correlation = kendalltau(user_values[:index], llm_values[:index])[0]
+        # mae = np.mean(np.abs(user_values -all_three_array))
+        # rmse = np.sqrt(np.mean((user_values - all_three_array)**2))
 
         print(f"Metrics for {metric}, Dataset-Level:")
-        print(f"    Scipy Pearson correlation coefficient: {
-              pearson_correlation}")
+        # print(f"    Pearson correlation coefficient: {correlation_coefficient}")
+        print(f"    Scipy Pearson correlation coefficient: {pearson_correlation}")
         print(f"    Spearman correlation coefficient: {spearmanr_correlation}")
         print(f"    Kendall correlation coefficient: {kendalltau_correlation}")
-
-        user_correlation = data_df.groupby(['user']).apply(
-            calculate_correlation, 'user_value', 'llm_value').reset_index()
-
+
+        user_correlation = data_df.groupby(['user']).apply(calculate_correlation,'user_value','llm_value').reset_index()
+
         print(f"Metrics for {metric}, User-Level:")
-        print(f"   META Pearson correlation coefficient: {
-              user_correlation['pearsonr_correlation'].mean()}")
-        print(f"   META Spearman correlation coefficient: {
-              user_correlation['spearmanr_correlation'].mean()}")
-        print(f"   META Kendall correlation coefficient: {
-              user_correlation['kendalltau_correlation'].mean()}")
-
-        pair_correlation = data_df.groupby(['user', 'movie_id']).apply(
-            calculate_correlation, 'user_value', 'llm_value').reset_index()
+        print(f"   META Pearson correlation coefficient: {user_correlation['pearsonr_correlation'].mean()}")
+        print(f"   META Spearman correlation coefficient: {user_correlation['spearmanr_correlation'].mean()}")
+        print(f"   META Kendall correlation coefficient: {user_correlation['kendalltau_correlation'].mean()}")
+
+        pair_correlation = data_df.groupby(['user','movie_id']).apply(calculate_correlation,'user_value','llm_value').reset_index()
 
         print(f"Metrics for {metric}, Sample-Level:")
-        print(f"   META Pearson correlation coefficient: {
-              pair_correlation['pearsonr_correlation'].mean()}")
-        print(f"   META Spearman correlation coefficient: {
-              pair_correlation['spearmanr_correlation'].mean()}")
-        print(f"   META Kendall correlation coefficient: {
-              pair_correlation['kendalltau_correlation'].mean()}")
-
-        results[metric] = [f"& {100*pearson_correlation:.2f} & {100*spearmanr_correlation:.2f} & {100*kendalltau_correlation:.2f}",
-                           f"& {100*user_correlation['pearsonr_correlation'].mean():.2f} & {100*user_correlation['spearmanr_correlation'].mean():.2f} & {
-            100*user_correlation['kendalltau_correlation'].mean():.2f}",
-            f"& {100*pair_correlation['pearsonr_correlation'].mean():.2f} & {100*pair_correlation['spearmanr_correlation'].mean():.2f} & {
-            100*pair_correlation['kendalltau_correlation'].mean():.2f}"
-        ]
+        print(f"   META Pearson correlation coefficient: {pair_correlation['pearsonr_correlation'].mean()}")
+        print(f"   META Spearman correlation coefficient: {pair_correlation['spearmanr_correlation'].mean()}")
+        print(f"   META Kendall correlation coefficient: {pair_correlation['kendalltau_correlation'].mean()}")
+
+        results[metric] = {'dataset':[100*pearson_correlation,100*spearmanr_correlation,100*kendalltau_correlation],
+                            'user':[100*user_correlation['pearsonr_correlation'].mean(),100*user_correlation['spearmanr_correlation'].mean(),100*user_correlation['kendalltau_correlation'].mean()],
+                            'pair':[100*pair_correlation['pearsonr_correlation'].mean(),100*pair_correlation['spearmanr_correlation'].mean(),100*pair_correlation['kendalltau_correlation'].mean()]
+                            }
 
     for i in range(3):
         output = ''