Cohen's kappa

Farmerobot · Nov 28, 2024 · cc3ec86 · cc3ec86
1 parent 811c671
commit cc3ec86
Show file tree

Hide file tree

Showing 3 changed files with 198 additions and 19 deletions.
diff --git a/data/annotation_comparison_by_technique.csv b/data/annotation_comparison_by_technique.csv
@@ -0,0 +1,77 @@
+filename,comparison,technique,kappa
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.06896551724137934
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,Distraction,-0.12499999999999978
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.11764705882352944
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.7272727272727273
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Distraction,0.6341463414634146
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Projection,0.5833333333333333
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.07246376811594202
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.4117647058823529
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,1.0
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Distraction,0.4482758620689655
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Projection,0.6363636363636364
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Gaslighting,-0.06666666666666665
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.3846153846153846
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,Strategic Voting Suggestion,-0.0714285714285714
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,0.9
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,0.45945945945945954
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Distraction,0.7727272727272727
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,0.6153846153846154
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.2222222222222222
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,-0.2790697674418605
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,0.16666666666666663
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Distraction,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Projection,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.36363636363636365
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Relationship,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.41860465116279066
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Credibility,0.8571428571428572
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Projection,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Strategic Voting Suggestion,0.6153846153846154
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Relationship,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Denial without Evidence,0.6428571428571428
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Exaggeration,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Projection,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.3870967741935484
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.8724832214765101
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Vagueness,0.7710843373493976
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Gaslighting,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Projection,0.30487804878048785
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.8275862068965517
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.8571428571428572
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Projection,0.34782608695652173
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Appeal to Emotion,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Vagueness,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Projection,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Emotion,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.19753086419753074
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.17721518987341767
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Distraction,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Strategic Voting Suggestion,0.4347826086956521
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Withholding Information,-0.1304347826086958
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.4444444444444444
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,Distraction,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Withholding Information,1.0
diff --git a/data/annotation_comparison_overall.csv b/data/annotation_comparison_overall.csv
@@ -0,0 +1,16 @@
+filename,comparison,overall_kappa
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,0.022526146419951654
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,0.48538011695906436
+gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,0.857916102841678
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,0.467438494934877
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,0.4189531934516947
+llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,0.8030242737763629
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,0.439826526924467
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,0.5777351247600768
+claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,0.9847978108847674
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,0.5947470474175921
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,0.6987951807228916
+gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,1.0
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,0.27929441411171774
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,0.3396226415094339
+gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,0.950920245398773
diff --git a/src/among_them/analysis/compare_annotations.py b/src/among_them/analysis/compare_annotations.py
@@ -3,6 +3,8 @@
 import csv
 from collections import defaultdict
 from typing import Dict, List, Set, Tuple
+import numpy as np
+from sklearn.metrics import cohen_kappa_score
 
 def load_json_file(file_path: str) -> List[dict]:
     with open(file_path, 'r') as f:
@@ -16,7 +18,76 @@ def get_annotations_by_text(data: List[dict]) -> Dict[str, Set[str]]:
         annotations[text] = set(annotation_list)
     return annotations
 
-def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
+def get_all_unique_annotations(annotations1: Dict[str, Set[str]], annotations2: Dict[str, Set[str]]) -> Set[str]:
+    """Return every annotation label used by either rater.
+
+    Args:
+        annotations1: Label sets of the first rater, keyed by text.
+        annotations2: Label sets of the second rater, keyed by text.
+
+    Returns:
+        A new set holding the union of all label sets found in both mappings
+        (empty when neither mapping contains any label).
+    """
+    # One variadic union over all per-text label sets of both raters.
+    return set().union(*annotations1.values(), *annotations2.values())
+
+def calculate_cohens_kappa(
+    annotations1: Dict[str, Set[str]],
+    annotations2: Dict[str, Set[str]],
+    common_texts: Set[str],
+) -> Dict[str, float]:
+    """Compute Cohen's kappa per annotation label and pooled over all labels.
+
+    Every label is treated as a binary decision (present = 1, absent = 0) made
+    by each rater on every text in ``common_texts``.
+
+    Args:
+        annotations1: Label sets of the first rater, keyed by text.
+        annotations2: Label sets of the second rater, keyed by text.
+        common_texts: Texts to compare; each must be a key of both mappings.
+
+    Returns:
+        A dict mapping label -> kappa. A label is omitted when either rater
+        shows no variation for it (never used or used on every text) or when
+        the score is NaN or could not be computed. The key ``'overall'`` is
+        always present and is inserted last: kappa over all pooled
+        (text, label) decisions, or ``0.0`` when it cannot be computed.
+    """
+    # Sorted so the insertion order of the result (and of any rows written
+    # from it) is reproducible; raw set order of strings varies between runs.
+    labels = sorted(get_all_unique_annotations(annotations1, annotations2))
+    # Freeze one iteration order so both raters' vectors stay aligned.
+    texts = list(common_texts)
+    n_texts = len(texts)
+
+    kappa_scores = {}
+    pooled_rater1 = []
+    pooled_rater2 = []
+
+    for label in labels:
+        rater1_scores = [int(label in annotations1[text]) for text in texts]
+        rater2_scores = [int(label in annotations2[text]) for text in texts]
+
+        # Reuse the per-label vectors for the pooled score. Kappa depends only
+        # on the paired counts, so the pooling order does not matter.
+        pooled_rater1.extend(rater1_scores)
+        pooled_rater2.extend(rater2_scores)
+
+        # Kappa is only meaningful when both raters used at least one
+        # positive and one negative decision for this label.
+        rater1_varies = 0 < sum(rater1_scores) < n_texts
+        rater2_varies = 0 < sum(rater2_scores) < n_texts
+        if not (rater1_varies and rater2_varies):
+            continue
+
+        try:
+            kappa = cohen_kappa_score(rater1_scores, rater2_scores, labels=[0, 1])
+        except Exception:
+            # Best effort: skip a label whose score cannot be computed, but
+            # let KeyboardInterrupt / SystemExit propagate.
+            continue
+        if not np.isnan(kappa):
+            kappa_scores[label] = kappa
+
+    # Default used whenever the pooled score is undefined or fails.
+    kappa_scores['overall'] = 0.0
+    if len(set(pooled_rater1)) > 1 and len(set(pooled_rater2)) > 1:
+        try:
+            overall_kappa = cohen_kappa_score(pooled_rater1, pooled_rater2, labels=[0, 1])
+        except Exception:
+            overall_kappa = float('nan')
+        if not np.isnan(overall_kappa):
+            kappa_scores['overall'] = overall_kappa
+
+    return kappa_scores
+
+def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, Dict[str, float]]:
     # Load both JSON files
     data1 = load_json_file(file1_path)
     data2 = load_json_file(file2_path)
@@ -36,10 +107,10 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
     # Find texts present in both files
     common_texts = set(annotations1.keys()) & set(annotations2.keys())
 
-    # Analyze common texts
-    total_comparisons = 0
-    matching_annotations = 0
+    # Calculate Cohen's Kappa scores
+    kappa_scores = calculate_cohens_kappa(annotations1, annotations2, common_texts)
 
+    # Analyze common texts
     for text in common_texts:
         annot1 = annotations1[text]
         annot2 = annotations2[text]
@@ -49,7 +120,6 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
                 'text': text,
                 'annotations': list(annot1)
             })
-            matching_annotations += 1
         else:
             comparison_results['different_annotations'].append({
                 'text': text,
@@ -59,7 +129,6 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
                 'unique_to_file1': list(annot1 - annot2),
                 'unique_to_file2': list(annot2 - annot1)
             })
-        total_comparisons += 1
 
     # Find texts unique to each file
     for text in set(annotations1.keys()) - common_texts:
@@ -74,10 +143,7 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
             'annotations': list(annotations2[text])
         })
 
-    # Calculate agreement score
-    agreement_score = matching_annotations / total_comparisons if total_comparisons > 0 else 0
-
-    return comparison_results, agreement_score
+    return comparison_results, kappa_scores
 
 def analyze_annotation_distribution(data: List[dict]) -> Dict[str, int]:
     distribution = defaultdict(int)
@@ -112,26 +178,46 @@ def main():
             compare_file_path = os.path.join(base_dir, compare_dir, filename)
 
             if os.path.exists(compare_file_path):
-                comparison_results, agreement_score = compare_annotations(
+                comparison_results, kappa_scores = compare_annotations(
                     human_file_path, compare_file_path
                 )
 
+                # Add overall results
                 results.append({
                     'filename': filename,
                     'comparison': f'human_vs_{compare_dir}',
-                    'agreement_score': agreement_score,
+                    'overall_kappa': kappa_scores.get('overall', 0.0),
                 })
+
+                # Add per-technique results
+                for technique, score in kappa_scores.items():
+                    if technique != 'overall':
+                        results.append({
+                            'filename': filename,
+                            'comparison': f'human_vs_{compare_dir}',
+                            'technique': technique,
+                            'kappa': score,
+                        })
 
-    # Save results to a CSV file
-    output_file = os.path.join(base_dir, 'annotation_comparison_results.csv')
+    # Save results to CSV files
+    overall_output = os.path.join(base_dir, 'annotation_comparison_overall.csv')
+    technique_output = os.path.join(base_dir, 'annotation_comparison_by_technique.csv')
+
+    # Write overall results
+    overall_results = [r for r in results if 'technique' not in r]
+    with open(overall_output, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'overall_kappa'])
+        writer.writeheader()
+        writer.writerows(overall_results)
 
-    # Write to CSV
-    with open(output_file, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'agreement_score'])
+    # Write technique-specific results
+    technique_results = [r for r in results if 'technique' in r]
+    with open(technique_output, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'technique', 'kappa'])
         writer.writeheader()
-        writer.writerows(results)
+        writer.writerows(technique_results)
 
-    print(f"Results have been saved to: {output_file}")
+    print(f"Results have been saved to: {overall_output} and {technique_output}")
 
 if __name__ == "__main__":
     main()