Skip to content

Commit

Permalink
kohen kappa
Browse files Browse the repository at this point in the history
  • Loading branch information
Luncenok committed Nov 28, 2024
1 parent 811c671 commit cc3ec86
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 19 deletions.
77 changes: 77 additions & 0 deletions data/annotation_comparison_by_technique.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
filename,comparison,technique,kappa
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.06896551724137934
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,Distraction,-0.12499999999999978
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.11764705882352944
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.7272727272727273
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Distraction,0.6341463414634146
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,Projection,0.5833333333333333
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.07246376811594202
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.4117647058823529
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,1.0
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Distraction,0.4482758620689655
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Projection,0.6363636363636364
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,Gaslighting,-0.06666666666666665
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.3846153846153846
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,Strategic Voting Suggestion,-0.0714285714285714
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,0.9
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,0.45945945945945954
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Distraction,0.7727272727272727
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,0.6153846153846154
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.2222222222222222
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,-0.2790697674418605
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,0.16666666666666663
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Distraction,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,Projection,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.36363636363636365
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Relationship,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.41860465116279066
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Appeal to Credibility,0.8571428571428572
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Projection,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,Strategic Voting Suggestion,0.6153846153846154
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Relationship,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Denial without Evidence,0.6428571428571428
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Exaggeration,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Projection,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.3870967741935484
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.8724832214765101
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Vagueness,0.7710843373493976
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Appeal to Credibility,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Gaslighting,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,Projection,0.30487804878048785
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Appeal to Logic,0.8275862068965517
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.8571428571428572
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Projection,0.34782608695652173
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,Appeal to Emotion,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Vagueness,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Credibility,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Projection,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,Appeal to Emotion,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Appeal to Logic,0.19753086419753074
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Shifting the Burden of Proof,0.17721518987341767
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Distraction,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Strategic Voting Suggestion,0.4347826086956521
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,Withholding Information,-0.1304347826086958
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,Shifting the Burden of Proof,0.4444444444444444
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,Distraction,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Appeal to Logic,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Shifting the Burden of Proof,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Distraction,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Strategic Voting Suggestion,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Gaslighting,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,Withholding Information,1.0
16 changes: 16 additions & 0 deletions data/annotation_comparison_overall.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
filename,comparison,overall_kappa
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_4o_mini_ev,0.022526146419951654
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_ev,0.48538011695906436
gpt-4o-mini_vs_llama-3-1-8b-instruct_5.json,human_vs_annotations_gemini_2_ev,0.857916102841678
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_4o_mini_ev,0.467438494934877
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_ev,0.4189531934516947
llama-3-1-405b-instruct_vs_gemini-flash-1-5_3.json,human_vs_annotations_gemini_2_ev,0.8030242737763629
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_4o_mini_ev,0.439826526924467
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_ev,0.5777351247600768
claude-3-5-haiku_vs_gpt-4o_1.json,human_vs_annotations_gemini_2_ev,0.9847978108847674
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_4o_mini_ev,0.5947470474175921
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_ev,0.6987951807228916
gpt-4o_vs_gpt-4o-mini_1.json,human_vs_annotations_gemini_2_ev,1.0
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_4o_mini_ev,0.27929441411171774
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_ev,0.3396226415094339
gemini-flash-1-5_vs_claude-3-5-sonnet_5.json,human_vs_annotations_gemini_2_ev,0.950920245398773
124 changes: 105 additions & 19 deletions src/among_them/analysis/compare_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import csv
from collections import defaultdict
from typing import Dict, List, Set, Tuple
import numpy as np
from sklearn.metrics import cohen_kappa_score

def load_json_file(file_path: str) -> List[dict]:
with open(file_path, 'r') as f:
Expand All @@ -16,7 +18,76 @@ def get_annotations_by_text(data: List[dict]) -> Dict[str, Set[str]]:
annotations[text] = set(annotation_list)
return annotations

def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
def get_all_unique_annotations(annotations1: Dict[str, Set[str]], annotations2: Dict[str, Set[str]]) -> Set[str]:
    """Collect every annotation label that occurs in either mapping.

    Args:
        annotations1: Mapping whose values are the label collections of the
            first annotator.
        annotations2: Mapping whose values are the label collections of the
            second annotator.

    Returns:
        A new set holding the union of all labels found in the values of
        both mappings (empty when both mappings are empty).
    """
    # Gather the per-text label collections from both sides, then union them
    # in a single call; `set().union()` with no arguments yields an empty set.
    label_groups = [*annotations1.values(), *annotations2.values()]
    return set().union(*label_groups)

def _kappa_or_nan(rater1_scores: List[int], rater2_scores: List[int]) -> float:
    """Return Cohen's kappa for two 0/1 score vectors, or NaN if it cannot be computed."""
    try:
        return cohen_kappa_score(rater1_scores, rater2_scores, labels=[0, 1])
    except ValueError:
        # Only swallow the input-validation error; a bare `except:` here would
        # also hide KeyboardInterrupt/SystemExit and genuine programming errors.
        return float('nan')


def calculate_cohens_kappa(annotations1: Dict[str, Set[str]], annotations2: Dict[str, Set[str]],
                           common_texts: Set[str]) -> Dict[str, float]:
    """Compute per-label and pooled Cohen's kappa between two annotators.

    Each label is treated as a binary decision per text (1 if the annotator
    applied the label to that text, 0 otherwise).

    Args:
        annotations1: Labels assigned by the first annotator, keyed by text.
        annotations2: Labels assigned by the second annotator, keyed by text.
        common_texts: Texts to compare; every entry must be a key of both
            ``annotations1`` and ``annotations2``.

    Returns:
        A dict mapping each label to its kappa, plus an ``'overall'`` entry
        computed over all (text, label) cells pooled together. A label is
        omitted when either annotator used it on none or on all of the common
        texts (no variation), or when its kappa is NaN. ``'overall'`` is
        always present and falls back to ``0.0`` when it cannot be computed.

    Raises:
        KeyError: If a text in ``common_texts`` is missing from either mapping.
    """
    # NOTE(review): the label universe is built from *all* texts in both
    # mappings, not only the common ones, so labels used solely on non-common
    # texts still contribute 0/0 cells to the pooled score below - confirm
    # this is intended.
    all_annotations = get_all_unique_annotations(annotations1, annotations2)

    # Freeze the iteration order once so both raters' vectors stay aligned.
    texts = list(common_texts)
    n_texts = len(texts)

    kappa_scores = {}

    for annotation in all_annotations:
        rater1_scores = [1 if annotation in annotations1[text] else 0 for text in texts]
        rater2_scores = [1 if annotation in annotations2[text] else 0 for text in texts]
        count_rater1 = sum(rater1_scores)
        count_rater2 = sum(rater2_scores)

        # Kappa is only meaningful when both raters produced at least one
        # positive and one negative decision for this label.
        if not (0 < count_rater1 < n_texts and 0 < count_rater2 < n_texts):
            continue

        kappa = _kappa_or_nan(rater1_scores, rater2_scores)
        if not np.isnan(kappa):
            kappa_scores[annotation] = kappa

    # Pool every (text, label) cell into one long pair of binary vectors.
    all_rater1_scores = [1 if annotation in annotations1[text] else 0
                         for text in texts for annotation in all_annotations]
    all_rater2_scores = [1 if annotation in annotations2[text] else 0
                         for text in texts for annotation in all_annotations]

    overall_kappa = 0.0
    # Same variation requirement as above, applied to the pooled vectors.
    if len(set(all_rater1_scores)) > 1 and len(set(all_rater2_scores)) > 1:
        pooled = _kappa_or_nan(all_rater1_scores, all_rater2_scores)
        if not np.isnan(pooled):
            overall_kappa = pooled
    kappa_scores['overall'] = overall_kappa

    return kappa_scores

def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, Dict[str, float]]:
# Load both JSON files
data1 = load_json_file(file1_path)
data2 = load_json_file(file2_path)
Expand All @@ -36,10 +107,10 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
# Find texts present in both files
common_texts = set(annotations1.keys()) & set(annotations2.keys())

# Analyze common texts
total_comparisons = 0
matching_annotations = 0
# Calculate Cohen's Kappa scores
kappa_scores = calculate_cohens_kappa(annotations1, annotations2, common_texts)

# Analyze common texts
for text in common_texts:
annot1 = annotations1[text]
annot2 = annotations2[text]
Expand All @@ -49,7 +120,6 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
'text': text,
'annotations': list(annot1)
})
matching_annotations += 1
else:
comparison_results['different_annotations'].append({
'text': text,
Expand All @@ -59,7 +129,6 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
'unique_to_file1': list(annot1 - annot2),
'unique_to_file2': list(annot2 - annot1)
})
total_comparisons += 1

# Find texts unique to each file
for text in set(annotations1.keys()) - common_texts:
Expand All @@ -74,10 +143,7 @@ def compare_annotations(file1_path: str, file2_path: str) -> Tuple[dict, float]:
'annotations': list(annotations2[text])
})

# Calculate agreement score
agreement_score = matching_annotations / total_comparisons if total_comparisons > 0 else 0

return comparison_results, agreement_score
return comparison_results, kappa_scores

def analyze_annotation_distribution(data: List[dict]) -> Dict[str, int]:
distribution = defaultdict(int)
Expand Down Expand Up @@ -112,26 +178,46 @@ def main():
compare_file_path = os.path.join(base_dir, compare_dir, filename)

if os.path.exists(compare_file_path):
comparison_results, agreement_score = compare_annotations(
comparison_results, kappa_scores = compare_annotations(
human_file_path, compare_file_path
)

# Add overall results
results.append({
'filename': filename,
'comparison': f'human_vs_{compare_dir}',
'agreement_score': agreement_score,
'overall_kappa': kappa_scores.get('overall', 0.0),
})

# Add per-technique results
for technique, score in kappa_scores.items():
if technique != 'overall':
results.append({
'filename': filename,
'comparison': f'human_vs_{compare_dir}',
'technique': technique,
'kappa': score,
})

# Save results to a CSV file
output_file = os.path.join(base_dir, 'annotation_comparison_results.csv')
# Save results to CSV files
overall_output = os.path.join(base_dir, 'annotation_comparison_overall.csv')
technique_output = os.path.join(base_dir, 'annotation_comparison_by_technique.csv')

# Write overall results
overall_results = [r for r in results if 'technique' not in r]
with open(overall_output, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'overall_kappa'])
writer.writeheader()
writer.writerows(overall_results)

# Write to CSV
with open(output_file, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'agreement_score'])
# Write technique-specific results
technique_results = [r for r in results if 'technique' in r]
with open(technique_output, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['filename', 'comparison', 'technique', 'kappa'])
writer.writeheader()
writer.writerows(results)
writer.writerows(technique_results)

print(f"Results have been saved to: {output_file}")
print(f"Results have been saved to: {overall_output} and {technique_output}")

if __name__ == "__main__":
main()

0 comments on commit cc3ec86

Please sign in to comment.