This repository has been archived by the owner on Dec 15, 2023. It is now read-only.

Commit
Update CLEVA harms, others and a few scenarios
lyy1994 committed Sep 25, 2023
1 parent 43fb081 commit ec2d535
Showing 1 changed file with 29 additions and 29 deletions.
58 changes: 29 additions & 29 deletions src/helm/benchmark/static/schema.yaml
@@ -1415,7 +1415,7 @@ metrics:
- name: cleva_top1_accuracy
display_name: Chinese Top-1 Accuracy
short_display_name: Acc@Top-1 (Chinese)
- description: A special accuracy [(Patel and Pavlick, 2008)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
+ description: A special accuracy [(Patel and Pavlick, 2022)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
lower_is_better: false
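The substring-match criterion in the description above can be sketched in a few lines of Python. `cleva_top1_accuracy` here is a hypothetical helper for illustration, not the actual HELM implementation:

```python
def cleva_top1_accuracy(prediction: str, references: list[str]) -> float:
    """Score 1.0 if any gold answer appears as a substring of the model's
    most confident prediction, else 0.0 (a sketch of the criterion above)."""
    return 1.0 if any(ref in prediction for ref in references) else 0.0

# The gold answer need only appear somewhere inside the prediction.
score = cleva_top1_accuracy("答案是:北京。", ["北京"])  # → 1.0
```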
- name: cleva_machine_translation_bleu
display_name: BLEU
@@ -3041,10 +3041,10 @@ run_groups:
main_split: test
taxonomy:
task: translation
- what: "?"
+ what: news
who: "?"
- when: "?"
- language: Chinese or English
+ when: 2022 or after
+ language: bilingual
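The machine-translation run group above is scored with the `cleva_machine_translation_bleu` metric. As a rough illustration of what BLEU measures, here is a minimal sentence-level sketch (uniform n-gram weights, brevity penalty, no smoothing); the real implementation likely uses a standard library such as sacrebleu and will differ in detail:

```python
import math
from collections import Counter

def bleu(candidate: list[str], reference: list[str], max_n: int = 4) -> float:
    """Minimal single-reference BLEU sketch: geometric mean of modified
    n-gram precisions (n = 1..max_n) times a brevity penalty."""
    precisions = []
    for n in range(1, max_n + 1):
        cand = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))
        ref = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
        overlap = sum(min(c, ref[g]) for g, c in cand.items())
        if overlap == 0:
            return 0.0  # any zero precision collapses the geometric mean
        precisions.append(overlap / sum(cand.values()))
    # Brevity penalty discourages overly short candidates.
    bp = 1.0 if len(candidate) > len(reference) else math.exp(1 - len(reference) / max(len(candidate), 1))
    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)
```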

- name: cleva_data_to_text_generation
display_name: CLEVA (Chinese) data to text generation
@@ -3262,7 +3262,7 @@ run_groups:
# Knowledge
- name: cleva_subject_knowledge
display_name: CLEVA (Chinese) subject knowledge
- description: CLEVA (Chinese) subject knowledge
+ description: CLEVA (Chinese) subject knowledge is inspired by [Petroni et al. (2019)](https://aclanthology.org/D19-1250/) to extensively test factual knowledge in Chinese. It contains 13 subjects and a general domain.
metric_groups:
- accuracy
- calibration
@@ -3275,7 +3275,7 @@ run_groups:
task: knowledge base completion
what: entity-relation-entity triples in natural language form
who: automatically generated from templates
- when: "?"
+ when: 2022 or before
language: structured Chinese
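The taxonomy above says the subject-knowledge prompts are generated from templates over entity-relation-entity triples. A minimal sketch of that generation step, with a hypothetical template table (the actual CLEVA templates and relation names differ):

```python
# Hypothetical relation-to-template table; real CLEVA templates differ.
TEMPLATES = {"capital_of": "{head}的首都是{tail}。"}

def verbalize(head: str, relation: str, tail: str) -> str:
    """Turn an entity-relation-entity triple into a natural-language probe,
    masking the tail entity for the model to complete."""
    sentence = TEMPLATES[relation].format(head=head, tail=tail)
    return sentence.replace(tail, "____")

print(verbalize("中国", "capital_of", "北京"))
# → 中国的首都是____。
```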

- name: cleva_cultural_knowledge
@@ -3290,7 +3290,7 @@ run_groups:
main_name: exact_match
main_split: test
taxonomy:
- task: "?"
+ task: multiple-choice
what: "?"
who: "?"
when: "?"
@@ -3406,7 +3406,7 @@ run_groups:

- name: cleva_conceptual_generalization
display_name: CLEVA (Chinese) conceptual generalization
- description: CLEVA (Chinese) conceptual generalization
+ description: CLEVA (Chinese) conceptual generalization assesses whether LLMs can generalize physical relations to a synthetic grid world, following [Patel and Pavlick (2022)](https://openreview.net/pdf?id=gJcEM8sxHK).
metric_groups:
- calibration
- efficiency
@@ -3417,15 +3417,15 @@ run_groups:
main_split: test
taxonomy:
task: next-word prediction
- what: n/a
- who: n/a
+ what: "?"
+ who: automatically generated by algorithm
when: n/a
language: synthetic

# Harms
- name: cleva_toxicity_detection
display_name: CLEVA (Chinese) toxicity detection
- description: CLEVA (Chinese) toxicity detection
+ description: CLEVA (Chinese) toxicity detection asks LLMs about the offensiveness of the given text [(Deng et al., 2022)](https://aclanthology.org/2022.emnlp-main.796/).
metric_groups:
- accuracy
- calibration
@@ -3436,14 +3436,14 @@ run_groups:
main_split: test
taxonomy:
task: toxicity classification
- what: "?"
+ what: text from Chinese social media
who: "?"
when: "?"
language: Chinese

- name: cleva_bias
display_name: CLEVA (Chinese) bias
- description: CLEVA (Chinese) bias
+ description: CLEVA (Chinese) bias follows [Zhou et al. (2022)](https://arxiv.org/abs/2202.08011) to gauge bias in dialogues across four demographic categories: race, gender, region, and occupation.
metric_groups:
- accuracy
- calibration
@@ -3454,15 +3454,15 @@ run_groups:
main_name: exact_match
main_split: test
taxonomy:
- task: "?"
- what: "?"
+ task: multiple-choice
+ what: short Chinese dialogues from social media
who: "?"
when: "?"
language: Chinese

- name: cleva_copyright
display_name: CLEVA (Chinese) copyright
- description: CLEVA (Chinese) copyright
+ description: CLEVA (Chinese) copyright measures copyright and memorization behavior for Chinese books and code (in the 'text' and 'code' subtasks respectively), based on [Carlini et al. (2021)](https://www.usenix.org/biblio-11958).
metric_groups:
- copyright_metrics
- general_information
@@ -3478,7 +3478,7 @@ run_groups:

- name: cleva_fact_checking
display_name: CLEVA (Chinese) fact checking
- description: CLEVA (Chinese) fact checking
+ description: CLEVA (Chinese) fact checking asks LLMs to judge whether a given fact is true, testing their factuality [(Hu et al., 2022)](https://github.com/THU-BPM/CHEF).
metric_groups:
- accuracy
- calibration
@@ -3489,16 +3489,16 @@ run_groups:
main_name: exact_match
main_split: test
taxonomy:
- task: "?"
- what: "?"
+ task: multiple-choice
+ what: factual statements in natural language form
who: "?"
when: "?"
language: Chinese

# Others
- name: cleva_instruction_following
display_name: CLEVA (Chinese) instruction following
- description: CLEVA (Chinese) instruction following
+ description: "CLEVA (Chinese) instruction following examines whether LLMs can follow human instructions, mainly uncommon ones. It contains two subtasks, 'redefine' and 'pattern_matching_suppression', following [McKenzie et al. (2023)](https://arxiv.org/abs/2306.09479)."
metric_groups:
- accuracy
- calibration
@@ -3508,15 +3508,15 @@ run_groups:
main_name: exact_match
main_split: test
taxonomy:
- task: "?"
- what: "?"
- who: "?"
- when: "?"
- language: Chinese
+ task: multiple-choice
+ what: natural language questions
+ who: automatically generated from templates
+ when: n/a
+ language: synthetic

- name: cleva_mathematical_calculation
display_name: CLEVA (Chinese) mathematical calculation
- description: CLEVA (Chinese) mathematical calculation
+ description: "CLEVA (Chinese) mathematical calculation scenario evaluates the calculation ability of LLMs. It has four subtasks: 'add' (three-digit addition), 'sub' (three-digit subtraction), 'mul' (two-digit multiplication), and 'significant_figures'."
metric_groups:
- accuracy
- calibration
@@ -3527,10 +3527,10 @@ run_groups:
main_split: test
taxonomy:
task: next-word prediction
- what: n/a
- who: n/a
+ what: natural language math questions or pure math expressions
+ who: automatically generated from templates
when: n/a
- language: Chinese or Math Expression
+ language: synthetic

## Aspirational scenarios
# Task coverage
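Much of this commit replaces `"?"` placeholders in the `taxonomy` blocks with concrete values. A small sketch of how one might scan a parsed schema for the remaining placeholders (plain dicts stand in for the loaded YAML; field and group names below are illustrative):

```python
def unfilled_taxonomy_fields(run_groups):
    """Return (run_group_name, field) pairs whose taxonomy value is still
    the "?" placeholder, i.e. entries like those this commit fills in."""
    issues = []
    for group in run_groups:
        for field, value in group.get("taxonomy", {}).items():
            if value == "?":
                issues.append((group["name"], field))
    return issues

# Illustrative data shaped like schema.yaml's run_groups entries.
run_groups = [
    {"name": "cleva_fact_checking",
     "taxonomy": {"task": "multiple-choice", "what": "?", "who": "?"}},
    {"name": "cleva_bias",
     "taxonomy": {"task": "multiple-choice",
                  "what": "short Chinese dialogues from social media"}},
]
print(unfilled_taxonomy_fields(run_groups))
# → [('cleva_fact_checking', 'what'), ('cleva_fact_checking', 'who')]
```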
