Stability-AI · polm-stability · Oct 5, 2023 · Oct 4, 2023 · Oct 5, 2023 · Oct 5, 2023
@@ -13,6 +13,7 @@
 import torch.nn.functional as F
 
 from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
+from lm_eval.metrics import balanced_mean
 from lm_eval import utils
 from abc import abstractmethod
 
@@ -709,6 +710,42 @@ def aggregation(self):
             "acc_norm": mean,
         }
 
class BalancedMultipleChoiceTask(MultipleChoiceTask):
    """Multiple-choice task that additionally reports class-balanced accuracy.

    Meant for tasks whose answer choices are identical for every question, so
    the gold index doubles as a class label and accuracy can be computed
    separately for each class. It was first written for marc-ja, whose labels
    are severely imbalanced, but it is also useful on less skewed datasets.
    It is not suitable for datasets where the choices change per question.
    """

    def process_results(self, doc, results):
        """Score a single document.

        Reads ``doc["gold"]`` (index of the correct choice) and
        ``doc["choices"]``; ``results`` holds one score per choice.

        Returns a dict with:
          * ``"acc"``: 1.0 if the highest-scoring choice is the gold one,
            else 0.0.
          * ``"acc_norm"``: same, after dividing each score by ``len()`` of
            the corresponding choice.
          * ``"balanced_acc"``: the pair ``(acc, gold)``, which the
            ``balanced_acc`` aggregation groups by gold label.
        """
        target = doc["gold"]

        plain_hit = 1.0 if np.argmax(results) == target else 0.0

        # Length of every choice, as floats, used to normalise the scores.
        choice_lengths = np.array(
            [len(choice) for choice in doc["choices"]], dtype=float
        )
        normalised_pick = np.argmax(results / choice_lengths)
        normalised_hit = 1.0 if normalised_pick == target else 0.0

        scores = {}
        scores["acc"] = plain_hit
        scores["acc_norm"] = normalised_hit
        # Keep the label next to the score so it can be grouped per class.
        scores["balanced_acc"] = (plain_hit, target)
        return scores

    def higher_is_better(self):
        """Return a mapping saying that every reported metric should be maximised."""
        return dict.fromkeys(("acc", "acc_norm", "balanced_acc"), True)

    def aggregation(self):
        """Return the function used to aggregate each metric over documents."""
        aggregators = {name: mean for name in ("acc", "acc_norm")}
        aggregators["balanced_acc"] = balanced_mean
        return aggregators
+
+
 
 class PerplexityTask(Task, abc.ABC):
     def should_decontaminate(self):

@@ -5,6 +5,7 @@
 import sacrebleu
 import sklearn.metrics
 import random
+from collections import defaultdict
 
 
 def mean(arr):
@@ -29,6 +30,22 @@ def median(arr):
     return arr[len(arr) // 2]
 
 
def balanced_mean(arr):
    """Return the unweighted average of the per-class mean scores.

    Each entry of ``arr`` is an ``(acc score, class label)`` pair. Scores are
    first grouped by label and averaged within each group; the result is the
    plain mean of those group averages, so every class contributes equally
    regardless of how many entries it has.

    Args:
        arr: iterable of ``(score, label)`` pairs; labels must be hashable.

    Returns:
        The mean of the class-wise mean scores.

    Raises:
        ZeroDivisionError: if ``arr`` contains no entries.
    """
    # Group the scores by their class label.
    scores_by_class = defaultdict(list)
    for score, label in arr:
        scores_by_class[label].append(score)

    # Only the grouped scores matter from here on; the labels are not needed.
    class_means = [sum(scores) / len(scores) for scores in scores_by_class.values()]

    # Average the class values so each class carries the same weight.
    return sum(class_means) / len(class_means)
+
+
 def matthews_corrcoef(items):
     unzipped_list = list(zip(*items))
     golds = unzipped_list[0]

@@ -7,7 +7,7 @@
 
 Homepage: https://github.com/yahoojapan/JGLUE
 """
-from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.base import BalancedMultipleChoiceTask, rf
 
 _CITATION = """
 @inproceedings{kurihara-etal-2022-jglue,
@@ -28,7 +28,7 @@
 
 
 
-class JNLIWithFintanPrompt(MultipleChoiceTask):
+class JNLIWithFintanPrompt(BalancedMultipleChoiceTask):
     """
     prompt template is taken from [ChatGPT vs BERT: どちらが日本語をより理解できるのか?](https://fintan.jp/page/9126/)
     """

@@ -7,7 +7,7 @@
 
 Homepage: https://github.com/yahoojapan/JGLUE
 """
-from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.base import BalancedMultipleChoiceTask, rf
 
 _CITATION = """
 @inproceedings{kurihara-etal-2022-jglue,
@@ -28,7 +28,7 @@
 
 
 
-class MARCJaWithFintanPrompt(MultipleChoiceTask):
+class MARCJaWithFintanPrompt(BalancedMultipleChoiceTask):
     """
     prompt template is taken from [ChatGPT vs BERT: どちらが日本語をより理解できるのか?](https://fintan.jp/page/9126/)
     """
@@ -162,4 +162,4 @@ def construct_tasks():
     tasks = {}
     for version_class in VERSIONS:
         tasks[f"marc_ja-{version_class.VERSION}-{version_class.PROMPT_VERSION}"] = version_class
-    return tasks
+    return tasks