Verbose output for more tasks (Stability-AI#92)

* Add output to jaqket v2 * Add details to jsquad * Add verbose output to xlsum --------- Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
polm-stability · Oct 11, 2023 · ec0e2a5 · ec0e2a5
1 parent 1547fc7
commit ec0e2a5
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 3 deletions.
diff --git a/lm_eval/tasks/ja/jaqket_v2.py b/lm_eval/tasks/ja/jaqket_v2.py
@@ -279,7 +279,7 @@ def process_results(self, doc, results):
             "id": doc["qid"],
             "answers": doc["answers"],
         }
-        return {
+        out = {
             "exact_match": (
                 predictions,
                 references,
@@ -290,6 +290,16 @@ def process_results(self, doc, results):
             ),  # The F-score of predicted tokens versus the gold answer
         }
 
+        # add details. Because the metric computation isn't simple (probably?)
+        # always include it.
+        out["details"] = {
+            "question": doc["question"],
+            "response": continuation,
+            "gold": doc["answers"]
+        }
+
+        return out
+
 
     def aggregation(self):
         return {

diff --git a/lm_eval/tasks/ja/jsquad.py b/lm_eval/tasks/ja/jsquad.py
@@ -151,7 +151,7 @@ def process_results(self, doc, results):
             "id": doc["id"],
             "answers": doc["answers"],
         }
-        return {
+        out = {
             "exact_match": (
                 predictions,
                 references,
@@ -162,6 +162,16 @@ def process_results(self, doc, results):
             ),  # The F-score of predicted tokens versus the gold answer
         }
 
+        # add verbose output
+        out["details"] = {
+            "question": doc["question"],
+            "response": continuation,
+            "gold": doc["answers"]
+        }
+
+        return out
+
+
     def aggregation(self):
         return {
             "exact_match": partial(

diff --git a/lm_eval/tasks/ja/xlsum_ja.py b/lm_eval/tasks/ja/xlsum_ja.py
@@ -137,12 +137,21 @@ def construct_requests(self, doc, ctx):
     def process_results(self, doc, results):
         continuation = results[0]
         ground_truth = doc["summary"]
-        return {
+        out = {
             "rouge2": (
                 continuation,
                 ground_truth,
             )
         }
+        # add verbose output
+        out["details"] = {
+            # this isn't really a question, but keeping it this way for
+            # consistency
+            "question": doc["text"],
+            "response": continuation,
+            "gold": doc["summary"]
+        }
+        return out
 
     def aggregation(self):
         return {