huggingface · lhoestq · Dec 8, 2021 · Dec 3, 2021 · Dec 3, 2021 · Dec 3, 2021
diff --git a/datasets/clue/README.md b/datasets/clue/README.md
@@ -95,9 +95,9 @@ This example was too long and was cropped:
 
 #### chid
 
-- **Size of downloaded dataset files:** 127.15 MB
-- **Size of the generated dataset:** 259.71 MB
-- **Total amount of disk used:** 386.86 MB
+- **Size of downloaded dataset files:** 132.75 MB
+- **Size of the generated dataset:** 261.38 MB
+- **Total amount of disk used:** 394.13 MB
 
 An example of 'train' looks as follows.
 ```
@@ -116,9 +116,9 @@ This example was too long and was cropped:
 
 #### cluewsc2020
 
-- **Size of downloaded dataset files:** 0.08 MB
-- **Size of the generated dataset:** 0.41 MB
-- **Total amount of disk used:** 0.49 MB
+- **Size of downloaded dataset files:** 0.27 MB
+- **Size of the generated dataset:** 0.98 MB
+- **Total amount of disk used:** 1.23 MB
 
 An example of 'train' looks as follows.
 ```

diff --git a/datasets/clue/clue.py b/datasets/clue/clue.py
@@ -407,12 +407,18 @@ def _info(self):
     def _split_generators(self, dl_manager):
         dl_dir = dl_manager.download_and_extract(self.config.data_url)
         data_dir = os.path.join(dl_dir, self.config.data_dir)
+
+        if self.config.name in {"chid", "c3"}:
+            test_file = "test1.1.json"
+        elif self.config.name == "diagnostics":
+            test_file = "diagnostics_test.json"
+        else:
+            test_file = "test.json"
+
         test_split = datasets.SplitGenerator(
             name=datasets.Split.TEST,
             gen_kwargs={
-                "data_file": os.path.join(
-                    data_dir, "test.json" if self.config.name != "diagnostics" else "diagnostics_test.json"
-                ),
+                "data_file": os.path.join(data_dir, test_file),
                 "split": "test",
             },
         )
@@ -472,15 +478,15 @@ def _generate_examples(self, data_file, split):
                 data_subset = json.load(open(f, encoding="utf8"))
                 data += data_subset
             for idx, entry in enumerate(data):
-                for question in entry[1]:
+                for qidx, question in enumerate(entry[1]):
                     example = {
                         "id": idx if split != "test" else int(question["id"]),
                         "context": entry[0],
                         "question": question["question"],
                         "choice": question["choice"],
                         "answer": question["answer"] if split != "test" else "",
                     }
-                    yield example["id"], example
+                    yield f"{idx}_{qidx}", example
 
         else:
             with open(data_file, encoding="utf8") as f:

diff --git a/datasets/clue/dataset_infos.json b/datasets/clue/dataset_infos.json
diff --git a/datasets/clue/dummy/c3/1.0.0/dummy_data.zip b/datasets/clue/dummy/c3/1.0.0/dummy_data.zip
diff --git a/datasets/clue/dummy/chid/1.0.0/dummy_data.zip b/datasets/clue/dummy/chid/1.0.0/dummy_data.zip