lyy1994 · lyy1994 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/src/helm/benchmark/augmentations/cleva_perturbation.py b/src/helm/benchmark/augmentations/cleva_perturbation.py
@@ -43,7 +43,15 @@ class Description(PerturbationDescription):
     name: str = "chinese_typos"
 
     # For downloading resources
-    ASSET_URL = "https://drive.google.com/uc?id=1p5mldLpKxI-63H8YEruGJghtD1dZJI8k"
+    ASSET_URL = "http://39.108.215.175/assets/butter_finger"
+    FILE_NAMES: List[str] = [
+        "pinyin_to_char.json",
+        "toneless_pinyin_to_char.json",
+        "pinyin_to_common_char.json",
+        "toneless_pinyin_to_common_char.json",
+        "pinyin_to_word.json",
+        "toneless_pinyin_to_word.json",
+    ]
 
     def __init__(
         self,
@@ -62,8 +70,11 @@ def __init__(
 
         # Ensure all necessary data are downloaded
         output_dir = os.path.join("benchmark_output", "perturbations", self.name)
-        ensure_directory_exists(os.path.dirname(output_dir))
-        ensure_file_downloaded(source_url=self.ASSET_URL, target_path=output_dir, unpack=True, unpack_type="unzip")
+        ensure_directory_exists(output_dir)
+        for filename in self.FILE_NAMES:
+            target_path = os.path.join(output_dir, filename)
+            SOURCE_URL: str = f"{self.ASSET_URL}/{filename}"
+            ensure_file_downloaded(source_url=SOURCE_URL, target_path=target_path)
 
         # Load the data for the perturbation
         with open(
@@ -285,7 +296,7 @@ class Description(PerturbationDescription):
     name: str = "chinese_synonym"
 
     # For downloading resources
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1gXyZjoUw6yRjrsrh9ERzB_gxVluMTvij"
+    SOURCE_URL: str = "http://39.108.215.175/assets/synonyms.json"
 
     def __init__(self, prob: float, trial_num: int = 10):
         # Assign parameters to instance variables
@@ -294,7 +305,7 @@ def __init__(self, prob: float, trial_num: int = 10):
 
         target_dir = os.path.join("benchmark_output", "perturbations", self.name, "synonyms.json")
         ensure_directory_exists(os.path.dirname(target_dir))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_dir)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_dir)
         with open(os.path.join(target_dir)) as f:
             self.synonym_dict: Dict[str, List[str]] = json.load(f)
 
@@ -377,7 +388,7 @@ class ChineseGenderPerturbation(Perturbation):
     MODES = [GENDER_TERM, GENDER_PRONOUN]
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1tJ5GLKboQrpzzBYTnFxeRuCOBxYhjFLp"
+    SOURCE_URL: str = "http://39.108.215.175/assets/gender_term.txt"
 
     @dataclass(frozen=True)
     class Description(PerturbationDescription):
@@ -424,7 +435,7 @@ class must be one of the genders in it. If not, it must be
 
             target_path = os.path.join("benchmark_output", "perturbations", self.name, "gender_term.txt")
             ensure_directory_exists(os.path.dirname(target_path))
-            ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+            ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
             with open(target_path) as fin:
                 for line in fin.readlines():
                     splits: List[str] = line.strip("\n").split(" ")
@@ -480,7 +491,7 @@ class ChinesePersonNamePerturbation(Perturbation):
     should_perturb_references: bool = True
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1nKnfsxREkScrNOyhqiFxP5F1SjRgk6r8"
+    SOURCE_URL: str = "http://39.108.215.175/assets/chinese_name_gender.json"
     OUTPUT_PATH = os.path.join("benchmark_output", "perturbations", name)
 
     """ Gender categories """
@@ -545,7 +556,7 @@ def __init__(
 
         target_path = os.path.join("benchmark_output", "perturbations", self.name, "chinese_name_gender.json")
         ensure_directory_exists(os.path.dirname(target_path))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(os.path.join(target_path), "r", encoding="utf-8") as f:
             self.gender2name: Dict[str, List[str]] = json.load(f)
             del self.gender2name["unknown"]
@@ -715,7 +726,7 @@ class MandarinToCantonesePerturbation(Perturbation):
     should_perturb_references: bool = True
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1vljbwq0hTm7W1tz74gjPnONWJ6kSEwK2"
+    SOURCE_URL: str = "http://39.108.215.175/assets/simplified_jyutping_conversion.json"
 
     @property
     def description(self) -> PerturbationDescription:
@@ -733,7 +744,7 @@ def __init__(
 
         target_path = os.path.join("benchmark_output", "perturbations", self.name, "conversion.json")
         ensure_directory_exists(os.path.dirname(target_path))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(target_path) as fin:
             self.phrase_table = json.load(fin)
 

diff --git a/src/helm/benchmark/metrics/cleva_harms_metrics.py b/src/helm/benchmark/metrics/cleva_harms_metrics.py
@@ -36,15 +36,16 @@ class CLEVABiasMetric(BiasMetric):
     name: str = "chinese_bias"
 
     """ Chinese resources. """
-    FILE_TO_URL: Dict[str, str] = {
-        "asian_word_list.txt": "https://drive.google.com/uc?id=1Enm7x1nj5o5DMXdSD3WFqr47F09QgFaM",
-        "hispanic_word_list.txt": "https://drive.google.com/uc?id=1-JAUBxA0IGiJ0j9nL6xJ8wCP9o-Gh8sS",
-        "white_word_list.txt": "https://drive.google.com/uc?id=1jbKhE4stKs8VyYmvQjx4aXq_m6oSccHp",
-        "male_word_list.txt": "https://drive.google.com/uc?id=1xWEsIYzXvQvOlpcID_zYBqfItIrVDIam",
-        "female_word_list.txt": "https://drive.google.com/uc?id=1Q5e18NfMCqPxdy7mBIteMqPHrJmOs7s_",
-        "profession_word_list.txt": "https://drive.google.com/uc?id=1baBsev6ippugLwUCQ8lHnLaOSBDstsj3",
-        "adjective_word_list.txt": "https://drive.google.com/uc?id=1s-jgE6OW-btc921GX9Aos0EhIbYwmBAT",
-    }
+    ASSET_URL = "http://39.108.215.175/assets"
+    FILE_NAMES: List[str] = [
+        "asian_word_list.txt",
+        "hispanic_word_list.txt",
+        "white_word_list.txt",
+        "male_word_list.txt",
+        "female_word_list.txt",
+        "profession_word_list.txt",
+        "adjective_word_list.txt",
+    ]
 
     def __repr__(self):
         return (
@@ -71,9 +72,9 @@ def __init__(self, mode: str, demographic_category: str, target_category: Option
         # Ensure all necessary data are downloaded
         self.output_dir = os.path.join("benchmark_output", "metrics", self.name)
         ensure_directory_exists(self.output_dir)
-        for FILENAME, URL in self.FILE_TO_URL.items():
-            target_path = os.path.join(self.output_dir, FILENAME)
-            ensure_file_downloaded(source_url=URL, target_path=target_path)
+        for filename in self.FILE_NAMES:
+            target_path = os.path.join(self.output_dir, filename)
+            ensure_file_downloaded(source_url=f"{self.ASSET_URL}/{filename}", target_path=target_path)
 
         # Overwrite inherited mappings
         self.build_mappings()

diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
@@ -2380,7 +2380,7 @@ def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
 def get_cleva_spec(task: str, version: str, subtask: str = None, prompt_id: int = 0) -> RunSpec:
     from .scenarios.cleva_scenario import CLEVAScenario  # noqa
 
-    CLEVAScenario.download_dataset()
+    CLEVAScenario.download_dataset(task, version)
 
     _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
     inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)

diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py
@@ -16,7 +16,7 @@
 from .code_scenario import CodeReference, CodeInstance
 
 
-CLEVA_DATA_URL = "https://drive.google.com/uc?id=1uteSvq2dOgsmutOOwEziQd_d9i5Ypan6&confirm=t"
+CLEVA_DATA_URL = "http://39.108.215.175/data"
 CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"
 
 
@@ -410,10 +410,20 @@ def task(self) -> str:
         pass
 
     @classmethod
-    def download_dataset(cls):
-        target_dir = os.path.join(CLEVA_DATA_PATH, "data")
-        ensure_directory_exists(CLEVA_DATA_PATH)
-        ensure_file_downloaded(source_url=CLEVA_DATA_URL, target_path=target_dir, unpack=True, unpack_type="untar")
+    def download_dataset(cls, task: str, version: str):
+        """
+        Fetch the archive of one CLEVA task and unpack it into the local cache.
+
+        The archive is requested from `{CLEVA_DATA_URL}/{version}/{task}.zip` and
+        handed to `ensure_file_downloaded` with `unpack=True`, targeting
+        `{CLEVA_DATA_PATH}/data/{version}/{task}`.
+        """
+        version_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version)
+        ensure_directory_exists(version_dir)
+
+        archive_url: str = f"{CLEVA_DATA_URL}/{version}/{task}.zip"
+        task_dir: str = os.path.join(version_dir, task)
+        ensure_file_downloaded(source_url=archive_url, target_path=task_dir, unpack=True)
 
     def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
         data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task)