From d8e68cbd2e2303a6612ef458dcbf6036fc87355b Mon Sep 17 00:00:00 2001 From: Jianqiao-Zhao Date: Tue, 26 Sep 2023 16:03:26 +0800 Subject: [PATCH 1/5] Update All Download Links --- .../augmentations/cleva_perturbation.py | 33 ++++++++++++------- .../benchmark/metrics/cleva_harms_metrics.py | 25 +++++++------- .../benchmark/scenarios/cleva_scenario.py | 11 ++++--- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/helm/benchmark/augmentations/cleva_perturbation.py b/src/helm/benchmark/augmentations/cleva_perturbation.py index f82d6da3d8..f434075d87 100644 --- a/src/helm/benchmark/augmentations/cleva_perturbation.py +++ b/src/helm/benchmark/augmentations/cleva_perturbation.py @@ -43,7 +43,15 @@ class Description(PerturbationDescription): name: str = "chinese_typos" # For downloading resources - ASSET_URL = "https://drive.google.com/uc?id=1p5mldLpKxI-63H8YEruGJghtD1dZJI8k" + ASSET_URL = "http://39.108.215.175/assets/butter_finger" + FILE_NAMES: List[str] = [ + "pinyin_to_char.json", + "toneless_pinyin_to_char.json", + "pinyin_to_common_char.json", + "toneless_pinyin_to_common_char.json", + "pinyin_to_word.json", + "toneless_pinyin_to_word.json", + ] def __init__( self, @@ -62,8 +70,11 @@ def __init__( # Ensure all necessary data are downloaded output_dir = os.path.join("benchmark_output", "perturbations", self.name) - ensure_directory_exists(os.path.dirname(output_dir)) - ensure_file_downloaded(source_url=self.ASSET_URL, target_path=output_dir, unpack=True, unpack_type="unzip") + ensure_directory_exists(output_dir) + for filename in self.FILE_NAMES: + target_path = os.path.join(output_dir, filename) + SOURCE_URL: str = f"{self.ASSET_URL}/{filename}" + ensure_file_downloaded(source_url=SOURCE_URL, target_path=target_path) # Load the data for the perturbation with open( @@ -285,7 +296,7 @@ class Description(PerturbationDescription): name: str = "chinese_synonym" # For downloading resources - SOURCE_URI: str = "https://drive.google.com/uc?id=1gXyZjoUw6yRjrsrh9ERzB_gxVluMTvij" + SOURCE_URL: str = "http://39.108.215.175/assets/synonyms.json" def __init__(self, prob: float, trial_num: int = 10): # Assign parameters to instance variables @@ -294,7 +305,7 @@ def __init__(self, prob: float, trial_num: int = 10): target_dir = os.path.join("benchmark_output", "perturbations", self.name, "synonyms.json") ensure_directory_exists(os.path.dirname(target_dir)) - ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_dir) + ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_dir) with open(os.path.join(target_dir)) as f: self.synonym_dict: Dict[str, List[str]] = json.load(f) @@ -377,7 +388,7 @@ class ChineseGenderPerturbation(Perturbation): MODES = [GENDER_TERM, GENDER_PRONOUN] """ Resources """ - SOURCE_URI: str = "https://drive.google.com/uc?id=1tJ5GLKboQrpzzBYTnFxeRuCOBxYhjFLp" + SOURCE_URL: str = "http://39.108.215.175/assets/gender_term.txt" @dataclass(frozen=True) class Description(PerturbationDescription): @@ -424,7 +435,7 @@ class must be one of the genders in it. If not, it must be target_path = os.path.join("benchmark_output", "perturbations", self.name, "gender_term.txt") ensure_directory_exists(os.path.dirname(target_path)) - ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path) + ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path) with open(target_path) as fin: for line in fin.readlines(): splits: List[str] = line.strip("\n").split(" ") @@ -480,7 +491,7 @@ class ChinesePersonNamePerturbation(Perturbation): should_perturb_references: bool = True """ Resources """ - SOURCE_URI: str = "https://drive.google.com/uc?id=1nKnfsxREkScrNOyhqiFxP5F1SjRgk6r8" + SOURCE_URL: str = "http://39.108.215.175/assets/chinese_name_gender.json" OUTPUT_PATH = os.path.join("benchmark_output", "perturbations", name) """ Gender categories """ @@ -545,7 +556,7 @@ def __init__( target_path = os.path.join("benchmark_output", "perturbations", self.name, "chinese_name_gender.json") ensure_directory_exists(os.path.dirname(target_path)) - ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path) + ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path) with open(os.path.join(target_path), "r", encoding="utf-8") as f: self.gender2name: Dict[str, List[str]] = json.load(f) del self.gender2name["unknown"] @@ -715,7 +726,7 @@ class MandarinToCantonesePerturbation(Perturbation): should_perturb_references: bool = True """ Resources """ - SOURCE_URI: str = "https://drive.google.com/uc?id=1vljbwq0hTm7W1tz74gjPnONWJ6kSEwK2" + SOURCE_URL: str = "http://39.108.215.175/assets/simplified_jyutping_conversion.json" @property def description(self) -> PerturbationDescription: @@ -733,7 +744,7 @@ def __init__( target_path = os.path.join("benchmark_output", "perturbations", self.name, "conversion.json") ensure_directory_exists(os.path.dirname(target_path)) - ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path) + ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path) with open(target_path) as fin: self.phrase_table = json.load(fin) diff --git a/src/helm/benchmark/metrics/cleva_harms_metrics.py b/src/helm/benchmark/metrics/cleva_harms_metrics.py index c0ef6aa43f..c24fa8d662 100644 --- a/src/helm/benchmark/metrics/cleva_harms_metrics.py +++ b/src/helm/benchmark/metrics/cleva_harms_metrics.py @@ -36,15 +36,16 @@ class CLEVABiasMetric(BiasMetric): name: str = "chinese_bias" """ Chinese resources. """ - FILE_TO_URL: Dict[str, str] = { - "asian_word_list.txt": "https://drive.google.com/uc?id=1Enm7x1nj5o5DMXdSD3WFqr47F09QgFaM", - "hispanic_word_list.txt": "https://drive.google.com/uc?id=1-JAUBxA0IGiJ0j9nL6xJ8wCP9o-Gh8sS", - "white_word_list.txt": "https://drive.google.com/uc?id=1jbKhE4stKs8VyYmvQjx4aXq_m6oSccHp", - "male_word_list.txt": "https://drive.google.com/uc?id=1xWEsIYzXvQvOlpcID_zYBqfItIrVDIam", - "female_word_list.txt": "https://drive.google.com/uc?id=1Q5e18NfMCqPxdy7mBIteMqPHrJmOs7s_", - "profession_word_list.txt": "https://drive.google.com/uc?id=1baBsev6ippugLwUCQ8lHnLaOSBDstsj3", - "adjective_word_list.txt": "https://drive.google.com/uc?id=1s-jgE6OW-btc921GX9Aos0EhIbYwmBAT", - } + ASSET_URL = "http://39.108.215.175/assets" + FILE_NAMES: List[str] = [ + "asian_word_list.txt", + "hispanic_word_list.txt", + "white_word_list.txt", + "male_word_list.txt", + "female_word_list.txt", + "profession_word_list.txt", + "adjective_word_list.txt", + ] def __repr__(self): return ( @@ -71,9 +72,9 @@ def __init__(self, mode: str, demographic_category: str, target_category: Option # Ensure all necessary data are downloaded self.output_dir = os.path.join("benchmark_output", "metrics", self.name) ensure_directory_exists(self.output_dir) - for FILENAME, URL in self.FILE_TO_URL.items(): - target_path = os.path.join(self.output_dir, FILENAME) - ensure_file_downloaded(source_url=URL, target_path=target_path) + for filename in self.FILE_NAMES: + target_path = os.path.join(self.output_dir, filename) + ensure_file_downloaded(source_url=f"{self.ASSET_URL}/{filename}", target_path=target_path) # Overwrite inherited mappings self.build_mappings() diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py index 716211d987..60c35f8c41 100644 --- a/src/helm/benchmark/scenarios/cleva_scenario.py +++ b/src/helm/benchmark/scenarios/cleva_scenario.py @@ -16,7 +16,7 @@ from .code_scenario import CodeReference, CodeInstance -CLEVA_DATA_URL = "https://drive.google.com/uc?id=1uteSvq2dOgsmutOOwEziQd_d9i5Ypan6&confirm=t" +CLEVA_DATA_URL = "http://39.108.215.175/data" CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva" @@ -410,10 +410,11 @@ def task(self) -> str: pass @classmethod - def download_dataset(cls): - target_dir = os.path.join(CLEVA_DATA_PATH, "data") - ensure_directory_exists(CLEVA_DATA_PATH) - ensure_file_downloaded(source_url=CLEVA_DATA_URL, target_path=target_dir, unpack=True, unpack_type="untar") + def download_dataset(cls, task, version): + source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip" + target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version) + ensure_directory_exists(target_dir) + ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, f"{task}.zip"), unpack=True) def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]: data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task) From a3270ae1db9eb3af37d5938ebd589f476ef1dd4c Mon Sep 17 00:00:00 2001 From: Jianqiao-Zhao Date: Tue, 26 Sep 2023 16:29:25 +0800 Subject: [PATCH 2/5] Minor Debug --- src/helm/benchmark/run_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 5e5d37b3ed..9b14d837c5 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -2380,7 +2380,7 @@ def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec: def get_cleva_spec(task: str, version: str, subtask: str = None, prompt_id: int = 0) -> RunSpec: from .scenarios.cleva_scenario import CLEVAScenario # noqa - CLEVAScenario.download_dataset() + CLEVAScenario.download_dataset(task, version) _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id) inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id) From ec1aed258afd880e4e4928c02c8d62383863ba96 Mon Sep 17 00:00:00 2001 From: Jianqiao-Zhao Date: Tue, 26 Sep 2023 17:21:17 +0800 Subject: [PATCH 3/5] Minor Debug --- src/helm/benchmark/scenarios/cleva_scenario.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py index 60c35f8c41..469db99b3c 100644 --- a/src/helm/benchmark/scenarios/cleva_scenario.py +++ b/src/helm/benchmark/scenarios/cleva_scenario.py @@ -414,7 +414,7 @@ def download_dataset(cls, task, version): source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip" target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version) ensure_directory_exists(target_dir) - ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, f"{task}.zip"), unpack=True) + ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, task), unpack=True) def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]: data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task) From 1b33806edf619f1acde9f46cd7a41395adcb6bd1 Mon Sep 17 00:00:00 2001 From: Jianqiao-Zhao Date: Tue, 26 Sep 2023 17:29:51 +0800 Subject: [PATCH 4/5] Add Type Annotation --- src/helm/benchmark/scenarios/cleva_scenario.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py index 469db99b3c..0c40bd5651 100644 --- a/src/helm/benchmark/scenarios/cleva_scenario.py +++ b/src/helm/benchmark/scenarios/cleva_scenario.py @@ -410,7 +410,7 @@ def task(self) -> str: pass @classmethod - def download_dataset(cls, task, version): + def download_dataset(cls, task: str, version: str): source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip" target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version) ensure_directory_exists(target_dir) From 5c33af04f6a3eead5d482b1c17d7514bd1022003 Mon Sep 17 00:00:00 2001 From: Jianqiao-Zhao Date: Tue, 26 Sep 2023 23:10:55 +0800 Subject: [PATCH 5/5] Update One File Name --- src/helm/benchmark/augmentations/cleva_perturbation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/augmentations/cleva_perturbation.py b/src/helm/benchmark/augmentations/cleva_perturbation.py index f434075d87..0b89aa49bc 100644 --- a/src/helm/benchmark/augmentations/cleva_perturbation.py +++ b/src/helm/benchmark/augmentations/cleva_perturbation.py @@ -726,7 +726,7 @@ class MandarinToCantonesePerturbation(Perturbation): should_perturb_references: bool = True """ Resources """ - SOURCE_URL: str = "http://39.108.215.175/assets/simplified_jyutping_conversion.json" + SOURCE_URL: str = "http://39.108.215.175/assets/conversion.json" @property def description(self) -> PerturbationDescription: