Skip to content
This repository has been archived by the owner on Dec 15, 2023. It is now read-only.

Update Download links #44

Merged
merged 5 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions src/helm/benchmark/augmentations/cleva_perturbation.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,15 @@ class Description(PerturbationDescription):
name: str = "chinese_typos"

# For downloading resources
ASSET_URL = "https://drive.google.com/uc?id=1p5mldLpKxI-63H8YEruGJghtD1dZJI8k"
ASSET_URL = "http://39.108.215.175/assets/butter_finger"
FILE_NAMES: List[str] = [
"pinyin_to_char.json",
"toneless_pinyin_to_char.json",
"pinyin_to_common_char.json",
"toneless_pinyin_to_common_char.json",
"pinyin_to_word.json",
"toneless_pinyin_to_word.json",
]

def __init__(
self,
Expand All @@ -62,8 +70,11 @@ def __init__(

# Ensure all necessary data are downloaded
output_dir = os.path.join("benchmark_output", "perturbations", self.name)
ensure_directory_exists(os.path.dirname(output_dir))
ensure_file_downloaded(source_url=self.ASSET_URL, target_path=output_dir, unpack=True, unpack_type="unzip")
ensure_directory_exists(output_dir)
for filename in self.FILE_NAMES:
target_path = os.path.join(output_dir, filename)
SOURCE_URL: str = f"{self.ASSET_URL}/{filename}"
ensure_file_downloaded(source_url=SOURCE_URL, target_path=target_path)

# Load the data for the perturbation
with open(
Expand Down Expand Up @@ -285,7 +296,7 @@ class Description(PerturbationDescription):
name: str = "chinese_synonym"

# For downloading resources
SOURCE_URI: str = "https://drive.google.com/uc?id=1gXyZjoUw6yRjrsrh9ERzB_gxVluMTvij"
SOURCE_URL: str = "http://39.108.215.175/assets/synonyms.json"

def __init__(self, prob: float, trial_num: int = 10):
# Assign parameters to instance variables
Expand All @@ -294,7 +305,7 @@ def __init__(self, prob: float, trial_num: int = 10):

target_dir = os.path.join("benchmark_output", "perturbations", self.name, "synonyms.json")
ensure_directory_exists(os.path.dirname(target_dir))
ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_dir)
ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_dir)
with open(os.path.join(target_dir)) as f:
self.synonym_dict: Dict[str, List[str]] = json.load(f)

Expand Down Expand Up @@ -377,7 +388,7 @@ class ChineseGenderPerturbation(Perturbation):
MODES = [GENDER_TERM, GENDER_PRONOUN]

""" Resources """
SOURCE_URI: str = "https://drive.google.com/uc?id=1tJ5GLKboQrpzzBYTnFxeRuCOBxYhjFLp"
SOURCE_URL: str = "http://39.108.215.175/assets/gender_term.txt"

@dataclass(frozen=True)
class Description(PerturbationDescription):
Expand Down Expand Up @@ -424,7 +435,7 @@ class must be one of the genders in it. If not, it must be

target_path = os.path.join("benchmark_output", "perturbations", self.name, "gender_term.txt")
ensure_directory_exists(os.path.dirname(target_path))
ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
with open(target_path) as fin:
for line in fin.readlines():
splits: List[str] = line.strip("\n").split(" ")
Expand Down Expand Up @@ -480,7 +491,7 @@ class ChinesePersonNamePerturbation(Perturbation):
should_perturb_references: bool = True

""" Resources """
SOURCE_URI: str = "https://drive.google.com/uc?id=1nKnfsxREkScrNOyhqiFxP5F1SjRgk6r8"
SOURCE_URL: str = "http://39.108.215.175/assets/chinese_name_gender.json"
OUTPUT_PATH = os.path.join("benchmark_output", "perturbations", name)

""" Gender categories """
Expand Down Expand Up @@ -545,7 +556,7 @@ def __init__(

target_path = os.path.join("benchmark_output", "perturbations", self.name, "chinese_name_gender.json")
ensure_directory_exists(os.path.dirname(target_path))
ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
with open(os.path.join(target_path), "r", encoding="utf-8") as f:
self.gender2name: Dict[str, List[str]] = json.load(f)
del self.gender2name["unknown"]
Expand Down Expand Up @@ -715,7 +726,7 @@ class MandarinToCantonesePerturbation(Perturbation):
should_perturb_references: bool = True

""" Resources """
SOURCE_URI: str = "https://drive.google.com/uc?id=1vljbwq0hTm7W1tz74gjPnONWJ6kSEwK2"
SOURCE_URL: str = "http://39.108.215.175/assets/simplified_jyutping_conversion.json"
lyy1994 marked this conversation as resolved.
Show resolved Hide resolved

@property
def description(self) -> PerturbationDescription:
Expand All @@ -733,7 +744,7 @@ def __init__(

target_path = os.path.join("benchmark_output", "perturbations", self.name, "conversion.json")
ensure_directory_exists(os.path.dirname(target_path))
ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
with open(target_path) as fin:
self.phrase_table = json.load(fin)

Expand Down
25 changes: 13 additions & 12 deletions src/helm/benchmark/metrics/cleva_harms_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,16 @@ class CLEVABiasMetric(BiasMetric):
name: str = "chinese_bias"

""" Chinese resources. """
FILE_TO_URL: Dict[str, str] = {
"asian_word_list.txt": "https://drive.google.com/uc?id=1Enm7x1nj5o5DMXdSD3WFqr47F09QgFaM",
"hispanic_word_list.txt": "https://drive.google.com/uc?id=1-JAUBxA0IGiJ0j9nL6xJ8wCP9o-Gh8sS",
"white_word_list.txt": "https://drive.google.com/uc?id=1jbKhE4stKs8VyYmvQjx4aXq_m6oSccHp",
"male_word_list.txt": "https://drive.google.com/uc?id=1xWEsIYzXvQvOlpcID_zYBqfItIrVDIam",
"female_word_list.txt": "https://drive.google.com/uc?id=1Q5e18NfMCqPxdy7mBIteMqPHrJmOs7s_",
"profession_word_list.txt": "https://drive.google.com/uc?id=1baBsev6ippugLwUCQ8lHnLaOSBDstsj3",
"adjective_word_list.txt": "https://drive.google.com/uc?id=1s-jgE6OW-btc921GX9Aos0EhIbYwmBAT",
}
ASSET_URL = "http://39.108.215.175/assets"
FILE_NAMES: List[str] = [
"asian_word_list.txt",
"hispanic_word_list.txt",
"white_word_list.txt",
"male_word_list.txt",
"female_word_list.txt",
"profession_word_list.txt",
"adjective_word_list.txt",
]

def __repr__(self):
return (
Expand All @@ -71,9 +72,9 @@ def __init__(self, mode: str, demographic_category: str, target_category: Option
# Ensure all necessary data are downloaded
self.output_dir = os.path.join("benchmark_output", "metrics", self.name)
ensure_directory_exists(self.output_dir)
for FILENAME, URL in self.FILE_TO_URL.items():
target_path = os.path.join(self.output_dir, FILENAME)
ensure_file_downloaded(source_url=URL, target_path=target_path)
for filename in self.FILE_NAMES:
target_path = os.path.join(self.output_dir, filename)
ensure_file_downloaded(source_url=f"{self.ASSET_URL}/{filename}", target_path=target_path)

# Overwrite inherited mappings
self.build_mappings()
Expand Down
2 changes: 1 addition & 1 deletion src/helm/benchmark/run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2380,7 +2380,7 @@ def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
def get_cleva_spec(task: str, version: str, subtask: str = None, prompt_id: int = 0) -> RunSpec:
from .scenarios.cleva_scenario import CLEVAScenario # noqa

CLEVAScenario.download_dataset()
CLEVAScenario.download_dataset(task, version)

_, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
Expand Down
11 changes: 6 additions & 5 deletions src/helm/benchmark/scenarios/cleva_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .code_scenario import CodeReference, CodeInstance


CLEVA_DATA_URL = "https://drive.google.com/uc?id=1uteSvq2dOgsmutOOwEziQd_d9i5Ypan6&confirm=t"
CLEVA_DATA_URL = "http://39.108.215.175/data"
CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"


Expand Down Expand Up @@ -410,10 +410,11 @@ def task(self) -> str:
pass

@classmethod
def download_dataset(cls):
target_dir = os.path.join(CLEVA_DATA_PATH, "data")
ensure_directory_exists(CLEVA_DATA_PATH)
ensure_file_downloaded(source_url=CLEVA_DATA_URL, target_path=target_dir, unpack=True, unpack_type="untar")
def download_dataset(cls, task: str, version: str):
source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip"
target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version)
ensure_directory_exists(target_dir)
ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, task), unpack=True)

def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task)
Expand Down
Loading