diff --git a/README.md b/README.md
index a5a0220e43..0fe428a493 100644
--- a/README.md
+++ b/README.md
@@ -205,6 +205,30 @@ We support wildcards in task names, for example you can run all of the machine-t
 
 We currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, check out the [BigScience fork](https://github.com/bigscience-workshop/lm-evaluation-harness) of this repo. We are currently working on upstreaming this capability to `main`.
 
+## Cluster Usage
+
+The evaluation suite can be called via the Python API, which makes it possible to script jobs with, for example, [submitit](https://github.com/facebookincubator/submitit). You can find a detailed example of how this works in `scripts/run_eval.py`.
+
+Running a job via submitit has two steps: preparing the **executor**, which controls cluster options, and preparing the actual **evaluation** options.
+
+First, configure the executor. This controls cluster job details, such as how many GPUs or nodes to use. For a detailed example, see `build_executor` in `run_eval.py`; a minimal version looks like this:
+
+    base_args = {... cluster args ...}
+    executor = submitit.AutoExecutor(folder="./logs")
+    executor.update_parameters(**base_args)
+
+Once the executor is prepared, submit the evaluation task itself. A detailed example of wrapping the API to make this easy is the `eval_task` function, which mainly calls out to `main` in `scripts/main_eval.py`. The basic structure looks like this:
+
+    def my_task():
+        args = {... eval args ...}
+
+        # this is the function from main_eval.py
+        main_eval(args, output_path="./hoge.json")
+
+    job = executor.submit(my_task)
+
+You can then retrieve the job's output and check that it completed successfully. See `run_job` for an example of how that works.
+
 ## Implementing new tasks
 
 To implement a new task in the eval harness, see [this guide](./docs/task_guide.md).
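For reference, here is a rough sketch of that last step, waiting for the job and checking its result, using only the `job` object from the snippet above. This is a minimal illustration rather than the project's helper; `run_job` in `scripts/run_eval.py` does the same thing and additionally sends a Slack notification via `scripts/notify.py` when the job finishes or fails:

    # Minimal sketch: block until the job finishes and inspect its output.
    # `job.result()` re-raises any exception thrown inside the task.
    print(f"Logs: {job.paths.stdout}")
    try:
        result = job.result()
        print("Job finished successfully!")
    except Exception:
        print(f"Job failed, see {job.paths.stderr}")
        raise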
diff --git a/main.py b/main.py deleted file mode 100644 index 7370464521..0000000000 --- a/main.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -import argparse -import json -import logging -import fnmatch - -from lm_eval import tasks, evaluator - -logging.getLogger("openai").setLevel(logging.WARNING) - - -class MultiChoice: - def __init__(self, choices): - self.choices = choices - - # Simple wildcard support (linux filename patterns) - def __contains__(self, values): - for value in values.split(","): - if len(fnmatch.filter(self.choices, value)) == 0: - return False - - return True - - def __iter__(self): - for choice in self.choices: - yield choice - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - parser.add_argument("--model_args", default="") - parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) - parser.add_argument("--provide_description", action="store_true") - parser.add_argument("--num_fewshot", type=str, default="0") - parser.add_argument("--batch_size", type=int, default=None) - parser.add_argument("--device", type=str, default=None) - parser.add_argument("--output_path", default=None) - parser.add_argument("--limit", type=str, default=None) - parser.add_argument("--no_cache", action="store_true") - parser.add_argument("--decontamination_ngrams_path", default=None) - parser.add_argument("--description_dict_path", default=None) - parser.add_argument("--check_integrity", action="store_true") - parser.add_argument("--verbose", action="store_true") - - return parser.parse_args() - - -# Returns a list containing all values of the source_list that -# match at least one of the patterns -def pattern_match(patterns, source_list): - task_names = [] - for pattern in patterns: - for matching in fnmatch.filter(source_list, pattern): - task_names.append(matching) - return task_names - - -def main(): - args = parse_args() - - assert not args.provide_description # not implemented - - if args.limit: - print( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
- ) - - if args.tasks is None: - task_names = tasks.ALL_TASKS - else: - task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS) - - print(f"Selected Tasks: {task_names}") - if "," in args.num_fewshot: - num_fewshot = [int(n) for n in args.num_fewshot.split(",")] - else: - num_fewshot = int(args.num_fewshot) - - if args.limit is not None: - if "," in args.limit: - limit = [int(n) if n.isdigit() else float(n) for n in args.limit.split(",")] - else: - limit = int(args.limit) - else: - limit = None - description_dict = {} - if args.description_dict_path: - with open(args.description_dict_path, "r") as f: - description_dict = json.load(f) - results = evaluator.simple_evaluate( - model=args.model, - model_args=args.model_args, - tasks=task_names, - num_fewshot=num_fewshot, - batch_size=args.batch_size, - device=args.device, - no_cache=args.no_cache, - limit=limit, - description_dict=description_dict, - decontamination_ngrams_path=args.decontamination_ngrams_path, - check_integrity=args.check_integrity, - verbose=args.verbose, - ) - - dumped = json.dumps(results, indent=2, ensure_ascii=False) - print(dumped) - - if args.output_path: - dir = os.path.dirname(args.output_path) - if dir: - os.makedirs(dir, exist_ok=True) - with open(args.output_path, "w") as f: - f.write(dumped) - - print( - f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " - f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}" - ) - print(evaluator.make_table(results)) - - -if __name__ == "__main__": - main() diff --git a/main.py b/main.py new file mode 120000 index 0000000000..70f41b5b43 --- /dev/null +++ b/main.py @@ -0,0 +1 @@ +scripts/main_eval.py \ No newline at end of file diff --git a/scripts/harness_example.py b/scripts/harness_example.py new file mode 100755 index 0000000000..8a60f90bd0 --- /dev/null +++ b/scripts/harness_example.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# This script runs eval in the cluster. Use it as a basis for your own harnesses. 
+from run_eval import build_executor, run_job +from run_eval import JAEVAL8_TASKS, JAEVAL8_FEWSHOT +from main_eval import main as main_eval + + +def build_task_list(tasks, prompt): + out = [] + # Some tasks don't have a prompt version + promptless = ["xwinograd_ja"] + for task in tasks: + if task not in promptless: + out.append(f"{task}-{prompt}") + else: + out.append(task) + return out + + +def main(): + executor = build_executor("eval", gpus_per_task=8, cpus_per_gpu=12) + + tasks = build_task_list(JAEVAL8_TASKS, "0.3") + eval_args = { + "tasks": tasks, + "num_fewshot": JAEVAL8_FEWSHOT, + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False", + "device": "cuda", + "limit": 100, + "verbose": True, + } + + run_job(executor, main_eval, eval_args=eval_args, output_path="./check.json") + + +if __name__ == "__main__": + main() diff --git a/scripts/main_eval.py b/scripts/main_eval.py new file mode 100644 index 0000000000..9d529421ef --- /dev/null +++ b/scripts/main_eval.py @@ -0,0 +1,127 @@ +import os +import argparse +import json +import logging +import fnmatch + +from lm_eval import tasks, evaluator + +logging.getLogger("openai").setLevel(logging.WARNING) + + +class MultiChoice: + def __init__(self, choices): + self.choices = choices + + # Simple wildcard support (linux filename patterns) + def __contains__(self, values): + for value in values.split(","): + if len(fnmatch.filter(self.choices, value)) == 0: + return False + + return True + + def __iter__(self): + for choice in self.choices: + yield choice + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--model_args", default="") + parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) + parser.add_argument("--num_fewshot", type=str, default="0") + parser.add_argument("--batch_size", type=int, default=None) + parser.add_argument("--device", type=str, default=None) + parser.add_argument("--output_path", default=None) + parser.add_argument("--limit", type=str, default=None) + parser.add_argument("--no_cache", action="store_true") + parser.add_argument("--decontamination_ngrams_path", default=None) + parser.add_argument("--description_dict_path", default=None) + parser.add_argument("--check_integrity", action="store_true") + parser.add_argument("--verbose", action="store_true") + # TODO This is deprecated and throws an error, remove it + parser.add_argument("--provide_description", action="store_true") + + return parser.parse_args() + + +def clean_args(args) -> dict: + """Handle conversion to lists etc. for args""" + + assert not args.provide_description, "provide-description is not implemented" + + if args.limit: + print( + "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
+ ) + + if args.tasks is None: + args.tasks = tasks.ALL_TASKS + else: + args.tasks = pattern_match(args.tasks.split(","), tasks.ALL_TASKS) + + print(f"Selected Tasks: {args.tasks}") + if args.num_fewshot is not None: + args.num_fewshot = [int(n) for n in args.num_fewshot.split(",")] + + if args.limit is not None: + args.limit = [ + int(n) if n.isdigit() else float(n) for n in args.limit.split(",") + ] + + return vars(args) + + +# Returns a list containing all values of the source_list that +# match at least one of the patterns +def pattern_match(patterns, source_list): + task_names = [] + for pattern in patterns: + for matching in fnmatch.filter(source_list, pattern): + task_names.append(matching) + return task_names + + +def main(eval_args: dict, description_dict_path: str = None, output_path: str = None): + """Run evaluation and optionally save output. + + For a description of eval args, see `simple_evaluate`. + """ + if description_dict_path: + with open(description_dict_path, "r") as f: + eval_args["description_dict"] = json.load(f) + + results = evaluator.simple_evaluate(**eval_args) + + dumped = json.dumps(results, indent=2, ensure_ascii=False) + print(dumped) + + if output_path: + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write(dumped) + + return results + + +if __name__ == "__main__": + args = parse_args() + args = clean_args(args) + + # This is not used + args.pop("provide_description", None) + # treat non-eval args separately + description_dict_path = args.get("description_dict_path", None) + args.pop("description_dict_path", None) + output_path = args.get("output_path", None) + args.pop("output_path", None) + + results = main(args, description_dict_path, output_path) + + print( + f"{args['model']} ({args['model_args']}), limit: {args['limit']}, " + f"num_fewshot: {args['num_fewshot']}, batch_size: {args['batch_size']}" + ) + print(evaluator.make_table(results)) diff --git a/scripts/notify.py b/scripts/notify.py new file mode 100755 index 0000000000..779574bc50 --- /dev/null +++ b/scripts/notify.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# This is an example of sending a slack notification. For more details see +# official docs: +# https://api.slack.com/messaging/webhooks + +import requests +import json +import os + +# This URL is tied to a single channel. That can be generalized, or you can +# create a new "app" to use another channel. +WEBHOOK = os.environ.get("WEBHOOK_URL") +if WEBHOOK is None: + print("Webhook URL not found in WEBHOOK_URL env var. 
Will just print messages.") + + +def notify(message): + headers = {"Content-Type": "application/json"} + data = json.dumps({"text": message}) + if WEBHOOK is None: + print(message) + else: + requests.post(WEBHOOK, data=data, headers=headers) + + +if __name__ == "__main__": + print("Please type your message.") + message = input("message> ") + notify(message) diff --git a/scripts/run_eval.py b/scripts/run_eval.py new file mode 100755 index 0000000000..26cef8d481 --- /dev/null +++ b/scripts/run_eval.py @@ -0,0 +1,168 @@ +import submitit +from submitit.helpers import CommandFunction +import argparse +from notify import notify +from pathlib import Path + +from main_eval import main as main_eval + +# These are the standard 8 tasks +JAEVAL8_TASKS = [ + "jcommonsenseqa-1.1", + "jnli-1.1", + "marc_ja-1.1", + "jsquad-1.1", + "jaqket_v2-0.2", + "xlsum_ja-1.0", + "xwinograd_ja", + "mgsm-1.0", +] +JAEVAL8_FEWSHOT = [3, 3, 3, 2, 1, 1, 0, 5] + + +def eval_task(): + args = { + "tasks": ["jsquad-1.1-0.2"], + "num_fewshot": [1], + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False", + "device": "cuda", + "limit": 100, + "verbose": True, + } + + main_eval(args, output_path="./check.json") + + +def build_executor( + name: str, + gpus_per_task: int, + cpus_per_gpu: int, + timeout: int = 0, + partition: str = "g40", + account: str = "stablegpt", +): + base_args = { + # just "gpus" does not work + "slurm_gpus_per_task": 8, + "name": "eval", + "slurm_account": "stablegpt", + "slurm_partition": "g40", + "slurm_cpus_per_gpu": 12, + # Default timeout is 5 minutes??? + "timeout_min": 0, + } + + executor = submitit.AutoExecutor(folder="./logs") + executor.update_parameters(**base_args) + return executor + + +def build_shell_script_task(script_path, args, repo): + """This is how you can wrap an existing harness.sh. + + This is not currently used. + """ + task = CommandFunction(["bash", args.harness_script], cwd=repo) + return task + + +def run_job(executor, task, *args, **kwargs): + """Given an executor and a task, run the task with error reporting. + + `executor` should be a submitit executor. + + `task` can be a CommandFunction from submitit, which wraps a shell script, + or a Python function. Further positional or keyword arguments are passed to + the function. + """ + job = executor.submit(task, *args, **kwargs) + print("Submitted job") + print("See log at:") + print(f"\t{job.paths.stdout}") + + try: + output = job.result() + print("Job finished successfully!") + notify(f":white_check_mark: Eval Finished for `{job}`") + return output + except Exception as ee: # noqa: F841 + # submitit doesn't seem to have a parent class for their exceptions, so + # just catch everything. We want to be aware of any failure anyway. + # If this is noisy we can ignore certain kinds of early failures. + + msg = f""" + :rotating_light: Eval failed for `{job}` + + See `{job.paths.stderr}` + """.strip() + notify(msg) + raise + + +def run_eval_shell_script(): + parser = argparse.ArgumentParser( + prog="run-eval", + description="Run eval harness", + ) + + parser.add_argument("harness_script") + + args = parser.parse_args() + + base_args = { + # just "gpus" does not work + "slurm_gpus_per_task": 8, + "name": "eval", + "slurm_account": "stablegpt", + "slurm_partition": "g40", + "slurm_cpus_per_gpu": 12, + # Default timeout is 5 minutes??? 
+ "timeout_min": 0, + } + + executor = submitit.AutoExecutor(folder="./logs") + executor.update_parameters(**base_args) + + # Harness scripts expect the cwd to be the repo root + spath = Path(args.harness_script) + repo = str(spath.parent.parent.parent.parent) + print("repo path:", repo) + # the eval harness relies on validating cli args, so it's difficult to run + # directly from Python. Use the harness.sh scripts for now. + # Also note this needs to be a list of strings. + harness = CommandFunction(["bash", args.harness_script], cwd=repo) + + job = executor.submit(harness) + print("Submitted job") + print("See log at:") + print(f"\t{job.paths.stdout}") + + try: + output = job.result() + print("Job finished successfully!") + notify(f":white_check_mark: Eval Finished for `{args.harness_script}`") + return output + except Exception as ee: # noqa: F841 + # submitit doesn't seem to have a parent class for their exceptions, so + # just catch everything. We want to be aware of any failure anyway. + # If this is noisy we can ignore certain kinds of early failures. + + msg = f""" + :rotating_light: Eval failed for `{args.harness_script}` + + See `{job.paths.stderr}` + """.strip() + notify(msg) + raise + + +def run_eval_submitit(): + """Run evaluation using submitit.""" + executor = build_executor() + # By wrapping everything in a function, we don't have to pass args. + run_job(executor, eval_task) + + +if __name__ == "__main__": + run_eval_shell_script()
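A possible extension of these helpers, sketched here rather than taken from the scripts above: `run_job` submits a single job and waits for it, but because `main` in `main_eval.py` takes plain dictionaries, you can also reuse one executor to fan several evaluations out at once and collect the results afterwards. The model list, task settings, and output paths below are illustrative placeholders:

    # Hypothetical sketch: submit one evaluation job per model, then gather results.
    # Assumes the build_executor helper from scripts/run_eval.py and main from
    # scripts/main_eval.py; model names and paths are placeholders.
    from run_eval import build_executor
    from main_eval import main as main_eval

    executor = build_executor("eval-sweep", gpus_per_task=8, cpus_per_gpu=12)

    jobs = []
    for model in ["rinna/japanese-gpt-1b"]:  # add more pretrained model names here
        eval_args = {
            "tasks": ["jsquad-1.1-0.2"],
            "num_fewshot": [1],
            "model": "hf-causal",
            "model_args": f"pretrained={model},use_fast=False",
            "device": "cuda",
            "verbose": True,
        }
        out_path = f"./results_{model.replace('/', '_')}.json"
        jobs.append(executor.submit(main_eval, eval_args, output_path=out_path))

    # Each result is the dict returned by evaluator.simple_evaluate
    results = [job.result() for job in jobs]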