diff --git a/docs/reproducing_leaderboards.md b/docs/reproducing_leaderboards.md index 72b0336edf..10a3141135 100644 --- a/docs/reproducing_leaderboards.md +++ b/docs/reproducing_leaderboards.md @@ -34,7 +34,7 @@ helm-server --suite $SUITE_NAME The following specifies the appropriate parameters and configuration files for a leaderboard, given its project and version number. -### Lite +### Lite for non-instruction-following models ```bash export RUN_ENTRIES_CONF_PATH=run_entries_lite_20240424.conf @@ -44,6 +44,16 @@ export MAX_EVAL_INSTANCES=1000 export PRIORITY=2 ``` +### Lite for instruction-following models + +```bash +export RUN_ENTRIES_CONF_PATH=run_entries_lite_20240424_output_format_instructions.conf +export SCHEMA_PATH=schema_lite.yaml +export NUM_TRAIN_TRIALS=1 +export MAX_EVAL_INSTANCES=1000 +export PRIORITY=2 +``` + ### Classic before v0.2.4 ```bash diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424_output_format_instructions.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424_output_format_instructions.conf new file mode 100644 index 0000000000..f97ffe97c7 --- /dev/null +++ b/src/helm/benchmark/presentation/run_entries_lite_20240424_output_format_instructions.conf @@ -0,0 +1,50 @@ +# HELM Lite scenarios with output format instructions (for instruction-following models). 
+ +entries: [ + # NarrativeQA + {description: "narrative_qa:model=text,output_format_instructions=narrative_qa", priority: 1} + + # NaturalQuestions + {description: "natural_qa:model=text,mode=openbook_longans,output_format_instructions=natural_qa", priority: 1} + {description: "natural_qa:model=text,mode=closedbook,output_format_instructions=natural_qa", priority: 1} + + # OpenbookQA + {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint,output_format_instructions=openbookqa", priority: 1} + + # MMLU + {description: "mmlu:model=text,subject=abstract_algebra,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=college_chemistry,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=computer_security,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=econometrics,output_format_instructions=mmlu", priority: 2} + {description: "mmlu:model=text,subject=us_foreign_policy,output_format_instructions=mmlu", priority: 2} + + # MATH + {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2} + {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2} + + # GSM + {description: "gsm:model=text_code,stop=none", priority: 2} + + # LegalBench + {description: 
"legalbench:model=text_code,subset=abercrombie,output_format_instructions=legalbench", priority: 2} + {description: "legalbench:model=text_code,subset=corporate_lobbying,output_format_instructions=legalbench", priority: 2} + {description: "legalbench:model=text_code,subset=international_citizenship_questions,output_format_instructions=legalbench", priority: 2} + {description: "legalbench:model=text_code,subset=function_of_decision_section,output_format_instructions=legalbench", priority: 2} + {description: "legalbench:model=text_code,subset=proa,output_format_instructions=legalbench", priority: 2} + + # MedQA + {description: "med_qa:model=text_code,output_format_instructions=med_qa", priority: 2} + + # WMT14 + {description: "wmt_14:language_pair=cs-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=de-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=fr-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=hi-en,model=text,output_format_instructions=wmt_14", priority: 2} + {description: "wmt_14:language_pair=ru-en,model=text,output_format_instructions=wmt_14", priority: 2} +] +