-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path: run_others.sh
55 lines (44 loc) · 3.58 KB
/
run_others.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env bash
############################################################################################################
# Code for the paper: Large Language Models for Scientific Synthesis, Inference and Explanation
# Code Authors: Yizhen Zheng, Huan Yee Koh, Jiaxin Ju
# https://github.com/zyzisastudyreallyhardguy/LLM4SD/tree/main
#
# * Please install packages in requirment.txt before running this file
#
# * To automate the entire process, we recommend utilizing GPT-4-turbo for summarizing inference rules and
#   generating the corresponding code. Please ensure you have entered your API_KEY before initiating the run.
############################################################################################################
# Fail fast: abort on any command error, unset variable, or failed pipeline stage,
# so a broken step 2/3 never feeds garbage into steps 4/5.
set -euo pipefail

API_KEY='' # OpenAI API key for gpt4 summarization and code generation
readonly MODEL="galactica-6.7b" # ("falcon-7b" "falcon-40b" "galactica-30b" "chemdfm" "chemllm-7b") chemllm-7b fails in BACE task
# Generated content from "chemllm-7b" for inference.py may not meet the required standards for proceeding directly to Step 4.
# It is strongly recommended that you thoroughly review the rules qualities before running step 4.
readonly DATASET=("bbbp" "bace" "clintox" "esol" "freesolv" "hiv" "lipophilicity")
readonly LIST_NUM=30 # number of inference rule lists; shared by steps 3, 4, and 5

# Steps 4 and 5 call the OpenAI API; validate the key up front instead of
# failing after the expensive local steps 1-3 have already run.
if [[ -z "${API_KEY}" ]]; then
  echo "Error: API_KEY is empty. Steps 4 and 5 require an OpenAI API key." >&2
  exit 1
fi

# Step 1: Generate prior knowledge and data knowledge prompt json file
# Please manually add your own prompt if your task is not in the above dataset list
echo "Processing step 1: Generating Prompt files ..."
python create_prompt.py --task synthesize
python create_prompt.py --task inference

# Processing Dataset: bbbp, bace, clintox, esol, freesolv, hiv, lipophilicity
for dataset in "${DATASET[@]}"; do
  # Step 2: Knowledge synthesis from the Scientific Literature
  echo "Processing step 2 for ${dataset}: LLM for Scientific Synthesize"
  python synthesize.py --dataset "${dataset}" --subtask "" --model "${MODEL}" --output_folder "synthesize_model_response"

  # Step 3: Knowledge inference from Data
  echo "Processing step 3 for ${dataset}: LLM for Scientific Inference"
  python inference.py --dataset "${dataset}" --subtask "" --model "${MODEL}" --list_num "${LIST_NUM}" --output_folder "inference_model_response"

  # Step 4: Summarize inference rules generated from the last step
  echo "Processing step 4 for ${dataset}: Summarize rules from gpt4"
  python summarize_rules.py --input_model_folder "${MODEL}" --dataset "${dataset}" --subtask "" --list_num "${LIST_NUM}" --api_key "${API_KEY}" \
    --output_folder "summarized_inference_rules"

  # Step 5: Interpretable model training and Evaluation
  # Results in our paper are stored in eval_result folder and the generated code files are in eval_code_generation_repo folder
  # To avoid overwriting we here provide another folder name
  # **** Must run synthesize and inference setting before run all setting ****
  echo "Processing step 5 for ${dataset}: Interpretable model training and Evaluation"
  python code_gen_and_eval.py --dataset "${dataset}" --subtask "" --model "${MODEL}" --knowledge_type "synthesize" \
    --api_key "${API_KEY}" --output_dir "llm4sd_results" --code_gen_folder "llm4sd_code_generation"
  python code_gen_and_eval.py --dataset "${dataset}" --subtask "" --model "${MODEL}" --knowledge_type "inference" --list_num "${LIST_NUM}" \
    --api_key "${API_KEY}" --output_dir "llm4sd_results" --code_gen_folder "llm4sd_code_generation"
  python code_gen_and_eval.py --dataset "${dataset}" --subtask "" --model "${MODEL}" --knowledge_type "all" --list_num "${LIST_NUM}" \
    --api_key "${API_KEY}" --output_dir "llm4sd_results" --code_gen_folder "llm4sd_code_generation"
done