-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_RNN.py
166 lines (139 loc) · 5.35 KB
/
train_RNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# import packages
import argparse
import configparser
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from profis.gen.dataset import SELFIESDataset, SMILESDataset, DeepSMILESDataset
from profis.gen.train import train
from profis.utils.split import scaffold_split
from profis.utils.modelinit import initialize_model
from profis.utils.vectorizer import (
SELFIESVectorizer,
SMILESVectorizer,
DeepSMILESVectorizer,
)
# Suppress RDKit warnings: "rdApp.*" disables every RDKit application log
# channel for this process (molecule parsing/sanitization chatter would
# otherwise flood stderr during dataset loading).
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")
def _split_paths(data_path, train_percent, val_percent):
    """Return (train_parquet_path, val_parquet_path) derived from the dataset path.

    Uses os.path.splitext rather than str.split(".") so that paths whose
    directory components contain dots (e.g. "./data/x.parquet", "v1.2/x.parquet")
    are handled correctly; the naive split returned an empty/garbled base there.
    """
    base = os.path.splitext(data_path)[0]
    return (
        base + f"_train_{train_percent}.parquet",
        base + f"_val_{val_percent}.parquet",
    )


def _load_or_create_split(dataset, data_path, train_size, train_percent, val_percent, random_seed):
    """Load the cached scaffold-split parquets, or create and cache them.

    Args:
        dataset: full dataset as a pandas DataFrame
        data_path: path of the source parquet (split files are placed next to it)
        train_size: fraction of data used for training
        train_percent, val_percent: integer percentages used in the cache filenames
        random_seed: seed forwarded to scaffold_split for reproducibility
    Returns:
        (train_df, val_df) tuple of DataFrames.
    """
    train_path, val_path = _split_paths(data_path, train_percent, val_percent)
    if os.path.isfile(train_path) and os.path.isfile(val_path):
        # both cache files present — reuse the previous split
        return pd.read_parquet(train_path), pd.read_parquet(val_path)
    print("Performing scaffold split...")
    train_df, val_df = scaffold_split(
        dataset, train_size, seed=random_seed, shuffle=True
    )
    train_df.to_parquet(train_path)
    val_df.to_parquet(val_path)
    print("Scaffold split complete")
    return train_df, val_df


def _make_datasets(out_encoding, train_df, val_df, scoring_df, fp_len):
    """Instantiate the vectorizer and the three datasets for the chosen encoding.

    Raises:
        ValueError: if out_encoding is not one of selfies/smiles/deepsmiles.
    """
    # encoding name -> (vectorizer class, dataset class); replaces the
    # triplicated if/elif construction branches
    encodings = {
        "selfies": (SELFIESVectorizer, SELFIESDataset),
        "smiles": (SMILESVectorizer, SMILESDataset),
        "deepsmiles": (DeepSMILESVectorizer, DeepSMILESDataset),
    }
    try:
        vectorizer_cls, dataset_cls = encodings[out_encoding]
    except KeyError:
        raise ValueError(
            "Invalid output encoding (must be selfies, smiles or deepsmiles)"
        )
    vectorizer = vectorizer_cls(pad_to_len=100)
    return (
        dataset_cls(train_df, vectorizer, fp_len),
        dataset_cls(val_df, vectorizer, fp_len),
        dataset_cls(scoring_df, vectorizer, fp_len),
    )


def main(config_path):
    """
    Training script for model with fully-connected encoder and GRU decoder.

    Reads run/model hyperparameters from an INI file, prepares train/val/scoring
    dataloaders (performing and caching a scaffold split when needed), then
    initializes the model and launches training.

    Args:
        config_path: path to config file
    """
    # read config file
    config = configparser.ConfigParser()
    config.read(config_path)
    train_size = float(config["RUN"]["train_size"])
    random_seed = int(config["RUN"]["random_seed"])
    run_name = str(config["RUN"]["run_name"])
    batch_size = int(config["RUN"]["batch_size"])
    data_path = str(config["RUN"]["data_path"])
    dataloader_workers = int(config["RUN"]["num_workers"])
    fp_len = int(config["MODEL"]["fp_len"])
    use_cuda = config.getboolean("RUN", "use_cuda")
    out_encoding = str(config["RUN"]["out_encoding"])

    val_size = round(1 - train_size, 1)
    train_percent = int(train_size * 100)
    val_percent = int(val_size * 100)

    cuda_available = torch.cuda.is_available() and use_cuda
    device = torch.device("cuda" if cuda_available else "cpu")
    print("Using device:", device)

    # read dataset
    if not os.path.isfile(data_path):
        raise FileNotFoundError(f"Dataset file {data_path} not found")
    dataset = pd.read_parquet(data_path)

    # create a directory for this model's weights (makedirs replaces the
    # hand-rolled two-level mkdir chain) and snapshot the hyperparameters
    os.makedirs(f"models/{run_name}", exist_ok=True)
    with open(f"models/{run_name}/hyperparameters.ini", "w") as configfile:
        config.write(configfile)

    # reuse a cached scaffold split when available, otherwise create one
    train_df, val_df = _load_or_create_split(
        dataset, data_path, train_size, train_percent, val_percent, random_seed
    )
    # small fixed subsample of the validation set used for periodic scoring
    scoring_df = val_df.sample(frac=0.1, random_state=random_seed)

    # prepare dataloaders
    train_dataset, val_dataset, scoring_dataset = _make_datasets(
        out_encoding, train_df, val_df, scoring_df, fp_len
    )
    print("Dataset size:", len(dataset))
    print("Train size:", len(train_dataset))
    print("Val size:", len(val_dataset))
    print("Scoring size:", len(scoring_dataset))

    # cap batch sizes so a small val/scoring set still yields one full batch
    # (all loaders use drop_last=True)
    val_batch_size = min(batch_size, len(val_dataset))
    scoring_batch_size = min(batch_size, len(scoring_dataset))
    train_loader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=batch_size,
        drop_last=True,
        num_workers=dataloader_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        shuffle=False,
        batch_size=val_batch_size,
        drop_last=True,
        num_workers=dataloader_workers,
    )
    scoring_loader = DataLoader(
        scoring_dataset,
        shuffle=False,
        batch_size=scoring_batch_size,
        drop_last=True,
        num_workers=dataloader_workers,
    )

    # Init model
    model = initialize_model(
        config_path, device=device, use_dropout=True, teacher_forcing=True
    )
    _ = train(config, model, train_loader, val_loader, scoring_loader)
    return None
if __name__ == "__main__":
    # Parse the config-file location from the command line and hand it to main().
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="config_files/RNN_config.ini",
        help="Path to config file",
    )
    args = arg_parser.parse_args()
    main(args.config)