-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathassemblyCalcs_samples.py
158 lines (119 loc) · 4.31 KB
/
assemblyCalcs_samples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import logging
import multiprocessing as mp
import os
import pickle
import signal
import subprocess

import assemblycalculator as ac
import numpy as np
def build_month_increments(start, stop):
    """ List every month of every year from start through stop (inclusive)

    Args:
        start (int): First year to include
        stop (int): Last year to include

    Returns:
        list: strings in the form YYYY-MM (e.g., "1980-01"), in chronological
            order; empty when start is greater than stop
    """
    # Two-digit month suffixes "01" .. "12", built once and reused per year
    month_suffixes = ["%02d" % number for number in range(1, 13)]

    labels = []
    year = start
    while year <= stop:
        labels.extend(str(year) + "-" + suffix for suffix in month_suffixes)
        year += 1
    return labels
def calculate_assembly(month, inchi):
    """ Compute the assembly index of one sampled compound

    Args:
        month (string): YYYY-MM label of the month the compound was sampled from
        inchi (string): inchi representation of the SureChemBL compound

    Returns:
        dict: the given month and inchi plus the computed value under "ai"
    """
    # Settings forwarded to assemblycalculator's monte-carlo method
    monte_carlo_options = {"num_frags_hist": 5000, "path_samples": 10000}

    # NOTE(review): the positional 60 is presumably a timeout in seconds -
    # confirm against the assemblycalculator documentation
    assembly_index = ac.calculate_ma(inchi, 60, "monte-carlo",
                                     **monte_carlo_options)

    return dict(month=month, inchi=inchi, ai=assembly_index)
def crtl_c_exit(proc):
    """ Deliver SIGINT (the signal a Ctrl-C produces) to a process

    Args:
        proc: process handle exposing send_signal (e.g., subprocess.Popen)

    Returns:
        the same process handle, so the caller can keep waiting on it
    """
    proc.send_signal(signal.SIGINT)
    return proc
def run_assemblyGo(molfile,
                   fp,
                   *,
                   timeout=120,
                   executable="../assemblygo/assembly"):
    """ Run the assemblyGo executable on a single molfile

    The log is written into the same directory as the molfile, using the
    molfile's name with its extension replaced by ".txt". If the process is
    still running after the timeout it is sent SIGINT; if it has not exited
    after a further grace period it is killed, so a stuck process can never
    block the calling worker forever.

    Args:
        molfile (string): file name of the molfile inside fp
        fp (string): directory holding the molfile (trailing "/" optional)
        timeout (number): seconds to wait before interrupting the process
            (default: 2 minutes)
        executable (string): path to the assemblyGo binary

    Returns:
        int or None: exit code of the process, or None when it was not run
            (input would be overwritten by its own log, or launch failed)
    """
    log = logging.getLogger(__name__)

    # Seconds granted after SIGINT before the process is killed outright
    interrupt_grace = 30

    mol_path = os.path.join(fp, molfile)
    log_path = os.path.join(fp, os.path.splitext(molfile)[0] + ".txt")

    # A ".txt" input (e.g., a log left behind by an earlier run) would be
    # handed to assemblyGo as both input and logfile - never do that
    if log_path == mol_path:
        log.warning("Skipping %s: its log file would overwrite the input",
                    mol_path)
        return None

    command = [
        executable, "-file=" + mol_path, "-log", "-logfile=" + log_path
    ]

    try:
        proc = subprocess.Popen(command)
    except OSError as err:
        # Best effort: one file that cannot be processed must not abort a
        # whole batch, but the failure should at least be visible
        log.warning("Could not launch %s for %s: %s", executable, mol_path,
                    err)
        return None

    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Ask politely first (Ctrl-C equivalent), presumably so assemblyGo can
        # still finish writing its log - TODO confirm against assemblyGo
        crtl_c_exit(proc)
        try:
            proc.wait(timeout=interrupt_grace)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    return proc.returncode
def parallel_runAssemblyGo(molfiles, fp, processes=None):
    """ Calls assemblyGo in parallel, one task per molfile

    Blocks until every task has finished. A failure in one task is logged and
    does not stop the remaining tasks.

    Args:
        molfiles (list): file names of the individual molfiles inside fp
        fp (string): directory holding the molfiles
        processes (int): number of worker processes; None (default) lets
            multiprocessing use the machine's CPU count
    """
    molfiles = list(molfiles)
    if not molfiles:
        # Nothing to do - avoid the cost of spinning up a worker pool
        return

    log = logging.getLogger(__name__)

    # The context manager guarantees the workers are cleaned up even if
    # submitting or collecting a task raises
    with mp.Pool(processes) as pool:
        pending = [(name, pool.apply_async(run_assemblyGo, args=(name, fp)))
                   for name in molfiles]
        pool.close()

        # Collect every result: apply_async stores worker exceptions in the
        # AsyncResult, so without get() they would be lost silently
        for name, result in pending:
            try:
                result.get()
            except Exception:
                log.exception("assemblyGo task failed for %s", name)

        pool.join()
def main():
    """ Run AssemblyGo over every molfile of the selected dataset directory

    Only the full-database run is active. The other workflows (assembly
    indices of sampled compounds via assemblycalculator, and AssemblyGo runs
    over the other dataset directories) are kept below as commented-out code
    to be enabled by hand.
    """
    # #Read in sampled compounds (updated for full 1000 compounds)
    # #NOTE: add "_NEW" for new compounds found in each year (remove for all compounds)
    # cpds = pickle.load(file=open("Data/sample_inchi_1000_2020-2022.p", "rb"))

    # for year in np.arange(2020, 2023, 1):
    #     #Set up parallelization - a bit of overhead for setting it up, but that's fine
    #     pool = mp.Pool(64)

    #     #Build months in a specific year
    #     months = build_month_increments(year, year)

    #     date_cpd_sets = []
    #     for key, value in cpds.items():
    #         if key in months:
    #             for cpd in value:
    #                 date_cpd_sets.append((key, cpd))

    #     #Calculate assembly values for all inchis, save in a list holding dictionaries
    #     assemblies = [
    #         pool.apply(calculate_assembly, args=(s[0], s[1]))
    #         for s in date_cpd_sets
    #     ]

    #     pool.close()
    #     pool.join()

    #     #NOTE: include '_FULL_' when sampling all compounds
    #     pickle.dump(assemblies,
    #                 file=open("Data/assembly_values_1000_FULL_" + str(year) + ".p",
    #                           "wb"))

    ### Run AssemblyGo on full dataset
    fp = "Data/AssemblyValues/FullDatabase/"
    # os.listdir returns everything in the directory, including the ".txt"
    # logs that run_assemblyGo writes next to each molfile and any
    # subdirectories. Keep only regular, non-log files so a re-run does not
    # feed old logs back into AssemblyGo; sort for a reproducible order.
    molfiles = sorted(
        name for name in os.listdir(fp)
        if not name.endswith(".txt") and os.path.isfile(os.path.join(fp, name)))
    parallel_runAssemblyGo(molfiles, fp)

    ### Run AssemblyGo on new dataset
    # fp = "Data/AssemblyValues/NewDatabase/"
    # molfiles = os.listdir(fp)
    # parallel_runAssemblyGo(molfiles, fp)

    ### AssemblyGo on cost/reaxys directories
    # fp = "Data/AssemblyValues/CostRandom/"
    # molfiles = os.listdir(fp)
    # parallel_runAssemblyGo(molfiles, fp)

    ### AssemblyGo on cost/reaxys directories
    # fp = "Data/AssemblyValues/CostRandomPercentiles/"
    # molfiles = os.listdir(fp)
    # parallel_runAssemblyGo(molfiles, fp)

    ### AssemblyGo on author compound directories
    # fp = "Data/AssemblyValues/AuthorCpds/"
    # molfiles = os.listdir(fp)
    # parallel_runAssemblyGo(molfiles, fp)

    ### AssemblyGo on assignee compound directories
    # fp = "Data/AssemblyValues/AssigneeCpds/"
    # molfiles = os.listdir(fp)
    # parallel_runAssemblyGo(molfiles, fp)
# Run the pipeline only when this file is executed as a script, not on import
# (multiprocessing workers may re-import this module)
if __name__ == "__main__":
    main()