-
Notifications
You must be signed in to change notification settings - Fork 0
/
Exploratory Data Analysis and Model Building
275 lines (203 loc) · 8.46 KB
/
Exploratory Data Analysis and Model Building
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#Install the ChEMBL web service package to retrieve bioactivity data from the ChEMBL Database.
! pip install chembl_webresource_client
# Import libraries
import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client
#Search for dengue virus target protein data
target = new_client.target
target_query = target.search('dengue')
targets = pd.DataFrame.from_dict(target_query)
targets
#Select and retrieve bioactivity data for Dengue virus target protein
selected_target = targets.target_chembl_id[5]
selected_target
#retrieve only bioactivity data for *Dengue virus* (CHEMBL5980) that are reported as IC50 pChEMBL values
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
df.head(10)
#save the resulting bioactivity data to a CSV file bioactivity_data.csv
df.to_csv('bioactivity_data.csv', index=False)
! cp bioactivity_data.csv "/content/drive/My Drive/Colab_Notebooks/Dissertation_Data"
! ls "/content/drive/My Drive/Colab_Notebooks/Dissertation_Data"
! head bioactivity_data.csv
#import libraries for visualization
import seaborn as sns
from seaborn import scatterplot
sns.set(style='ticks')
import matplotlib.pyplot as plt
#load data
df2 = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Dissertation_Data/bioactivity_data.csv')
df2
#It is important to set the bioactivity threshold to categorize the molecules
#optimally, the lower the IC50 value, the more potent the drug and the lower the dose of the drug needed to be administered
#thus, we label the molecules in three categories: active- less than 1000µM, inactive- greater than 10000µM, neutral- in between active and inactive.
bioactivity_threshold = []
for i in df2.standard_value:
if float(i) >= 10000:
bioactivity_threshold.append("inactive")
elif float(i) <= 1000:
bioactivity_threshold.append("active")
else:
bioactivity_threshold.append("neutral")
bioactivity_class = pd.Series(bioactivity_threshold, name='bioactivity_class')
#create a new dataframe by merging the bioactivity class column with the df2 table
dfc = pd.concat([df2, bioactivity_class], axis=1)
dfc
#create a new DataFrame with specified columns
new = ['molecule_chembl_id','canonical_smiles','standard_value', 'bioactivity_class']
df3 = dfc[new]
#Remove empty rows
df3 = df3[df3.standard_value.notna()]
df3 = df3[df3.canonical_smiles.notna()]
#save the resulting bioactivity data to a CSV file bioactivity_data.csv
df3.to_csv('bioactivity_data_preprocessed.csv', index=False)
! cp bioactivity_data_preprocessed.csv "/content/drive/My Drive/Colab_Notebooks/Dissertation_Data"
! ls "/content/drive/My Drive/Colab_Notebooks/Dissertation_Data"
! head bioactivity_data.csv
#Install conda and rdkit
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
! conda install -c rdkit rdkit -y
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
#Load bioactivity data
Bdf = pd.read_csv('/content/bioactivity_data_preprocessed.csv')
Bdf
#import libraries
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
#Calculate Lipinski descriptors
def lipinski(smiles, verbose=False):
moldata= []
for elem in smiles:
mol=Chem.MolFromSmiles(elem)
moldata.append(mol)
baseData= np.arange(1,1)
i=0
for mol in moldata:
desc_MolWt = Descriptors.MolWt(mol)
desc_MolLogP = Descriptors.MolLogP(mol)
desc_NumHDonors = Lipinski.NumHDonors(mol)
desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
row = np.array([desc_MolWt,
desc_MolLogP,
desc_NumHDonors,
desc_NumHAcceptors])
if(i==0):
baseData=row
else:
baseData=np.vstack([baseData, row])
i=i+1
columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]
descriptors = pd.DataFrame(data=baseData,columns=columnNames)
return descriptors
df_lipinski = lipinski(Bdf.canonical_smiles)
df_lipinski
#Combine DataFrames using the .concat() function
df_descriptors = pd.concat([Bdf,df_lipinski], axis=1)
df_descriptors
#Convert IC50 to pIC50
def pIC50(input):
pIC50 = []
for i in input['standard_value_norm']:
molar = i*(10**-9) # Converts nM to M
pIC50.append(-np.log10(molar))
input['pIC50'] = pIC50
x = input.drop('standard_value_norm', 1)
return x
def norm_value(input):
norm = []
for i in input['standard_value']:
if i > 100000000:
i = 100000000
norm.append(i)
input['standard_value_norm'] = norm
x = input.drop('standard_value', 1)
return x
#Apply the norm_value() function to normalise the values in the standard_value column
df_norm = norm_value(df_descriptors)
df_norm
#Add the normalised pIC50 column to the dataframe
df_pIC50 = pIC50(df_norm)
#Removing the 'neutral' bioactivity class
Bdf_pIC50= df_pIC50[df_pIC50['bioactivity_class'] != 'neutral']
#Frequency plot of the 2 bioactivity classes
plt.figure(figsize=(10, 8))
sns.countplot(x='bioactivity_class', data=Bdf_pIC50, edgecolor='white', color='black')
plt.xlabel('Bioactivity Class', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('Frequency of Inactive and Active Molecules', fontsize=16)
#SupportVectorMachine
from seaborn import load_dataset, pairplot
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#Create dataframe for model building
dfp = Bdf_pIC50.drop(['canonical_smiles'], axis=1)
dfp = dfp.drop(['ID'], axis=1)
dfp = dfp.drop(['pIC50'], axis=1)
dfp
#create pairplots to visualize relationships between the pIC50 and the descriptors
sns.pairplot(dfp, hue='bioactivity_class')
plt.show()
# Splitting the dataset into the Training set and Test set
X = dfp.drop('bioactivity_class', axis=1)
y = dfp.bioactivity_class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Building and training our model
model = SVC(kernel='linear', random_state=0)
model.fit(X_train, y_train)
# Making predictions with our data
predictions = model.predict(X_test)
print(predictions[:100])
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
y_predict = model.predict(X_test)
from sklearn.metrics import classification_report, accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_predict)
print(confusion_matrix)
print(classification_report(y_test, y_predict))
print(accuracy_score(y_test, y_predict))
#Binary Logistic Regression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import logit
#Expressed the bioactivity class column values in binary form
dfp["bioactivity_class"] = dfp["bioactivity_class"].map({"inactive":0, "active":1})
train_data, test_data = train_test_split(dfp, test_size=0.2)
formula = ('bioactivity_class ~ LogP + NumHAcceptors + NumHDonors + MW')
model1 = logit(formula = formula, data = train_data).fit()
model1.summary()
model1_odds = pd.DataFrame(np.exp(model1.params), columns= ['OR'])
model1_odds['z-value']= model1.pvalues
model1_odds[['2.5%', '97.5%']] = np.exp(model1.conf_int())
model1_odds
#Multicollinearity test
dfp.corr()
#heatmap
sns.set(rc = {'figure.figsize':(15,8)})
sns.heatmap(dfp.corr(), cmap='coolwarm', annot= True)
#Linearity assumption test
# Plotting multiple plots in the same figure
fig, (axL, axR) = plt.subplots(2, figsize=(15, 15))
plt.suptitle("Logistic Regression Residual Plots \n using Seaborn Lowess line (N = 823)")
# Deviance Residuals
sns.regplot(model1.fittedvalues, model1.resid_dev, ax= axL,
color="black", scatter_kws={"s": 5},
line_kws={"color":"r", "alpha":1, "lw":2}, lowess=True)
axL.set_title("Deviance Residuals \n against Fitted Values")
axL.set_xlabel("Linear Predictor Values")
axL.set_ylabel("Deviance Residuals")
# Studentized Pearson Residuals
sns.regplot(model1.fittedvalues, model1.resid_pearson, ax= axR,
color="black", scatter_kws={"s": 5},
line_kws={"color":"y", "alpha":1, "lw":2}, lowess=True)
axR.set_title("Studentized Pearson Residuals \n against Fitted Values")
axR.set_xlabel("Linear Predictor Values")
axR.set_ylabel("Studentized Pearson Residuals")
plt.show()