forked from elliewith/XOmiVAE
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathomiShapExplainer.py
489 lines (421 loc) · 27.7 KB
/
omiShapExplainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
import torch
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from exprVAEwithAdditionalFeatures import ExprOmiVAE
import generalHelperFunctions as GeneralHelper
import shapExplainerHelper as ShapHelper
from dimensionKnockOutExperiments import nullingDimensions
from torch.nn import functional as F
import sys
sys.path.insert(0, '~/PycharmProjects/shapLundberg')
import shapLundberg
from shapLundberg import shap
from sklearn import metrics
from scipy.stats import skew
#import gffpandas.gffpandas as gffpd
def omiShapExplainer(sample_id,label_array,expr_df,dimension=0,tumourID=0,device='cpu',LatentSpaceExplain=False, statisticsTest=False, NormalvsTumourExplain=False,
NormalvsTumourInterimExplain=False, TestingNullingDimensions=False,
saveLatentSpace=False, statisticsTestEvenGender=False,histogram=False,NormalvsTumourDimensionExplain=False, knockoutGenes=False,idProvided=True,tumourName="TCGA-KIRC",file_name="place_holder",knockOutNumber=50,mean=False,toZero=True,TopGenes=True,skewValues=False,knockOutAcrossTumours=False,measureGenesinCommon=False):
"""
:param tumourName: String version of TCGA tumour e.g. "TCGA-KIRC" we would like to explain
:param NormalvsTumourExplain: Explain predictions (from supervised OmiVAE). Pass in tumourName to explain.
:param NormalvsTumourInterimExplain: Explain most important dimensions (from supervised OmiVAE). Pass in tumourName to explain.
:param NormalvsTumourDimensionExplain: Explain a specified dimension (either after supervised or unsupervised training). Pass in dimension and tumourName.
:param statisticsTest: Find the most important dimension in the unsupervised part of the model. Adjust within the code for which latent space (z) file to load in and which factrs we are investigating.
:param LatentSpaceExplain: Explain the mean latent variables. Need to specifiy dimension. Dimension found by running statisticsTest. Adjust within section
:param tumourID: TCGA tumour code (see tumour type digit file) that we want to explain prediction. Required for testing nulling dimensions.
:return:
"""
#This class has combined the different analysis' of the Deep SHAP values we conducted.
#SHAP reference: Lundberg et al., 2017: http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
input_path = 'DataSources/GDC-PANCAN_'
vae_model = ExprOmiVAE(input_path, expr_df)
#load vae trained model
vae_model.load_state_dict(torch.load('DataSources/vae_saved_model(original).pt',map_location=torch.device('cpu')))
#Get the most statistically significant dimensions: use for explaining the UNSUPERVISED model (just VAE)
if NormalvsTumourExplain:
"""
Explain the most important genes for a specific tumour, when using the normal tissue as a background
Note: adjusting to use a random training sample as the background is simple, use the
ShapHelper.randomTrainingSample(expr) function.
"""
phenotype = GeneralHelper.processPhenotypeDataForSamples(sample_id)
print("cancer type is")
print(tumourName)
conditionone=phenotype['sample_type'] == "Primary Tumor"
conditiontwo=phenotype['project_id'] == tumourName
conditionthree=phenotype['sample_type'] == "Solid Tissue Normal"
conditionaltumour=np.logical_and(conditionone,conditiontwo)
conditionalnormal = np.logical_and(conditiontwo, conditionthree)
"""
#recommended to have 100 samples, but in some cases it is not possible to sample 100 for a tissue type, especially for normal tissue
print("count of normal tissue")
print(np.count_nonzero(conditionalnormal))
if np.count_nonzero(conditionalnormal) < 10:
normal_number=np.count_nonzero(conditionalnormal)
else:
normal_number=10
normal_expr = ShapHelper.splitExprandSample(condition=conditionalnormal, sampleSize=10, expr=expr_df)
if np.count_nonzero(conditionaltumour) < 10:
conditionalcount = np.count_nonzero(conditionaltumour)
else:
conditionalcount = 10
"""
normal_expr = ShapHelper.randomTrainingSample(expr_df, 10)
tumour_expr = ShapHelper.splitExprandSample(condition=conditionaltumour, sampleSize=10, expr=expr_df)
# put on device as correct datatype
background = GeneralHelper.addToTensor(expr_selection=normal_expr, device=device)
male_expr_tensor = GeneralHelper.addToTensor(expr_selection=tumour_expr, device=device)
GeneralHelper.checkPredictions(male_expr_tensor, vae_model)
#output number is based from OmiVAE forward function. 4= predicted y value.
e = shap.DeepExplainer(vae_model, background, outputNumber=4)
print("calculating shap values")
shap_values_female = e.shap_values(male_expr_tensor, ranked_outputs=1)
genes,chrom, ensg =ShapHelper.getGenes(expr_df)
shap.summary_plot(shap_values_female[0][0],features=tumour_expr.T,feature_names=genes)
print("calculated shap values")
most_imp = ShapHelper.getTopShapValues(shap_vals=shap_values_female, numberOfTopFeatures=50,
expr_df=expr_df, ranked_output=True, cancerType=tumourName,absolute=True)
if NormalvsTumourDimensionExplain:
"""
Explain the most important genes for a specific dimension for a tumour.
Note: adjusting to use a random training sample as the background is simple, use the
ShapHelper.randomTrainingSample(expr) function.
Here we are explaining the mean output (outputNumber=2).
"""
phenotype = GeneralHelper.processPhenotypeDataForSamples(sample_id)
conditionone=phenotype['sample_type'] == "Primary Tumor"
conditiontwo=phenotype['project_id'] == tumourName
conditionthree=phenotype['sample_type'] == "Solid Tissue Normal"
conditionaltumour=np.logical_and(conditionone,conditiontwo)
conditionalnormal = np.logical_and(conditiontwo, conditionthree)
#recommended to have 100 samples, but in some cases it is not possible to sample 100 for a tissue type, especially for normal tissue
normal_expr = ShapHelper.splitExprandSample(condition=conditionalnormal, sampleSize=2, expr=expr_df)
tumour_expr = ShapHelper.splitExprandSample(condition=conditionaltumour, sampleSize=2, expr=expr_df)
# put on device as correct datatype
background = GeneralHelper.addToTensor(expr_selection=normal_expr, device=device)
male_expr_tensor = GeneralHelper.addToTensor(expr_selection=tumour_expr, device=device)
GeneralHelper.checkPredictions(background, vae_model)
#output number 2 is the mean (latent space)
#I edited the SHAP library to allow a dimension to be passed in and edited the output value. important that explainLatentSpace=True.
e = shap.DeepExplainer(vae_model, background, dim=dimension, outputNumber=2, explainLatentSpace=True)
#e = shap.DeepExplainer(vae_model, background)
#as we are explaining the genes for a dimension (unlike previously where we had to choose which prediction we were explaining) we do not need to specifiy to rank the outputs
shap_values_female = e.shap_values(male_expr_tensor)
most_imp = ShapHelper.getTopShapValues(shap_vals=shap_values_female, numberOfTopFeatures=50,
expr_df=expr_df, ranked_output=False)
#Note: this can be easily adjusted to explain any two samples; simply change 'normal_expr' and 'tumour_expr' to the chosen samples
if NormalvsTumourInterimExplain:
"""
Explain the interim layer; pass in the interim layer and then it explains the top dimensions.
"""
phenotype = GeneralHelper.processPhenotypeDataForSamples(sample_id)
conditionone = phenotype['sample_type'] == "Primary Tumor"
conditiontwo = phenotype['project_id'] == tumourName
conditionthree = phenotype['sample_type'] == "Solid Tissue Normal"
conditionaltumour = np.logical_and(conditionone, conditiontwo)
conditionalnormal = np.logical_and(conditionthree, conditiontwo)
# Recommended to have 100 samples, but in some cases it is not possible to sample 100 for a tissue type, especially for normal tissue
#change background here; e.g. random training sample or normal tissue
normal_expr = ShapHelper.randomTrainingSample(expr_df, 50)
#normal_expr = ShapHelper.splitExprandSample(condition=conditionone, sampleSize=50, expr=expr_df)
tumour_expr = ShapHelper.splitExprandSample(condition=conditionaltumour, sampleSize=50, expr=expr_df)
# Ensure tensor is on device and has the correct data type
background = GeneralHelper.addToTensor(expr_selection=normal_expr, device=device)
male_expr_tensor = GeneralHelper.addToTensor(expr_selection=tumour_expr, device=device)
GeneralHelper.checkPredictions(background, vae_model)
GeneralHelper.checkPredictions(male_expr_tensor, vae_model)
#output number is based from OmiVAE forward function. 4= predicted y value
#this should be c_fc1 for autoencoded layer
e = shap.DeepExplainer((vae_model, vae_model.c_fc1), background, outputNumber=4)
#explains layer before mean (512 dimensions)
#e = shap.DeepExplainer((vae_model, vae_model.e_fc4_mean), background)
shap_values_female = e.shap_values(male_expr_tensor, ranked_outputs=1)
# Here look at the numbers to left (they should range 1 to no. of dimensions) and this is the most important dimension
most_imp, least_imp = ShapHelper.getTopShapValues(shap_vals=shap_values_female, numberOfTopFeatures=50,
expr_df=expr_df, ranked_output=True)
if LatentSpaceExplain:
"""
This requires a known dimension to explain (found using the statistical significance above),
This is for gender, subtype and pathway latent space explanations for the unsupervised part of the model.
"""
#Example of how to obtain the relevant samples for the subtypes we would like to analyse (source: https://www.sciencedirect.com/science/article/pii/S0092867418303593)
subtypeConditionalOne, subtypeConditionalTwo = GeneralHelper.processSubtypeSamples(sample_id,subtypeOne="Basal",subtypeTwo="LumB")
#Example of how to obtain the relevant samples for the pathways we would like to analyse (source: https://www.sciencedirect.com/science/article/pii/S0092867418303593)
logicalOne, logicalTwo = ShapHelper.pathwayComparison(sample_id=sample_id,pathway='RTK RAS')
#Example of how to obtain the relevant samples for the different genders to explain the latent space
femaleCondition, maleCondition=ShapHelper.splitForGenders(sample_id=sample_id,)
ShapHelper.printConditionalSelection(maleCondition)
#Change condition to one of the relevant conditions above
female_expr = ShapHelper.splitExprandSample(condition=femaleCondition,sampleSize=50,expr=expr_df)
male_expr = ShapHelper.splitExprandSample(condition=maleCondition, sampleSize=50, expr=expr_df)
#put on device as correct datatype
background= GeneralHelper.addToTensor(expr_selection=female_expr,device=device)
male_expr_tensor = GeneralHelper.addToTensor(expr_selection=male_expr, device=device)
GeneralHelper.checkPredictions(background, vae_model)
e = shap.DeepExplainer(vae_model, background,outputNumber=2,dim=dimension)
# If explaining the z/mu dimension then don't need ranked outputs like we used before (as only one output from the model)
shap_values_female = e.shap_values(male_expr_tensor)
most_imp, least_imp=ShapHelper.getTopShapValues(shap_vals=shap_values_female, numberOfTopFeatures=50, expr_df=expr_df, ranked_output=False)
if saveLatentSpace:
expr_tensor=GeneralHelper.addToTensor(expr_df,device)
vae_model.load_state_dict(torch.load('data/beta15.pt', map_location=torch.device('cpu')))
GeneralHelper.saveLatentSpace(vae_model,expr_tensor)
if statisticsTest:
# z latent space; explain beta-vae and normal vae latent space (can also feed in the mean here)
z = np.genfromtxt('data/z_before_supervised_loss32_ForBeta10AnnealingKlLoss8.csv')
#Split for gender here but can also split for subtype if we want to find the most important dimension for a subtype
femaleCondition, maleCondition = ShapHelper.splitForGenders(sample_id=sample_id, )
# Example for adapting this code to measure the statistically significant dimensions in a subtype
# subtypeConditionalOne, subtypeConditionalTwo = GeneralHelper.processSubtypeSamples(sample_id, subtypeOne="Basal",
# subtypeTwo="LumB")
female_z = z[femaleCondition]
male_z = z[maleCondition]
statistics = stats.ttest_ind(female_z, male_z, axis=0, equal_var=False, nan_policy='propagate')
stat_lowest_index = np.argmin(statistics.pvalue)
female_z_column = female_z[:, stat_lowest_index]
male_z_column = male_z[:, stat_lowest_index]
np.savetxt('data/brca_z.csv', female_z_column)
np.savetxt('data/normal_z.csv', male_z_column)
# Method to sample the same amount to ensure gender split is even
if statisticsTestEvenGender:
# Get the z vector we would like to explain
z = np.genfromtxt('data/z_before_supervised_loss32_ForBeta10AnnealingKlLoss8.csv')
phenotype = GeneralHelper.processPhenotypeDataForSamples(sample_id)
# tumour id's were chosen as they were typically non-gender specific cancers
femaleCondition, maleCondition = ShapHelper.multipleSampling(label_array=label_array, phenotype=phenotype,
tumour_id_one=13,
tumour_id_two=18, tumour_id_three=10,
tumour_id_four=5, tumour_id_five=6,
sample_number=50)
ShapHelper.saveMostStatisticallySignificantIndex(conditionOne=femaleCondition, conditionTwo=maleCondition,
fileNameOne="female_z", fileNameTwo="male_z",z=z)
if histogram:
feature_importance = pd.read_csv('data/luaddimensions.csv', header=0, index_col=0)
plt.style.use('seaborn-white')
hist = feature_importance.hist(column='feature_importance_vals',normed=False, alpha=0.5,
histtype='stepfilled', color='skyblue',
edgecolor='none')
plt.title('SHAP value histogram')
plt.xlabel('SHAP_value')
plt.ylabel('Frequency')
plt.yscale('log')
plt.savefig("SHAPvalueLoghistogram.png", dpi=1500)
plt.show()
if TestingNullingDimensions:
#Example of testing four different dimensions
print("dim 42")
nullingDimensions(sample_id=sample_id,expr_df=expr_df,diseaseCode=tumourID, chosenTissue=tumourName, dimension=42)
print("dim 125")
nullingDimensions(sample_id=sample_id,expr_df=expr_df,diseaseCode=tumourID, chosenTissue=tumourName, dimension=125)
def runThroughModel(vae_mod, exprTensor, disease_code):
with torch.no_grad():
_, recon_data, mean, log_var, pred_y = vae_mod(exprTensor)
pred_y_softmax = F.softmax(pred_y, dim=1)
_, predicted = torch.max(pred_y_softmax, 1)
correct_vals = (predicted == disease_code).sum().item()
accuracy_tumour = correct_vals / len(pred_y) * 100
return accuracy_tumour, predicted
def runThroughModelFullExp(vae_mod, exprTensor):
with torch.no_grad():
_, recon_data, mean, log_var, pred_y = vae_mod(exprTensor)
pred_y_softmax = F.softmax(pred_y, dim=1)
_, predicted = torch.max(pred_y_softmax, 1)
return predicted
def getAccuracyOfModel(vae_mod, disease_code,norm_expr_tensor, tumour_expr_tensor):
accuracy, predictedTumour = runThroughModel(vae_mod, tumour_expr_tensor, disease_code)
print('Cancer tissue accuracy is: {}'.format(accuracy))
accuracy, predictedNormal = runThroughModel(vae_mod, norm_expr_tensor, 0)
print('Normal tissue accuracy is: {}'.format(accuracy))
unique, counts = np.unique(predictedTumour, return_counts=True)
print("Predictions for tumour")
print(np.asarray((unique, counts)).T)
def extract_relevantExpr(expr_df, condition):
expr_df_T = expr_df.T
relevantExpr = expr_df_T[condition].T
# tumour_expr = tumour_expr.sample(n=100, axis=1)
relevantExpr_tensor=GeneralHelper.addToTensor(relevantExpr,device)
return relevantExpr_tensor
def geneKnockOutExpr(expr_df,conditionaltumour,file_name,knockOutNumber,idProvided,TopGenes):
shap_genes=pd.read_csv(file_name, sep=',', header=0, index_col=1)
#knock out the bottom genes- sort by lowests to highest
if not TopGenes:
shap_genes.sort_values(by=['feature_importance_vals'], ascending=True, inplace=True)
print("rearranges bottom to top")
#get top number of genes to knock out
gencode_ids = pd.read_csv('DataSources/gencodev22annotationgene.tsv', sep='\t', header=0, index_col=1)
# merge includes duplicate genes!
if idProvided:
shapwithID = pd.merge(shap_genes, gencode_ids, on='id', how='inner')
mostImp_shap_values = shapwithID.head(knockOutNumber)
ensembleCodes = mostImp_shap_values.iloc[:, 2]
else:
shapwithID = pd.merge(shap_genes, gencode_ids, on='gene', how='inner')
mostImp_shap_values = shapwithID.head(knockOutNumber)
ensembleCodes = mostImp_shap_values.iloc[:, 3]
expr_df=expr_df.T
"""
if mean:
newValuesForGenes = expr_df.loc[expr_df.index.intersection(ensembleCodes)].mean(axis=1)
newvalues = np.resize(newValuesForGenes, (np.count_nonzero(conditionaltumour), newValuesForGenes.size))
expr_df.loc[expr_df[conditionaltumour].index, expr_df.columns.intersection(ensembleCodes)] = newvalues
"""
expr_df.loc[expr_df[conditionaltumour].index, expr_df.columns.intersection(ensembleCodes)] = 0
print("values after")
print(expr_df.loc[expr_df[conditionaltumour].index, expr_df.columns.intersection(ensembleCodes)])
relevantExpr = expr_df[conditionaltumour].T
# tumour_expr = tumour_expr.sample(n=100, axis=1)
relevantExpr_tensor = GeneralHelper.addToTensor(relevantExpr, device)
expr_df = expr_df.T
return relevantExpr_tensor,expr_df
if knockoutGenes:
vae_model.eval()
phenotype = pd.read_csv('DataSources/GDC-PANCAN.basic_phenotype.tsv', sep='\t', header=0, index_col=0)
phenotype = phenotype.T[sample_id]
phenotype = phenotype.T
conditionone = phenotype['sample_type'] == "Primary Tumor"
conditiontwo = phenotype['project_id'] == tumourName
conditionthree = phenotype['sample_type'] == "Solid Tissue Normal"
conditionaltumour = np.logical_and(conditionone, conditiontwo)
conditionalnormal = np.logical_and(conditionthree, conditiontwo)
norm_expr_tensor = extract_relevantExpr(expr_df, conditionalnormal)
tumour_expr_tensor = extract_relevantExpr(expr_df, conditionaltumour)
print("accuracy before")
### accuracy of just tumour
getAccuracyOfModel(vae_model,disease_code=tumourID, norm_expr_tensor=norm_expr_tensor,
tumour_expr_tensor=tumour_expr_tensor)
##total accuracy
expr_tensor = GeneralHelper.addToTensor(expr_df, device)
predicted = runThroughModelFullExp(vae_model, expr_tensor)
unique, counts = np.unique(predicted, return_counts=True)
print("Predictions for ALL")
print(np.asarray((unique, counts)).T)
#accuracy = metrics.accuracy_score(label_array, predicted, average='weighted')
#print('accuracy: {}'.format(accuracy))
print("accuracy after")
tumour_expr_tensor=geneKnockOutExpr(expr_df,conditionaltumour,file_name,knockOutNumber,idProvided)
getAccuracyOfModel(vae_model, disease_code=tumourID, norm_expr_tensor=norm_expr_tensor,
tumour_expr_tensor=tumour_expr_tensor)
##total accuracy
expr_tensor = GeneralHelper.addToTensor(expr_df, device)
predicted = runThroughModelFullExp(vae_model, expr_tensor)
unique, counts = np.unique(predicted, return_counts=True)
print("Predictions for ALL")
print(np.asarray((unique, counts)).T)
#accuracy = metrics.accuracy_score(label_array, predicted, average='weighted')
#print('accuracy: {}'.format(accuracy))
def getPredictions(expr_df,device,vae_model):
expr_tensor = GeneralHelper.addToTensor(expr_df, device)
predicted = runThroughModelFullExp(vae_model, expr_tensor)
unique, countsBefore = np.unique(predicted, return_counts=True)
print(np.asarray((unique, countsBefore)).T)
return unique, countsBefore
def getConditionalTumour(sample_id,tumourName):
tumourName= "TCGA-"+tumourName
phenotype = pd.read_csv('DataSources/GDC-PANCAN.basic_phenotype.tsv', sep='\t', header=0, index_col=0)
phenotype = phenotype.T[sample_id]
phenotype = phenotype.T
conditionone = phenotype['sample_type'] == "Primary Tumor"
conditiontwo = phenotype['project_id'] == tumourName
conditionaltumour = np.logical_and(conditionone, conditiontwo)
return conditionaltumour
def runThroughKnockOutAndSave(expr_df,device,vae_model,conditionaltumour,tumourType,count,TopGenes):
uniqueBefore, countsBefore = getPredictions(expr_df, device, vae_model)
tumour_expr_tensor, expr_df = geneKnockOutExpr(expr_df, conditionaltumour, file_name, count, idProvided,TopGenes)
uniqueAfter, countsAfter = getPredictions(expr_df, device, vae_model)
transcriptCountsBefore = np.asarray((uniqueBefore, countsBefore)).T
transcriptCountsAfter = np.asarray((uniqueAfter, countsAfter)).T
transcriptCountsBeforeDF = pd.DataFrame(transcriptCountsBefore,
columns=['unique', 'countsBefore'])
transcriptCountsAfterDF = pd.DataFrame(transcriptCountsAfter,
columns=['unique', 'countsAfter'])
#left: useo nly keys from left frame
countsMerges = pd.merge(transcriptCountsBeforeDF, transcriptCountsAfterDF[['unique','countsAfter']], on='unique', how='left')
countsMerges=countsMerges.fillna(0)
subtract=(countsMerges['countsAfter'] - countsMerges['countsBefore']).values
countsMerges.insert(3, "Subtraction", subtract, True)
accuracy=((countsMerges['countsAfter'] / countsMerges['countsBefore'])*100).values
countsMerges.insert(4,"Accuracy", accuracy, True)
if TopGenes:
fileAddition = "_top_SHAP_values"
else:
fileAddition = "_everything_but_top_SHAP_values"
countsMerges.to_csv("TCGA-" + tumourType + "knockout_"+str(count)+fileAddition)
if knockOutAcrossTumours:
TumourID = ['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC',
'KIRP', 'LAML','LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD',
'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM']
TumourID = ['ACC']
vae_model.eval()
print("tumour is")
print(TumourID)
#knockOutNumbers = [0.2,0.5,1,2,5,6]
knockOutNumbers = [6]
for tumourType in TumourID:
conditionaltumour = getConditionalTumour(sample_id, tumourType)
file_name = 'SHAP_with_ID/TCGA-' + tumourType + '_feature_imp_secondtry'
for numberOfGenes in knockOutNumbers:
if numberOfGenes==6:
#newknockOutNumbers=[0.2,0.5,1,5]
newknockOutNumbers = [0.2]
for i in newknockOutNumbers:
print("Knocking out everything but the amount of genes numbering")
count = round(numberOfGenes / 100 * expr_df.shape[0])
count=expr_df.shape[0]-count
print("count is")
print(count)
TopGenes=False
runThroughKnockOutAndSave(expr_df, device, vae_model, conditionaltumour, tumourType,count,TopGenes)
print("count is")
count=round(numberOfGenes/100*expr_df.shape[0])
print(count)
TopGenes = True
runThroughKnockOutAndSave(expr_df,device,vae_model,conditionaltumour,tumourType,count,TopGenes)
if measureGenesinCommon:
#go through Top 1% of genes in SHAP file
TumourID = ['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP',
'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD',
'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM']
ShapVals=[]
print("count is")
#top 0.1%
count = round(0.1 / 100 * expr_df.shape[0])
for x in TumourID:
feature_importance = pd.read_csv('SHAP_with_ID/TCGA-' + x + '_feature_imp_secondtry', header=0, index_col=0)
feature_importance = feature_importance.head(count)
###ACTUALLY WANT TOP 1% INDEX OF ID HERE!!!!!
ShapVals.append(feature_importance['id'].values)
#list_of_tuples = list(zip(TumourID, ShapVals))
#vdf = pd.DataFrame(list_of_tuples,columns=['TumourID', 'SHAP_vals'])
flattened_list = []
# flatten the lis
for x in ShapVals:
for y in x:
flattened_list.append(y)
unique, counts = np.unique(flattened_list, return_counts=True)
print("Predictions for tumour")
transcriptCounts=np.asarray((unique, counts)).T
#transcriptCountsPd=pd.DataFrame(data=transcriptCounts[:,1],index=transcriptCounts[:,0],columns=['count'])
print(transcriptCounts)
gencode_ids = pd.read_csv('DataSources/gencodev22annotationgene.tsv', sep='\t', header=0)
transcriptCounts=pd.DataFrame(data=transcriptCounts[:, :], columns=['id', 'count'])
# merge includes duplicate genes!
flattened_list_with_gene = pd.merge(transcriptCounts, gencode_ids[['gene','id']], on='id', how='inner')
flattened_list_with_gene['count'] = pd.to_numeric(flattened_list_with_gene['count'])
flattened_list_with_gene.sort_values(by=['count'], ascending=False, inplace=True)
print(flattened_list_with_gene)
flattened_list_with_gene.to_csv("topGenesAllTumoursForTop_"+str(count)+"genes.csv",columns = ['gene', 'count'])
if skewValues:
TumourID=['ACC','BLCA','BRCA','CESC','CHOL','COAD','DLBC','ESCA','GBM','HNSC','KICH','KIRC','KIRP','LAML',
'LGG','LIHC','LUAD','LUSC','MESO','OV','PAAD','PCPG','PRAD','READ','SARC','SKCM','STAD','TGCT','THCA','THYM','UCEC','UCS','UVM']
Skew=[]
for x in TumourID:
feature_importance = pd.read_csv('SHAP_feature_importance/TCGA-'+x+'_feature_imp', header=0, index_col=0)
Skew.append(skew(feature_importance['feature_importance_vals']))
print(x)
list_of_tuples = list(zip(TumourID, Skew))
df = pd.DataFrame(list_of_tuples,
columns=['TumourID', 'Skew'])
df.to_csv("SHAP_value_skew_by_tumourID")