forked from ToolsVanBox/SMuRF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtool.py
758 lines (686 loc) · 36.1 KB
/
tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
#!/usr/bin/python
# Import modules
import vcf as pyvcf
import pysam
import argparse
import multiprocessing as mp
import queue
import time
import sys
import collections
import subprocess
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.mixture import GaussianMixture
from matplotlib.backends.backend_pdf import PdfPages
import warnings
# Get version from git
#__version__ = subprocess.check_output(["git", "describe"]).strip().decode('UTF-8')
__version__ = 'v1.1.3'
# Set arguments
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description='Put here a description.')
parser.add_argument('-i', '--input', type=str, help='Input indexed vcf.gz file', required=True)
parser.add_argument('-b', '--bam', action='append', nargs="*", type=str, help='Input bam file', required=True)
parser.add_argument('-t', '--threads', default=8, type=int, help="Number of threads (default: %(default)s)")
parser.add_argument('-c', '--control', action='append', type=str, help='Control sample name')
parser.add_argument('-bl', '--blacklist', action='append', type=str, help='Black list vcf or bed file. Multiple files can be used.')
parser.add_argument('-q', '--qual', default=100, type=int, help="Flag variants with a low QUAL value (default: %(default)s)")
parser.add_argument('-mq', '--mq', default=60, type=int, help="Flag variants with a low MQ value (default: %(default)s)")
parser.add_argument('-sgq11','--sample_gq_homozygous', default=10, type=int, help="Minimal Genome Quality of a homozygous SNV in a sample (default: %(default)s)")
parser.add_argument('-sgq01','--sample_gq_heterozygous', default=99, type=int, help="Minimal Genome Quality of a heterozygous SNV in a sample (default: %(default)s)")
parser.add_argument('-cgq11','--control_gq_homozygous', default=10, type=int, help="Minimal Genome Quality of a homozygous SNV in a control (default: %(default)s)")
parser.add_argument('-cgq01','--control_gq_heterozygous', default=10, type=int, help="Minimal Genome Quality of a heterozygous SNV in a control (default: %(default)s)")
parser.add_argument('-igq11','--indel_gq_homozygous', default=99, type=int, help="Minimal Genome Quality of a homozygous indel (default: %(default)s)")
parser.add_argument('-igq01','--indel_gq_heterozygous', default=99, type=int, help="Minimal Genome Quality of a heterozygous indel (default: %(default)s)")
parser.add_argument('-cov', '--coverage', default=10, type=int, help="Flag variants with a low COV value (default: %(default)s)")
parser.add_argument('-ct', '--clonal_threshold', default=0.3, type=float, help="Sample reported as subclonal if VAF is lower (default: %(default)s)")
parser.add_argument('-at', '--absent_threshold', default=0.0, type=float, help="Sample reported as absent if VAF is lower(default: %(default)s)")
parser.add_argument('-m', '--mapq', default=0, type=int, help="Include only reads with a minimal mapq (default: %(default)s)")
parser.add_argument('-p', '--base_phred_quality', default=0, type=int, help="Include only bases with a minimal base phred quality (default: %(default)s)")
parser.add_argument('-minc','--min_components', default=2,type=int,help="Minimal number of components (default: %(default)s)")
parser.add_argument('-maxc','--max_components',default=3,type=int,help="Maximal number of components (default: %(default)s)")
parser.add_argument('-indel', '--indel', default=True, action='store_false', help="Exclude indels")
parser.add_argument('-v', '--version', action='version', version=__version__)
args = parser.parse_args()
# Flatten input list of bam files
args.bam = [x for l in args.bam for x in l]
# Set default control None if no control is given at the command line
if not args.control:
args.control = [ None ]
# Read the vcf, fix and add fields to the header
vcf_reader = pyvcf.Reader(filename=args.input)
# Create tmp directory if it does not exists
try:
os.stat('./tool_tmp')
except:
os.mkdir('./tool_tmp')
# Define global variables
vaf_dict = collections.defaultdict(list)
responsibilities_dict = collections.defaultdict(dict)
contig_list = []
bam_sample_names = collections.defaultdict(dict)
def main():
global vcf_reader, vaf_df
vcf_reader = fix_vcf_header(vcf_reader)
vcf_reader = add_vcf_header(vcf_reader)
create_blacklist()
for contig in vcf_reader.contigs:
contig_list.append(contig)
# Create an input queue with the contigs and an empty output queue
q = mp.Queue()
q_out = mp.Queue()
for contig in contig_list:
q.put(contig)
# Create number of processes to parse the vcf file
processes = [mp.Process(target=parse_chr_vcf, args=(q, q_out, vcf_reader, args.bam)) for x in range(args.threads)]
for p in processes:
p.start()
liveprocs = list(processes)
while liveprocs:
time.sleep(5)
try:
while 1:
for s,v in q_out.get(block=False, timeout=1).items():
vaf_dict[s].extend(v)
except queue.Empty:
pass
# Give tasks a chance to put more data in
time.sleep(10)
if not q.empty():
continue
liveprocs = [p for p in liveprocs if p.is_alive()]
for p in processes:
p.join()
def parse_chr_vcf(q, q_out, contig_vcf_reader, bams):
"""
Function to parse the vcf per contig.
Write the new record to a vcf file.
Input: Queue object
Input: Queue out object
Input: VCF reader object
Input: List with the bam names
"""
contig_vaf = collections.defaultdict(list)
while True:
try:
# Get contig one by one from the queue
contig = q.get(block=False,timeout=1)
contig_vcf_flag_writer = pyvcf.Writer(open('./tool_tmp/{}_flag.vcf'.format(contig),'w'), contig_vcf_reader)
try:
# Try to parse the specific contig from the vcf
contig_vcf_reader.fetch(contig)
except:
# Skip contig if it is not present in the vcf file
continue
for record in contig_vcf_reader.fetch(contig):
if not record.FILTER:
chr = record.CHROM
chr = chr.lower()
chr = re.sub("chr|chrom", "", chr)
if record.ID and "COSM" not in record.ID:
record.FILTER.append('KnownVariant')
elif record.QUAL < args.qual:
record.FILTER.append('BadQual')
elif len(record.ALT) > 1:
record.FILTER.append('MultiAllelic')
elif record.INFO['MQ'] < args.mq:
record.FILTER.append('BadMQ')
elif chr in blacklist and record.POS in blacklist[chr]:
record.FILTER.append("BlackList")
elif (len(record.ALT[0]) > 1 or len(record.REF) > 1) and not args.indel:
record.FILTER.append("Indel")
elif sample_quality_control( record ):
for s,v in calculate_vaf( record ).items():
contig_vaf[s].append(v)
contig_vcf_flag_writer.write_record(record)
q_out.put( contig_vaf )
# Break the loop if the queue is empty
except queue.Empty:
break
def get_command_line():
"""
Function to get the commandline arguments
Return: A string with the actual command.
"""
cmdline = [sys.argv[0]]
for arg in vars(args):
if type(getattr(args,arg)) is list:
for a in getattr(args,arg):
cmdline.append('--{} {}'.format(arg,str(a)))
else:
cmdline.append('--{} {}'.format(arg,str(getattr(args,arg))))
return( '"{}"'.format(" ".join(cmdline)) )
def fix_vcf_header( vcf_reader ):
"""
Function to fix fields in the vcf header
Input: A vcf reader object
Return: The vcf reader object with fixed headers
"""
#dbNSFP_clinvar_clnsig has a Integer type but sometimes it is a String, e.g. 2|2
vcf_reader.infos['dbNSFP_clinvar_clnsig'] = pyvcf.parser._Info("dbNSFP_clinvar_clnsig",1,"String","Field 'clinvar_clnsig' from dbNSFP", None, None)
#dbNSFP_clinvar_golden_stars has a Integer type but sometimes it is a String, e.g. 0|1
vcf_reader.infos['dbNSFP_clinvar_golden_stars'] = pyvcf.parser._Info("dbNSFP_clinvar_golden_stars",1,"String","Field 'clinvar_golden_stars' from dbNSFP", None, None)
vcf_reader.infos['dbNSFP_hg18_chr'] = pyvcf.parser._Info("dbNSFP_hg18_chr",1,"String","Field 'hg18_chr' from dbNSFP", None, None)
vcf_reader.infos['dbNSFP_hg19_chr'] = pyvcf.parser._Info("dbNSFP_hg19_chr",1,"String","Field 'hg19_chr' from dbNSFP", None, None)
return( vcf_reader )
def add_vcf_header( vcf_reader ):
"""
Function to add a new field to the vcf header
Input: A vcf reader object
Return: The vcf reader object with new headers added
"""
# Metadata
vcf_reader.metadata['ToolCmd'] = [get_command_line()]
# Formats
vcf_reader.formats['VAF'] = pyvcf.parser._Format('VAF',None,'Float','Variant Allele Frequency calculated from the BAM file')
vcf_reader.formats['CAD'] = pyvcf.parser._Format('CAD',None,'Integer','Calculated Allelic Depth, used for VAF calculation')
vcf_reader.formats['FT'] = pyvcf.parser._Format('FT',None,'String','Sample filter')
# Filters
vcf_reader.filters['KnownVariant'] = pyvcf.parser._Filter('KnownVariant','Variant has already an ID, excluding COSMIC_IDs')
vcf_reader.filters['BadMQ'] = pyvcf.parser._Filter('BadMQ', 'Variant with MQ <'+str(args.mq))
vcf_reader.filters['BadQual'] = pyvcf.parser._Filter('BadQual','Variant with a QUAL <'+str(args.qual))
vcf_reader.filters['MultiAllelic'] = pyvcf.parser._Filter('MultiAllelic', 'Variant has multiple alternative alleles')
vcf_reader.filters['BlackList'] = pyvcf.parser._Filter('BlackList', 'Variant exists in a blacklist')
vcf_reader.filters['Indel'] = pyvcf.parser._Filter('Indel','Variant is an indel')
vcf_reader.filters['ControlEvidence'] = pyvcf.parser._Filter('ControlEvidence','Variant is also found in a control based on the GT')
vcf_reader.filters['NoSampleEvidence'] = pyvcf.parser._Filter('NoSampleEvidence','Variant is not found in any of the samples based on the GT')
vcf_reader.filters['AllSamplesFailedQC'] = pyvcf.parser._Filter('AllSamplesFailedQC', 'All samples failed the quality control')
vcf_reader.filters['AllControlsFailedQC'] = pyvcf.parser._Filter('AllControlsFailedQC', 'All controls failed the quality control')
vcf_reader.filters['ControlSubclonal'] = pyvcf.parser._Filter('ControlSubclonal', 'Variant is found as subclonal in a control based on the recalculated VAF')
vcf_reader.filters['ControlClonal'] = pyvcf.parser._Filter('ControlClonal', 'Variant is found as clonal in a control based on the recalculated VAF')
vcf_reader.filters['NoClonalSample'] = pyvcf.parser._Filter('NoClonalSample', 'Variant is not found as clonal in any of the samples based on the recalculated VAF')
# Sample filters
vcf_reader.filters['LowCov'] = pyvcf.parser._Filter('LowCov', 'Variant has a coverage <'+str(args.coverage)+' in this sample/control')
vcf_reader.filters['NoGenoType'] = pyvcf.parser._Filter('NoGenoType', 'Genotype is empty for this sample/control')
vcf_reader.filters['isRef'] = pyvcf.parser._Filter('isRef', 'Genotype is a reference (i.e. reference 0/0)')
vcf_reader.filters['isVariant'] = pyvcf.parser._Filter('isVariant', 'Genotype is a variant (i.e. not reference 0/0)')
vcf_reader.filters['LowGQ'] = pyvcf.parser._Filter('LowGQ', 'Variant has a low genome quality for this sample/control')
# Infos
vcf_reader.infos['ABSENT_SAMPLES'] = pyvcf.parser._Info('ABSENT_SAMPLES',1,'Integer','Number of samples without the variant', None, None)
vcf_reader.infos['SUBCLONAL_SAMPLES'] = pyvcf.parser._Info('SUBCLONAL_SAMPLES',1,'Integer','Number of samples with a subclonal variant', None, None)
vcf_reader.infos['CLONAL_SAMPLES'] = pyvcf.parser._Info('CLONAL_SAMPLES',1,'Integer','Number of samples with a clonal variant', None, None)
vcf_reader.infos['ABSENT_CONTROLS'] = pyvcf.parser._Info('ABSENT_CONTROLS',1,'Integer','Number of controls without the variant', None, None)
vcf_reader.infos['SUBCLONAL_CONTROLS'] = pyvcf.parser._Info('SUBCLONAL_CONTROLS',1,'Integer','Number of controls with a subclonal variant', None, None)
vcf_reader.infos['CLONAL_CONTROLS'] = pyvcf.parser._Info('CLONAL_CONTROLS',1,'Integer','Number of controls with a clonal variant', None, None)
vcf_reader.infos['ABSENT_SAMPLE_NAMES'] = pyvcf.parser._Info('ABSENT_SAMPLE_NAMES',None,'String','Samples without the variant', None, None)
vcf_reader.infos['SUBCLONAL_SAMPLE_NAMES'] = pyvcf.parser._Info('SUBCLONAL_SAMPLE_NAMES',None,'String','Samples with a subclonal variant', None, None)
vcf_reader.infos['CLONAL_SAMPLE_NAMES'] = pyvcf.parser._Info('CLONAL_SAMPLE_NAMES',None,'String','Samples with a clonal variant', None, None)
vcf_reader.infos['ABSENT_CONTROL_NAMES'] = pyvcf.parser._Info('ABSENT_CONTROL_NAMES',None,'String','Controls without the variant', None, None)
vcf_reader.infos['SUBCLONAL_CONTROL_NAMES'] = pyvcf.parser._Info('SUBCLONAL_CONTROL_NAMES',None,'String','Controls with a subclonal variant', None, None)
vcf_reader.infos['CLONAL_CONTROL_NAMES'] = pyvcf.parser._Info('CLONAL_CONTROL_NAMES',None,'String','Controls with a clonal variant', None, None)
vcf_reader.infos['PASS_QC_SAMPLES'] = pyvcf.parser._Info('PASS_QC_SAMPLES',1,'Integer','Number of samples which pass all quality control filters', None, None)
vcf_reader.infos['PASS_QC_CONTROLS'] = pyvcf.parser._Info('PASS_QC_CONTROLS',1,'Integer','Number of controls which pass all quality control filters', None, None)
vcf_reader.infos['FAIL_QC_SAMPLES'] = pyvcf.parser._Info('FAIL_QC_SAMPLES',1,'Integer','Number of samples which failed one or multiple quality control filters', None, None)
vcf_reader.infos['FAIL_QC_CONTROLS'] = pyvcf.parser._Info('FAIL_QC_CONTROLS',1,'Integer','Number of controls which failed one or multiple quality control filters', None, None)
vcf_reader.infos['PASS_QC_SAMPLE_NAMES'] = pyvcf.parser._Info('PASS_QC_SAMPLE_NAMES',None,'String','Samples which pass all quality control filters', None, None)
vcf_reader.infos['PASS_QC_CONTROL_NAMES'] = pyvcf.parser._Info('PASS_QC_CONTROL_NAMES',None,'String','Controls which pass all quality control filters', None, None)
vcf_reader.infos['FAIL_QC_SAMPLE_NAMES'] = pyvcf.parser._Info('FAIL_QC_SAMPLE_NAMES',None,'String','Samples which failed one or multiple quality control filters', None, None)
vcf_reader.infos['FAIL_QC_CONTROL_NAMES'] = pyvcf.parser._Info('FAIL_QC_CONTROL_NAMES',None,'String','Controls which failed one or multiple quality control filters', None, None)
return( vcf_reader )
def get_sample_name( bamfile ):
"""
Function to get the sample name from the bam file
Input: An AlignmentFile object of the bam file
Return: The sample or False if there is no SM tag in the bam header
"""
header = bamfile.header
sample_name = False
if 'RG' in header:
if type(header['RG']) is list:
sample_name = header['RG'][0]['SM']
else:
sample_name = header['RG']['SM']
return( sample_name )
def check_pileupread( pileupread ):
"""
Function to check a pileup read.
Returns True if the read needs to be kept and returns False if read can be skipped.
Input: Pileupread object
Return: True or False
"""
check = True
if pileupread.alignment.is_duplicate:
check = False
elif pileupread.is_del:
check = False
elif pileupread.is_refskip:
check = False
elif not pileupread.query_position:
check = False
elif pileupread.alignment.mapq < args.mapq:
check = False
elif pileupread.alignment.query_qualities[pileupread.query_position] < args.base_phred_quality:
check = False
return( check )
def update_call_data( call, edit_keys, edit_values, vcf_reader ):
"""
Function to add or update a field to the format field.
This will be automatically update in the call object
Input: A call object
Input: A list with format fields
Input: A list with format values
"""
f_keys = list(vcf_reader.formats.keys())
d = dict(call.data._asdict())
f_vals = []
for key in f_keys:
if key in edit_keys:
f_vals.append(edit_values[edit_keys.index(key)] )
elif key in d:
f_vals.append(d[key] )
else:
f_vals.append(None)
handy_dict = dict(zip(f_keys, f_vals))
f_keys.remove('GT')
f_keys.insert(0,'GT')
call.data = collections.namedtuple('CallData',f_keys)(**handy_dict)
def gmm( X ):
"""
Function to fit a GMM model with two components
Input: Array with VAF values
Return: x values
Return: List with probability values for all components
Return: List with probability values of each component
Return: List of the means of each component
Return: List of standard deviations of each component
"""
X = np.array(X)
X = np.reshape(X, [len(X), 1])
# ------------------------------------------------------------
# Learn the best-fit GMM models
# Here we'll use GMM in the standard way: the fit() method
# uses an Expectation-Maximization approach to find the best
# mixture of Gaussians for the data
# fit models with 2 components
N = np.arange(args.min_components, args.max_components)
models = [None for i in range(len(N))]
for i in range(len(N)):
models[i] = GaussianMixture(N[i]).fit(X)
# compute the AIC and the BIC
AIC = [m.aic(X) for m in models]
BIC = [m.bic(X) for m in models]
x = np.linspace(0.01, 1, 100)
x = x.reshape([len(x), 1])
M_best = models[np.argmin(AIC)]
means = M_best.means_
N = M_best.n_components
std_devs = [ np.sqrt( np.trace(M_best.covariances_[i])/N) for i in range(0,N) ]
logprob = M_best.score_samples(x)
responsibilities = M_best.predict_proba(x)
p = np.exp(logprob)
p_individual = responsibilities * p[:, np.newaxis]
r = responsibilities
r = r[:, np.argsort([u for m in means for u in m]) ]
return( x, p, p_individual, means, std_devs, r )
def solve(m1,m2,std1,std2):
"""
Function to calculate the intersection points of two distributions
Input: Mean of one distribution
Input: Mean of another distribution
Input: Standard deviation of a distribution
Input: Standard deviation of another distribution
Return: Intersection points
"""
a = 1/(2*std1**2) - 1/(2*std2**2)
b = m2/(std2**2) - m1/(std1**2)
c = m1**2 /(2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)
return np.roots([a,b,c])
def create_vaf_plot():
"""
Function to plot the VAF values
"""
# Open a multipage pdf file
with PdfPages('VAFplot.pdf') as pdf:
for sample in vaf_dict:
plt.figure(figsize=(30,10))
x, p, p_individual, means, std_devs, respons = gmm( vaf_dict[sample] )
for idx in range(0,len(x)):
x_vaf = '{0:.2f}'.format(x[idx][0])
responsibilities_dict[sample][x_vaf] = respons[idx].tolist()
ax0 = plt.subplot(1,2,1)
ax0.hist(vaf_dict[sample],bins=50)
ax1 = plt.subplot(1,2,2)
ax1.plot(x, p_individual, '--k', color='lightgray',linewidth=0.5)
ax1.plot(x, p, '-k')
ax1.axvline(x=0.5)
ax1.axvline(x=0.3)
mu = []
# Calculate the intersection points between the distributions
result = solve(means[0],means[1], std_devs[0], std_devs[1])
for r in result:
if r > 0.0 and r <= 1.0:
# ax1.axvline(x=r,linestyle='dashed',color='lightgray',linewidth=0.5)
means = np.append(means,r)
# Plot lines at the mean of each distribution
for m in means:
mu.append(str(("{0:.2f}".format(float(m)))))
ax1.axvline(x=m,linestyle='dashed',color='lightgray',linewidth=0.5)
# Plot formatting
plt.title(sample)
# plt.xlabel('VAF')
plt.ylabel('p(VAF)')
# Set second x axis for the means of each component
ax2 = ax1.twiny()
ax2.xaxis.set_tick_params(length=15)
ax2.set_xticks(means)
ax2.set_xticklabels(mu)
ax2.xaxis.set_ticks_position('bottom')
ax2.xaxis.set_label_position('bottom')
ax2.set_xlim(ax1.get_xlim())
pdf.savefig()
plt.close()
def sample_quality_control( record ):
"""
Function to check if the sample pass the quality control.
Input: VCF record object
Return: True of False if all samples or controls failed the quality control
"""
qc = collections.defaultdict(dict)
noSampleEvidence = 0
controlEvidence = False
indel = False
check = True
# Check if variant is an indel
if (len(record.REF) > 1 or len(record.ALT[0]) > 1):
indel = True
for call in (record.samples):
sample = True
update_call_data(call, ['FT'], [None], vcf_reader)
# Check is the sample is in the control list
if call.sample in args.control:
sample = False
# QC fails if there is no genotype
if call['GT'] == './.':
qc[sample][call.sample] = 'NoGenoType'
# QC fails if the coverage is too low
elif (call['DP'] == None or call['DP'] < args.coverage):
qc[sample][call.sample] = 'LowCov'
elif sample:
# If sample is homozygous reference
if call['GT'] == '0/0':
noSampleEvidence += 1
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.sample_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'PASS'
# Check QC for homozygous variant
elif call['GT'] == '1/1':
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.sample_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'PASS'
elif call['GT'] == '0/1':
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_heterozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.sample_gq_heterozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'PASS'
else:
# If variant is also found in a control
if call['GT'] == '0/1':
controlEvidence = True
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_heterozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.control_gq_heterozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'isVariant'
elif call['GT'] == '1/1':
controlEvidence = True
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.control_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'isVariant'
elif call['GT'] == '0/0':
if indel and (not call['GQ'] or call['GQ'] < args.indel_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
elif not indel and (not call['GQ'] or call['GQ'] < args.control_gq_homozygous):
qc[sample][call.sample] = 'LowGQ'
else:
qc[sample][call.sample] = 'PASS'
if call.sample in qc[sample]:
update_call_data(call,['FT'],[qc[sample][call.sample]], vcf_reader)
format_list = list(vcf_reader.formats.keys())
format_list.remove('GT')
format_list.insert(0,'GT')
# Add VAF information to the format field of each sample
record.FORMAT = ":".join(format_list)
# Flag variant if it is found in one of the controls
if controlEvidence:
record.FILTER.append('ControlEvidence')
check = False
# Flag variant if it is not found in one of the samples
elif len(qc[True]) == noSampleEvidence :
record.FILTER.append('NoSampleEvidence')
check = False
elif len(qc[False].keys()) > 0 and list(qc[False].values()).count('PASS') == 0:
record.FILTER.append('AllControlsFailedQC')
check = False
elif list(qc[True].values()).count('PASS') == 0:
record.FILTER.append('AllSamplesFailedQC')
check = False
else:
record.INFO['PASS_QC_SAMPLES'] = list(qc[True].values()).count('PASS')
record.INFO['FAIL_QC_SAMPLES'] = len(qc[True])-list(qc[True].values()).count('PASS')
record.INFO['PASS_QC_SAMPLE_NAMES'] = list(np.array(list(qc[True].keys()))[list(np.where(np.array(list(qc[True].values())) == 'PASS')[0])])
record.INFO['FAIL_QC_SAMPLE_NAMES'] = list(np.array(list(qc[True].keys()))[list(np.where(np.array(list(qc[True].values())) != 'PASS')[0])])
if ( len(qc[False].keys()) > 0 ):
record.INFO['PASS_QC_CONTROLS'] = list(qc[False].values()).count('PASS')
record.INFO['FAIL_QC_CONTROLS'] = len(qc[False])- list(qc[False].values()).count('PASS')
record.INFO['PASS_QC_CONTROL_NAMES'] = list(np.array(list(qc[False].keys()))[list(np.where(np.array(list(qc[False].values())) == 'PASS')[0])])
record.INFO['FAIL_QC_CONTROL_NAMES'] = list(np.array(list(qc[False].keys()))[list(np.where(np.array(list(qc[False].values())) != 'PASS')[0])])
return( check )
def calculate_vaf( record ):
"""
Function to calculate the VAF in the bam files
Input: VCF record object
Return: A tuple with VAF of each sample
"""
record_vaf = {}
vaf_info = collections.defaultdict(lambda: collections.defaultdict(list))
qc = collections.defaultdict(dict)
for call in (record.samples):
# Add empty VAF and CAD tag to the record
update_call_data(call, ['VAF','CAD'], [None, None], vcf_reader)
for bam in args.bam:
F=pysam.AlignmentFile(bam,'rb')
if bam not in bam_sample_names:
sample_name = get_sample_name(F)
bam_sample_names[bam] = sample_name
else:
sample_name = bam_sample_names[bam]
dv = 0
dr = 0
vaf = 0.0
# Loop through each reads that is overlapping the position of the variant
for pileupcolumn in F.pileup(record.CHROM, int(record.POS)-1, int(record.POS), truncate=True, stepper='nofilter',min_base_quality=args.base_phred_quality):
for pileupread in pileupcolumn.pileups:
# QC the read
if ( check_pileupread( pileupread) ):
alt = record.ALT[0]
# If variant is a SNV
if (len(record.REF) == 1 and len(alt) == 1):
# Read has the reference
if pileupread.alignment.query_sequence[pileupread.query_position] == record.REF:
dr+=1
# Read has the variant
elif pileupread.alignment.query_sequence[pileupread.query_position] == alt:
dv+=1
# If variant is deletion
elif (len(record.REF) > 1 and len(alt) == 1):
# Read has the deletion
if ( pileupread.indel*-1 == len(record.REF)-1 ):
dv+=1
# Read has no deletion
elif pileupread.indel == 0:
dr+=1
# If variant is an insertion
elif ( len(record.REF) == 1 and len(alt) > 1 ):
# Read has the insertion
if ( pileupread.indel == len(alt)-1 ):
dv+=1
# Read has no insertion
elif pileupread.indel == 0:
dr+=1
# If variant is an INDEL
else:
# Read has the INDEL
if ( pileupread.indel == (len(alt)-len(record.REF)) ):
dv+=1
# Read has no INDEL
elif pileupread.indel == 0:
dr+=1
# Calculate the VAF
try:
vaf = float("{0:.2f}".format(dv/float(dv+dr)))
except ZeroDivisionError:
continue
# Loop through each sample in the vcf file
for call in (record.samples):
sample = True
if call.sample in args.control:
sample = False
# Check if the sample name in the vcf file is the same as a sample name in the bam file
if call.sample == sample_name:
# Add the VAF and sample name to the output tuple
if vaf > args.absent_threshold and call.sample not in args.control:
record_vaf[call.sample] = vaf
# Update the format field for this sample
ft = call['FT']
if float(dv+dr) < args.coverage:
ft = 'LowCov'
qc[sample][call.sample] = 'LowCov'
else:
qc[sample][call.sample] = call['FT']
update_call_data(call, ['VAF','CAD','FT'], [vaf, [dr, dv], ft], vcf_reader)
# Set absent, subclonal or clonal based on the VAF and threshold
if vaf <= args.absent_threshold:
vaf_info[sample]['ABSENT'].append(call.sample)
elif vaf < args.clonal_threshold:
vaf_info[sample]['SUBCLONAL'].append(call.sample)
else:
vaf_info[sample]['CLONAL'].append(call.sample)
format_list = list(vcf_reader.formats.keys())
format_list.remove('GT')
format_list.insert(0,'GT')
# Add VAF information to the format field of each sample
record.FORMAT = ":".join(format_list)
# Add QC information to the INFO field
if len(qc[False].keys()) > 0 and list(qc[False].values()).count('PASS') == 0:
record.FILTER.append('AllControlsFailedQC')
check = False
elif list(qc[True].values()).count('PASS') == 0:
record.FILTER.append('AllSamplesFailedQC')
check = False
else:
record.INFO['PASS_QC_SAMPLES'] = list(qc[True].values()).count('PASS')
record.INFO['FAIL_QC_SAMPLES'] = len(qc[True])-list(qc[True].values()).count('PASS')
record.INFO['PASS_QC_SAMPLE_NAMES'] = list(np.array(list(qc[True].keys()))[list(np.where(np.array(list(qc[True].values())) == 'PASS')[0])])
record.INFO['FAIL_QC_SAMPLE_NAMES'] = list(np.array(list(qc[True].keys()))[list(np.where(np.array(list(qc[True].values())) != 'PASS')[0])])
if ( len(qc[False].keys()) > 0 ):
record.INFO['PASS_QC_CONTROLS'] = list(qc[False].values()).count('PASS')
record.INFO['FAIL_QC_CONTROLS'] = len(qc[False])- list(qc[False].values()).count('PASS')
record.INFO['PASS_QC_CONTROL_NAMES'] = list(np.array(list(qc[False].keys()))[list(np.where(np.array(list(qc[False].values())) == 'PASS')[0])])
record.INFO['FAIL_QC_CONTROL_NAMES'] = list(np.array(list(qc[False].keys()))[list(np.where(np.array(list(qc[False].values())) != 'PASS')[0])])
# Add clonal information to the INFO field
record.INFO['ABSENT_SAMPLES'] = len(vaf_info[True]['ABSENT'])
record.INFO['SUBCLONAL_SAMPLES'] = len(vaf_info[True]['SUBCLONAL'])
record.INFO['CLONAL_SAMPLES'] = len(vaf_info[True]['CLONAL'])
record.INFO['ABSENT_CONTROLS'] = len(vaf_info[False]['ABSENT'])
record.INFO['SUBCLONAL_CONTROLS'] = len(vaf_info[False]['SUBCLONAL'])
record.INFO['CLONAL_CONTROLS'] = len(vaf_info[False]['CLONAL'])
record.INFO['ABSENT_SAMPLE_NAMES'] = vaf_info[True]['ABSENT']
record.INFO['SUBCLONAL_SAMPLE_NAMES'] = vaf_info[True]['SUBCLONAL']
record.INFO['CLONAL_SAMPLE_NAMES'] = vaf_info[True]['CLONAL']
record.INFO['ABSENT_CONTROL_NAMES'] = vaf_info[False]['ABSENT']
record.INFO['SUBCLONAL_CONTROL_NAMES'] = vaf_info[False]['SUBCLONAL']
record.INFO['CLONAL_CONTROL_NAMES'] = vaf_info[False]['CLONAL']
# Flag variant if it is found subclonal in a control
if len(vaf_info[False]['SUBCLONAL']) > 0:
record.FILTER.append('ControlSubclonal')
# Flag variant if it is found clonal in a control
elif len(vaf_info[False]['CLONAL']) > 0:
record.FILTER.append('ControlClonal')
# Flag variant if it is not found clonal in one of the samples
elif len(vaf_info[True]['CLONAL']) == 0:
record.FILTER.append('NoClonalSample')
return( record_vaf )
def create_blacklist():
"""
Function to fill the blacklist dictionary
"""
# global blacklist
# Loop through every blacklist file given on the command line
blacklists = []
for bl_vcf in args.blacklist:
blacklist_single = pandas.read_csv(bl_vcf,
sep="\t",
comment="#",
header=None,
names=["chr", "loc"],
usecols=["chr", "loc"],
dtype={0: "str", 1: "int"})
if bl_vcf.endswith(".bed"):
blacklist_single["loc"] =+ 1 #Bed files are 0-based and are converted to 1-based.
elif not bl_vcf.endswith(".vcf|.vcf.gz"):
warnings.warn("""The blacklist: {0} is not a .vcf or .bed file. Continuing with the following assumptions:\n
Column 1: Chromosome\n
Column 2: 1-based position\n
Header/Comments: #""".format(bl_vcf))
blacklists.append(blacklist_single)
blacklist = pd.concat(blacklists)
blacklist["chr"] = blacklist["chr"].str.lower().str.replace("chr|chrom", "")
blacklist = {k: g["loc"].tolist() for k, g in blacklist.groupby("chr")}
def merge_tmp_vcfs():
"""
Function to merge all the tmp contig vcf files
"""
start = time.time()
header = False
# Loop through all chromomsomes
for contig in contig_list:
if not header:
os.system('cat tool_tmp/{}_flag.vcf > flag.vcf'.format(contig))
header = True
else:
os.system('grep -v \'^#\' tool_tmp/{}_flag.vcf >> flag.vcf'.format(contig))
os.system("grep -P '^#|\s+PASS\s+' flag.vcf > tool_tmp/filter.vcf")
def add_responsibilities():
vcf_reader = pyvcf.Reader(filename="tool_tmp/filter.vcf")
vcf_reader.formats['PC'] = pyvcf.parser._Format('PC',None,'Float','Probability of each component')
vcf_writer = pyvcf.Writer(open('filter.vcf','w'), vcf_reader)
for record in vcf_reader:
for call in (record.samples):
vaf = call['VAF']
sample = call.sample
if vaf != None and vaf > 0.0:
vaf = '{0:.2f}'.format(vaf)
update_call_data(call, ['PC'], [responsibilities_dict[sample][str(vaf)]], vcf_reader)
else:
update_call_data(call, ['PC'], [None], vcf_reader)
format_list = list(vcf_reader.formats.keys())
format_list.remove('GT')
format_list.insert(0,'GT')
# Add VAF information to the format field of each sample
record.FORMAT = ":".join(format_list)
vcf_writer.write_record(record)
os.system("rm -rf tool_tmp/*")
os.system("rm -rf tool_tmp")
if __name__ == "__main__":
#get_command_line()
main()
create_vaf_plot()
merge_tmp_vcfs()
add_responsibilities()