-
Notifications
You must be signed in to change notification settings - Fork 93
/
parseVCF.py
executable file
·391 lines (295 loc) · 17.6 KB
/
parseVCF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python
#Various functions to parse and filter genotype data from vcf files.
#If run independently it will pipe "genotypes" format to stdout.
import argparse, sys, gzip, re, subprocess
from collections import defaultdict
import numpy as np
def GTtype(alleles):
alleleSet = set(alleles)
if len(alleleSet) > 1: return "Het"
elif "0" in alleleSet: return "HomRef"
elif "." in alleleSet: return "Missing"
else: return "HomAlt"
re_cigar = re.compile(r"\d+|[MXDI]")
re_phaser = re.compile("[/|]")
def simplifyAlt(alt, cigar, missing="N"):
l = re_cigar.findall(cigar)
i = 0
simp = ""
try:
for x in range(0,len(l),2):
label = l[x+1]
n = int(l[x])
if label == "M" or label == "X":
#we have a matching stretch
simp += alt[i:i + n]
i+=n
elif label == "I":
#insertion, so just advance index
i += n
elif label == "D":
#deletion, so add missing without advancing
simp += "".join([missing]*n)
except:
raise ValueError("Malformed CIGAR: " + cigar)
return(simp)
class VcfSite:
__slots__ = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'REFlen', 'nALT', 'lenMatchDict', 'QUAL', 'FILTER', 'INFO', 'sampleNames', 'genoData', "alleleDict"]
def __init__(self, elements=None, line=None, headers=None, headerLine=None, precompGenoData=None,
parseINFO=False, simplifyALT=False):
assert((elements != None or line != None) and (headers != None or headerLine != None))
if not headers: headers = headerLine.split()
if not elements: elements = line.split()
lineDict = dict(zip(headers,elements))
self.CHROM = lineDict["#CHROM"]
self.POS = int(lineDict["POS"])
self.ID = lineDict["ID"]
self.REF = lineDict["REF"]
self.REFlen = len(self.REF)
self.ALT = lineDict["ALT"].split(",") if lineDict["ALT"] != "." else []
self.nALT = len(self.ALT)
self.QUAL = lineDict["QUAL"]
self.FILTER = lineDict["FILTER"]
if parseINFO or simplifyALT:
self.INFO = dict([x.split("=") for x in lineDict["INFO"].split(";")])
if simplifyALT:
cigars = self.INFO["CIGAR"].split(",")
for x in range(self.nALT):
self.ALT[x] = simplifyAlt(self.ALT[x], cigars[x])
self.alleleDict = dict(zip([str(i) for i in range(self.nALT+1)], [self.REF] + self.ALT))
self.lenMatchDict = dict([(key, len(self.alleleDict[key]) == self.REFlen) for key in self.alleleDict])
genoInfoNames = lineDict["FORMAT"].split(":")
self.sampleNames = headers[9:]
self.genoData = {}
for sampleName in self.sampleNames:
#if pre-compiled genotype data are available, try using those
#need to check is both the format headings AND individual data match. So use a tuple as index
if precompGenoData and (lineDict["FORMAT"], lineDict[sampleName],) in precompGenoData:
self.genoData[sampleName] = precompGenoData[(lineDict["FORMAT"], lineDict[sampleName],)]
else:
#otherwise make dictionary for this sample
self.genoData[sampleName] = dict(zip(genoInfoNames, lineDict[sampleName].split(":")))
if "GT" in self.genoData[sampleName]:
self.genoData[sampleName]["alleles"] = tuple(re_phaser.split(self.genoData[sampleName]["GT"]))
self.genoData[sampleName]["phase"] = "|" if "|" in self.genoData[sampleName]["GT"] else "/"
if precompGenoData["__counter__"] < precompGenoData["__maxSize__"]:
precompGenoData[(lineDict["FORMAT"], lineDict[sampleName],)] = self.genoData[sampleName]
precompGenoData["__counter__"] += 1
def getSiteType(self):
if len(self.ALT) == 0: return 'MONO'
if np.all(list(self.lenMatchDict.values())): return 'SNP'
return 'INDEL'
def getGenotype(self, sample, gtFilters = [], withPhase=True, asNumbers = False, missing = None,
allowOnly=None, mustMatchREFlen=False, keepPartial=False, ploidy=None, ploidyMismatchToMissing=False, expandMulti=False):
genoData = self.genoData[sample]
if missing is None:
if asNumbers: missing = "."
else:
missing = "N" if not expandMulti or self.REFlen==1 else ["N"]*self.REFlen
#check each gt filter
passed = True
for gtFilter in gtFilters:
#first check that it's applicable
if ("siteTypes" in gtFilter and self.getSiteType() not in gtFilter["siteTypes"]): continue
if ("gtTypes" in gtFilter and GTtype(genoData["alleles"]) not in gtFilter["gtTypes"]): continue
if ("samples" in gtFilter and sample not in gtFilter["samples"]): continue
#now check that it passes
#might be a single value, nut could be several separated by commas. So will split in case
try:
values = np.array(genoData[gtFilter["flag"]].split(","), dtype=float)
passed = np.all(gtFilter["min"] <= values) and np.all(values <= gtFilter["max"])
except: passed = False
#try: passed = gtFilter["min"] <= float(genoData[gtFilter["flag"]]) <= gtFilter["max"]
#except: passed = False
if not passed: break
if ploidy is None: ploidy=len(genoData["alleles"])
elif ploidy != len(genoData["alleles"]):
if ploidyMismatchToMissing:
passed=False
else:
raise ValueError("Sample {} at {}:{} genotype {} does not match explected ploidy of {}".format(sample, self.CHROM, self.POS,
self.genoData[sample]["GT"], ploidy))
if passed:
if not asNumbers:
try:
#retrieve alleles, but check if lengths must match and if they don't add a missing allele
sampleAlleles = [self.alleleDict[a] if (not mustMatchREFlen or self.lenMatchDict[a]) else missing for a in genoData["alleles"]]
if allowOnly: sampleAlleles = [a if a in allowOnly else missing for a in sampleAlleles]
if not keepPartial: sampleAlleles = sampleAlleles if missing not in sampleAlleles else [missing]*ploidy
except:
sampleAlleles = [missing]*ploidy
else:
sampleAlleles = genoData["alleles"][:]
else: sampleAlleles = [missing]*ploidy
sep = genoData["phase"] if withPhase else ""
if expandMulti:
return tuple(sep.join([a[i] for a in sampleAlleles]) for i in range(self.REFlen))
return sep.join(sampleAlleles)
def getGenotypes(self, gtFilters = [], asList = False, withPhase=True, asNumbers = False,
samples = None, missing = None, allowOnly=None, mustMatchREFlen=False,
keepPartial=False, ploidyDict=None, ploidyMismatchToMissing=False, expandMulti=False):
if not samples: samples = self.sampleNames
output = {}
for sample in samples:
ploidy = ploidyDict[sample] if ploidyDict is not None else None
output[sample] = self.getGenotype(sample, gtFilters=gtFilters, withPhase=withPhase, asNumbers=asNumbers,
missing=missing, allowOnly=allowOnly, mustMatchREFlen=mustMatchREFlen,
keepPartial=keepPartial, ploidy=ploidy, ploidyMismatchToMissing=ploidyMismatchToMissing,
expandMulti=expandMulti)
if asList: return [output[sample] for sample in samples]
return output
def getGenoField(self, field, samples = None, missing=None):
if missing is None: missing = "."
if samples is None: samples = self.sampleNames
fields = []
for sample in samples:
try: fields.append(self.genoData[sample][field])
except: fields.append(missing)
return fields
def parseHeaderLines(fileObj):
output = {}
output["contigs"] = []
output["contigLengths"] = {}
for line in fileObj:
if line.startswith("##contig"):
contigDataDict = dict([x.split("=", maxsplit=1) for x in re.split('<|>', line)[1].split(",")])
elements = re.split('=|,|>', line)
output["contigs"].append(contigDataDict["ID"])
try: output["contigLengths"][contigDataDict["ID"]] = int(contigDataDict["length"])
except: output["contigLengths"][contigDataDict["ID"]] = None
if line.startswith("#CHROM"):
output["mainHead"] = line
elements = line.split()
output["sampleNames"] = line.split()[9:]
output["nSamples"] = len(output["sampleNames"])
output["mainHeaders"] = elements
break
return output
def getHeadData(fileName):
with gzip.open(fileName, "rt") if fileName.endswith(".gz") else open(fileName, "rt") as fileObj:
return parseHeaderLines(fileObj)
def parseVcfSites(lines, mainHeaders, precomp=True, precompMaxSize=10000, excludeDuplicates=False, parseINFO=False, simplifyALT=False):
if precomp:
precompGenoData = {}
precompGenoData["__maxSize__"] = precompMaxSize
precompGenoData["__counter__"] = 0
else: precompGenoData = None
if excludeDuplicates: lastChrom = lastPos = None
for elements in lines:
if isinstance(elements, str): elements = elements.split()
if len(elements) == 0 or elements[0][0] == "#": continue
if excludeDuplicates:
if elements[0] == lastChrom and elements[1] == lastPos: continue
lastChrom = elements[0]
lastPos = elements[1]
yield VcfSite(elements=elements, headers=mainHeaders, precompGenoData=precompGenoData, parseINFO=parseINFO, simplifyALT=simplifyALT)
def canFloat(string):
try: float(string)
except: return False
return True
def parseGenotypeFilterArg(arg):
try:
gtfDict = dict([tuple(i.split("=")) for i in arg])
for key in gtfDict.keys():
assert key in ["flag","min","max", "siteTypes", "gtTypes", "samples"]
for key in ["siteTypes", "gtTypes", "samples"]:
if key in gtfDict: gtfDict[key] = gtfDict[key].split(",")
gtfDict["min"] = float(gtfDict["min"]) if "min" in gtfDict else -np.inf
gtfDict["max"] = float(gtfDict["max"]) if "max" in gtfDict else np.inf
return gtfDict
except: raise ValueError("Bad genotype filter specification. See help.")
def addArgs(parser, requireInfile=False):
parser.add_argument("-o", "--outFile", help="Output csv file", action = "store")
#specific samples
parser.add_argument("-s", "--samples", help="sample names (separated by commas)", action='store')
#contigs
parser.add_argument("--include", help="include contigs (separated by commas)", action='store')
parser.add_argument("--includeFile", help="File of contigs (one per line)", action='store')
parser.add_argument("--exclude", help="exclude contigs (separated by commas)", action='store')
parser.add_argument("--excludeFile", help="File of contigs (one per line)", action='store')
#vcf parsing arguments
parser.add_argument("--minQual", help="Minimum QUAL for a site", type=int, action = "store")
parser.add_argument("--gtf", help="Genotype filter. Syntax: flag=X min=X max=X siteTypes=X,X.. gtTypes=X,X.. samples=X,X..", action = "append", nargs = '+')
parser.add_argument("--skipIndels", help="Skip indels", action = "store_true")
parser.add_argument("--excludeDuplicates", help="Only include the first in a series of duplicated positions", action = "store_true")
parser.add_argument("--simplifyALT", help="Simplify multi-site alternate alleles using CIGAR (as in Freebayes output)", action = "store_true")
parser.add_argument("--expandMulti", help="Expand multi-site alleles (also Sets simplifyALT to True)", action = "store_true")
parser.add_argument("--maxREFlen", help="Maximum length for refernece allele", action = "store", type=int)
parser.add_argument("--ploidy", help="Ploidy for each sample", action = "store", type=int, default=2)
parser.add_argument("--ploidyFile", help="File with samples names and ploidy as columns", action = "store")
parser.add_argument("--ploidyMismatchToMissing", help="Set genotypes with mismatched ploidy to missing", action = "store_true")
parser.add_argument("--keepPartial", help="Keep genotypes where some but not all alleles are missing", action = "store_true")
parser.add_argument("--addRefTrack", help="Add a third column with the header REF and the reference allele", action = "store_true")
parser.add_argument("--noHeader", help="Output without header line", action = "store_true")
parser.add_argument("--field", help="Optional - format field to extract", action = "store")
parser.add_argument("--missing", help="Value to use for missing data", action = "store")
parser.add_argument("--outSep", help="Output separator", action = "store", default = "\t")
def parseIncludeExcludeArgs(args):
include = []
exclude = []
if args.include: include += args.include.split(",")
if args.exclude: exclude += args.exclude.split(",")
if args.includeFile:
with open(args.includeFile, 'rt') as includeFile:
include += [c.strip() for c in includeFile.read().split("\n")]
if args.excludeFile:
with open(args.excludeFile, 'rt') as excludeFile:
exclude += [c.strip() for c in excludeFile.read().split("\n")]
if len(include) >= 1:
include = set(include)
sys.stderr.write("{} contigs will be included.".format(len(include)))
if len(exclude) >= 1:
exclude = set(exclude)
sys.stderr.write("{} contigs will be excluded.".format(len(exclude)))
return (include,exclude,)
###############################################################################################################
if __name__ == "__main__":
### parse arguments
parser = argparse.ArgumentParser()
addArgs(parser)
parser.add_argument("-i", "--inFile", help="Input vcf file", action = "store")
args = parser.parse_args()
samples = args.samples.split(",") if args.samples else None
include,exclude = parseIncludeExcludeArgs(args)
gtFilters = [parseGenotypeFilterArg(gtf) for gtf in args.gtf] if args.gtf else []
simplifyALT = args.simplifyALT or args.expandMulti
##########################################################################################################################
### open files
if args.inFile: inFile = gzip.open(args.inFile, "rt") if args.inFile.endswith(".gz") else open(args.inFile, "rt")
else: inFile = sys.stdin
if args.outFile: outFile = gzip.open(args.outFile, "wt") if args.outFile.endswith(".gz") else open(args.outFile, "wt")
else: outFile = sys.stdout
#header data
headData = parseHeaderLines(inFile)
#check specified samples are in first file. Otherwise use this entire set
if samples:
for sample in samples: assert sample in headData["sampleNames"], "Sample {} not in VCF header\n".format(sample)
else: samples = headData["sampleNames"]
ploidyDict = defaultdict(lambda: args.ploidy)
if args.ploidyFile:
with open(args.ploidyFile, "rt") as pf: ploidyDict.update(dict([[s[0],int(s[1])] for s in [l.split() for l in pf]]))
##########################################################################################################################
if not args.noHeader:
first_columns = ["#CHROM", "POS"]
if args.addRefTrack: first_columns.append("REF")
outFile.write(args.outSep.join(first_columns + samples) + "\n")
for vcfSite in parseVcfSites(inFile, headData["mainHeaders"], excludeDuplicates=args.excludeDuplicates, simplifyALT=simplifyALT):
if (exclude and vcfSite.CHROM in exclude) or (include and vcfSite.CHROM not in include): continue
if args.minQual and canFloat(vcfSite.QUAL) and float(vcfSite.QUAL) < args.minQual: continue
if args.maxREFlen and len(vcfSite.REF) > args.maxREFlen: continue
if args.field is not None: output = vcfSite.getGenoField(args.field,samples=samples, missing=args.missing)
else:
output = vcfSite.getGenotypes(gtFilters,asList=True,withPhase=True,samples=samples,missing=args.missing,
mustMatchREFlen=args.skipIndels,keepPartial=args.keepPartial,ploidyDict=ploidyDict,
ploidyMismatchToMissing=args.ploidyMismatchToMissing,expandMulti=args.expandMulti)
#if we expanded multi-site genotypes, we need to write multiple lines
if args.expandMulti:
for x in range(vcfSite.REFlen):
first_columns = [vcfSite.CHROM, str(vcfSite.POS + x)]
if args.addRefTrack: first_columns.append(vcfSite.REF[x])
outFile.write(args.outSep.join(first_columns + [o[x] for o in output]) + "\n")
continue
first_columns = [vcfSite.CHROM, str(vcfSite.POS)]
if args.addRefTrack: first_columns.append(vcfSite.REF)
outFile.write(args.outSep.join(first_columns + output) + "\n")
outFile.close()