-
Notifications
You must be signed in to change notification settings - Fork 15
/
vcf2snapp.py
105 lines (78 loc) · 2.88 KB
/
vcf2snapp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/python
# vcf2snapp converts VCF into nexus format with binary encoding for SNPs
import argparse
from collections import OrderedDict
parser = argparse.ArgumentParser(description="Convert VCF to nexus for SNAPP")
parser.add_argument("-in", dest="infile", help="The vcftools input file",
required=True)
parser.add_argument("-chr", dest="chromossomes",
help="Use if your VCF is organized by chromossomes"
"and not by loci",
action='store_true')
arg = parser.parse_args()
def vcf2snapp(vcf_file, output_file, chromossomes):
"""
Converts VCF file into Nexus binary format to SNAPP. Ignores non-biallelic
SNPs.
"""
fh = open(vcf_file)
if chromossomes is True:
chrom_p = lambda x: " ".join(x[0:2])
else:
chrom_p = lambda x: x[0]
chroms = []
for line in fh:
# Skip header
if line.startswith("##"):
pass
elif line.startswith("#CHROM"):
# Get taxa information
taxa_list = line.strip().split()
nexus_data = OrderedDict((x, []) for x in taxa_list[9:])
elif line.strip() != "":
fields = line.strip().split()
# Get locus number. Check if this locus has already been recorded.
# If so, ignore
chrom = chrom_p(fields)
if chrom in chroms:
continue
ref_snp = fields[3]
alt_snp = fields[4]
# If SNP is not bialleic, ignore
if len(alt_snp) > 1:
continue
# Record data for each Taxon
for tx in nexus_data:
# Get genotype
gen = fields[taxa_list.index(tx)]
if "./." in gen:
nexus_data[tx].append("?")
elif "0|0" in gen or "0/0" in gen:
nexus_data[tx].append("0")
elif "1|1" in gen or "1/1" in gen:
nexus_data[tx].append("2")
elif "1|0" in gen or "0|1" in gen or "0/1" in gen or "1/0" in gen:
nexus_data[tx].append("1")
else:
chroms.append(chrom)
# Write nexus files
nexus_fh = open(output_file, "w")
# Write header
ntaxa = len(nexus_data)
nloci = len(nexus_data[tx])
nexus_fh.write("#NEXUS\nBEGIN Data;\n\tDIMENSIONS NTAX={} NCHAR={};\n\t"
"FORMAT DATATYPE=integer INTERLEAVE=no missing=?;\nMatrix\n".format(
ntaxa, nloci))
# Write Data
for tx, gens in nexus_data.items():
nexus_fh.write("{}\t{}\n".format(tx, "".join(gens)))
# Write file ending
nexus_fh.write(";\nEND;\n")
nexus_fh.close()
def main():
# Args
infile = arg.infile
output_file = infile.split(".")[0] + ".nex"
chromossomes = arg.chromossomes
vcf2snapp(infile, output_file, chromossomes)
main()