# missing_data.py
import sys
import glob
import os
from Bio import SeqIO
from Bio import AlignIO
# Build a taxon-by-locus completeness matrix for a folder of FASTA files.
#
# Every "*.fas" file in the given folder is treated as one locus.  For each
# sequence record the script computes the fraction of characters that are
# not "-", "?" or "N"/"n", and writes the resulting table to
# "missing_data.csv" in the current working directory.  A taxon that does not
# occur in a locus file gets the literal value 0 in that column.
#
# The code avoids print statements and other version-specific syntax so that
# it runs unchanged on both Python 2 and Python 3.


def completeness(sequence):
    """Return the fraction of `sequence` that is not missing data.

    The characters "-", "?" and "N" (case-insensitive) count as missing.
    `sequence` may be a plain string or any object whose ``str()`` is the
    sequence text.  An empty sequence yields 0.0 instead of raising
    ZeroDivisionError.

    NOTE(review): counting "N" as missing presumably assumes nucleotide
    alignments -- confirm before using this on protein data.
    """
    residues = str(sequence)
    if not residues:
        return 0.0
    informative = (
        residues.replace("-", "").replace("?", "").upper().replace("N", "")
    )
    # float() keeps this a true division on Python 2 as well.
    return len(informative) / float(len(residues))


def scan_loci(paths):
    """Parse each FASTA file in `paths` and collect per-taxon completeness.

    Returns a tuple ``(taxa, loci, coverage)`` where `taxa` is the set of all
    record ids seen, `loci` is the list of file base names in the order
    given, and `coverage` maps each locus name to a ``{record id: fraction}``
    dict.  If a record id occurs twice in one file, the last one wins.
    """
    taxa = set()
    loci = []
    coverage = {}
    for path in paths:
        # basename() handles the platform's path separator, unlike split("/").
        locus = os.path.basename(path)
        per_taxon = {}
        # The context manager closes the file even if parsing raises.
        with open(path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                taxa.add(record.id)
                per_taxon[record.id] = completeness(record.seq)
        coverage[locus] = per_taxon
        loci.append(locus)
    return taxa, loci, coverage


def format_rows(taxa, loci, coverage):
    """Return the CSV lines (header first) for the completeness matrix.

    Rows are emitted in sorted taxon order so the output is reproducible.
    A taxon missing from a locus is written as ``0``.
    """
    lines = ["taxon," + ",".join(loci)]
    for taxon in sorted(taxa):
        cells = [taxon]
        for locus in loci:
            cells.append(str(coverage[locus].get(taxon, 0)))
        lines.append(",".join(cells))
    return lines


def main(argv):
    """Run the script: ``argv`` is ``[program, folder]``.

    Exits with status 1 and a usage message on stderr when the argument
    count is wrong.
    """
    if len(argv) != 2:
        sys.stderr.write("FORMAT: python missing_data.py [folder]\n")
        sys.stderr.write("EXAMPLE: missing_data.py ./fasta\n")
        sys.exit(1)
    inputfolder = argv[1]

    # glob returns files in arbitrary order; sort for stable column order.
    files = sorted(glob.glob(os.path.join(inputfolder, "*.fas")))
    if not files:
        sys.stderr.write(
            "WARNING: no .fas files found in " + inputfolder + "\n"
        )

    taxaset, locilist, maindict = scan_loci(files)

    with open("missing_data.csv", "w") as outhandle:
        outhandle.write("\n".join(format_rows(taxaset, locilist, maindict)))
        outhandle.write("\n")


if __name__ == "__main__":
    main(sys.argv)