-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfconv.py
111 lines (108 loc) · 4.57 KB
/
fconv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from Bio import AlignIO
from Bio import SeqIO
from Bio.Alphabet import IUPAC, Gapped
import sys
import glob
import os
if len(sys.argv) == 8:
option = sys.argv[1]
inputfolder = sys.argv[2]
inputformat = sys.argv[3]
inputext = sys.argv[4]
outputfolder = sys.argv[5]
outputformat = sys.argv[6]
outputext = sys.argv[7]
elif len(sys.argv) == 6:
option = sys.argv[1]
infilename = sys.argv[2]
inputformat = sys.argv[3]
outputformat = sys.argv[4]
outputext = sys.argv[5]
else:
print "fconv.py script for converting aligned or unaligned sequence files"
print "-----------folder input------------"
print "FORMAT: python fconv.py [option: -a, -s, -print] [inputfolder] [inputformat] [inputext] [outputfolder] [outputformat] [outputext]"
print "EXAMPLE: python fconv.py -a ./fasta fasta .fas ./phylip phylip-relaxed .phy"
print "------------file input-------------"
print "FORMAT: python fconv.py [option: -a, -s, -print] [inputfile] [inputformat] [outputformat] [outputext]"
print "EXAMPLE: python fconv.py -a ./test.fas fasta phylip-relaxed .phy"
print "--------general manual--------"
print "for options -a (AlignIO) and -s (SeqIO) see format options in the corresponding biopython module manual. Some are listed below"
print "fasta - fasta format"
print "phylip - basic phylip with truncated names"
print "phylip-relaxed - extended interleaved phylip (only in -a mode)"
print "option -print in conjunction with output format set to fasta writes fasta file directly"
print "option -print in conjunction with output format set to phylip-relaxed writes non-interleaved phylip-relaxed file directly"
sys.exit()
def check_alphabet(filename, fformat):#code for this function is modified from https://stackoverflow.com/questions/41588725/estimate-alphabet-in-biopython-from-fasta-file/41596563#41596563
alphabets = [IUPAC.ambiguous_dna, IUPAC.protein]#[ambiguous_dna, extended_protein]#, unambiguous_dna, extended_dna, ambiguous_rna, unambiguous_rna]
first_record = list(SeqIO.parse(filename, fformat))[0]
detected = ""
#check DNA first:
leftover = set(str(first_record.seq).upper()) - set(alphabets[0].letters) - set(["-", "?"])
if len(leftover) == 0:
detected = "DNA"
print "Detected alphabet: DNA"
else:
leftover = set(str(first_record.seq).upper()) - set(alphabets[1].letters) - set(["-", "?","X"])
if len(leftover) == 0:
detected = "Prot"
print "Detected alphabet: Protein"
else:
print "Error: unknown alphabet, problematic symbols:", leftover
sys.exit()
return detected
if len(sys.argv) == 8:
files = glob.glob(inputfolder+"/*"+inputext)
extlen = len(inputext)
if not os.path.exists ("./"+outputfolder):
os.makedirs("./"+outputfolder)
else:
files = [infilename]
extlen = 0
outputfolder = "."
count = 0
for f in files:
fnew = f.split("/")
fn = fnew[len(fnew)-1]
print "basename detected:", fn[:(len(fn)-extlen)]
print "output file:", outputfolder+"/"+fn[:(len(fn)-extlen)]+outputext
input_handle = open(f, "rU")
output_handle = open(outputfolder+"/"+fn[:(len(fn)-extlen)]+outputext, "w")
if option == "-a":
if outputformat == "nexus" or inputformat == "nexus":
alph = check_alphabet(input_handle, inputformat)
input_handle.seek(0)
if alph == "DNA":
alignments = AlignIO.parse(input_handle, inputformat, alphabet = Gapped(IUPAC.ambiguous_dna))
elif alph == "Prot":
alignments = AlignIO.parse(input_handle, inputformat, alphabet = Gapped(IUPAC.protein, '-'))
else:
alignments = AlignIO.parse(input_handle, inputformat)
#print list(alignments)
AlignIO.write(alignments, output_handle, outputformat)
elif option == "-s":
if outputformat == "nexus" or inputformat == "nexus":
alph = check_alphabet(input_handle, inputformat)
input_handle.seek(0)
if alph == "DNA":
sequences = SeqIO.parse(input_handle, inputformat, alphabet = Gapped(IUPAC.ambiguous_dna))
elif alph == "Prot":
sequences = SeqIO.parse(input_handle, inputformat, alphabet = Gapped(IUPAC.protein, '-'))
else:
sequences = SeqIO.parse(input_handle, inputformat)
SeqIO.write(sequences, output_handle, outputformat)
elif option == "-print" and outputformat == "fasta":
sequences = SeqIO.parse(input_handle, inputformat)
for seq in sequences:
print >> output_handle, ">"+seq.id, "\n", seq.seq
elif option == "-print" and outputformat == "phylip-relaxed":
alignments = AlignIO.read(input_handle, inputformat)
print >> output_handle, str(len(alignments))+" "+str(alignments.get_alignment_length())
for seq in alignments:
print >> output_handle, str(seq.id)+" "+str(seq.seq)
output_handle.close()
input_handle.close()
count += 1
print "Converted %i records" % count
print "Done"