-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathagalma_format.py
88 lines (79 loc) · 3 KB
/
agalma_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#! /usr/python
# AUGUST GUANG
# MAY 2014: quick tool for concatenating sequences for homologize in agalma pipeline
# designed specifically for how I've set up indelseqgen
# JULY 2014: combined agalma_format and simmod.py to create tool for creating sequences in agalma pipeline format
# added species naming on original tree
import sys, getopt, string
import itertools
import pprint #tmp for pretty printing
import os
# separates genes for each species and places them into a species dictionary
def sepGenes(iFile, numGenes):
speciesDict = {}
geneID = 0
for i in range(0,numGenes):
f = iFile + str(i) + '_sequences.seq'
try:
with open(f, 'r') as fin:
# read lines in chunks of n
# format will be:
# >species_ID-geneID
# ACTGACTGNNNNNNN
duplicates = {}
for key, group in itertools.groupby(fin, lambda line: line.startswith('>')):
if key:
header = next(group).strip()
else:
lines=''.join(group).strip()
geneName = header + '-' + str(geneID)
# hacky way of identifying and differentiating gene duplicates
if geneName in duplicates:
geneName = geneName + '-01'
else:
duplicates[geneName] = 0
if header in speciesDict:
speciesDict[header].append((geneName, lines))
else:
speciesDict[header] = [(geneName, lines)]
geneID = geneID + 1
except:
print geneID, " gene family not found, skipping"
geneID = geneID + 1
return speciesDict
# writes all genes for each species with accompanying species ID
def writeSpecies(directory, speciesDict):
i = 0
for speciesKey in speciesDict:
out = os.path.join(directory, speciesKey[1:] + '.fa')
f = open(out, 'w')
for seq in speciesDict[speciesKey]:
f.write(seq[0] + '\n')
f.write(seq[1] + '\n')
f.close()
def main(argv):
numGenes = 0
inExt = ''
directory = ''
# parse command line options
try:
opts, args = getopt.getopt(sys.argv[1:], "hi:d:n:", ["ifile="])
except getopt.error, msg:
print 'agalma_format.py -i <sequenceFile> -n <numGenes> -d <directory>'
sys.exit(2)
# process arguments
iFile = []
for opt, arg in opts:
if opt in ("-i", "--inFile"):
# extension for sequence files
inExt = arg
if opt in ("-d", "--directory"):
# directory for out files
directory = arg
if opt in ("-n", "--numGenes"):
numGenes = int(arg)
speciesDict = sepGenes(inExt, numGenes)
writeSpecies(directory, speciesDict)
# pprint.pprint(speciesDict)
if __name__ == "__main__":
main(sys.argv[1:])