-
Notifications
You must be signed in to change notification settings - Fork 4
/
mergeRepStats.py
executable file
·120 lines (106 loc) · 3.81 KB
/
mergeRepStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/env python
'''
mergeRepStats.py
Copyright (c) 2015, David Edwards, Bernie Pope, Kat Holt
All rights reserved. (see README.txt for more details)
merges the general 'replicon' statistics for two set of reads.
example:
python mergeRepStats.py <new_replicon_RepStats.tab> sdOutgroupMutiplier reads_to_replace <mergeDirectory> runType
Created: 23/10/2012
Modified: 28/10/2013 to mergeRepStats from mergeStats
15/04/2014 changed to produce outgroup.txt file if there are any outgroups to report
20/05/2014 fix to outgroup reporting
04/07/2014 change to replace_reads handling
author: David Edwards
'''
import sys, glob
from pipe_utils import splitPath
inFileName = sys.argv[1]
(inPrefix, inName, inExt) = splitPath(inFileName)
sdOutgroupMutiplier = int(sys.argv[2])
mergeFileName = sys.argv[4] + inName + inExt
replace = sys.argv[3]
runType = sys.argv[5]
outgroup_outfile_name = sys.argv[4] + inName[:-9] + '_outgroups.txt'
outgroups = []
average = 0
sd = 0
count = 0
output = ""
# Combine the two sets
# note: only need to count SNPs for 'phylogeny' runType
# First get the merge run list
# setting any reads in the replace list to "fail"
mergeFile = open(mergeFileName, "r")
for line in mergeFile:
items = line.split()
if items[0] != "Isolate":
if items[-1] != "f":
# if there is a replaceRead list and the read appears on the list
# change it to a fail...
if replace != "":
for name in replace.split(","):
if items[0] == name:
line = line[:-2] + "f\n"
items = line.split()
if items[-1] != "f" and runType=='phylogeny':
count += 1
number = float(items[6]) / (float(items[1])/100)
average += number
sd += (number*number)
output += line
mergeFile.close()
# then add the new run stats.tab leaving off the header
inFile = open(inFileName)
for line in inFile:
items = line.split()
if items[0] != "Isolate":
if items[-1] != "f" and runType=='phylogeny':
count += 1
number = float(items[6]) / (float(items[1])/100)
average += number
sd += (number*number)
output += line
inFile.close()
# work out new mean and sd
# Note: 'pangenome' runType will 'skip' this as count == 0
if count > 0:
average = average/(count*1.0)
sd = ((sd*1.0)/count - average*average)
if sd < 0:
sd = sd * -1
sd = sd**(1/2.0)
# identify ingroups/outgroups in the merged set for 'phylogeny' runType
finalOutput = ""
for line in output.split("\n"):
if line != "":
items = line.split()
if items[0] == "Isolate" or items[-1] == "f":
finalOutput += line + "\n"
elif runType == 'phylogeny':
number = float(items[6]) / (float(items[1])/100)
# if number < (average - (sd * sdOutgroupMutiplier)) or number > (average + (sd * sdOutgroupMutiplier)):
if number > (average + (sd * sdOutgroupMutiplier)):
outgroups.append(items[0])
if items[-1] != "o":
finalOutput += line[:-1] + "o\n"
else:
finalOutput += line + "\n"
else:
if items[-1] != "i":
finalOutput += line[:-1] + "i\n"
else:
finalOutput += line + "\n"
else:
if items[-1] != "i":
finalOutput += line[:-1] + "i\n"
else:
finalOutput += line + "\n"
if outgroups != []:
outgroup_outfile = open(outgroup_outfile_name,"w")
for outgroup in outgroups:
outgroup_outfile.write(outgroup+'\n')
outgroup_outfile.close()
mergeFile = open(mergeFileName, "w")
mergeFile.writelines(finalOutput)
mergeFile.close()