-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathqualcheck.py
executable file
·168 lines (142 loc) · 5.37 KB
/
qualcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
import sys, gzip
from math import pow
import Script
def decode(q):
return ord(q) - 33
class Stats(object):
filename = ""
nreads = 0
nbases = 0 # Total bases seen
sumqual = 0 # Sum of qualities (for average)
minqual = 100 # Minumum quality seen
maxqual = 0 # Maximum quality seen
minreadqual = 100 # Minimum per-read quality seen
maxreadqual = 0 # Maximum per-read quality seen
def __init__(self, filename):
self.filename = filename
def add(self, quals):
if not quals:
return
nb = 0 # number of bases in this read
sq = 0 # sum of qualities in this read
self.nreads += 1
for q in quals:
qv = decode(q)
self.nbases += 1
self.sumqual += qv
self.minqual = min(self.minqual, qv)
self.maxqual = max(self.maxqual, qv)
nb += 1
sq += qv
readqual = 1.0 * sq / nb
self.minreadqual = min(self.minreadqual, readqual)
self.maxreadqual = max(self.maxreadqual, readqual)
def report(self, out=sys.stdout):
avgqual = 1.0 * self.sumqual / self.nbases
err_rate = pow(10, -avgqual/10.0)
out.write("== {} ==\n".format(self.filename))
out.write("Number of reads: {}\n".format(self.nreads))
out.write("Number of bases: {}\n".format(self.nbases))
out.write("Average read length: {:.3f}\n".format(1.0 * self.nbases / self.nreads))
out.write("Overall average quality: {:.3f}\n".format(avgqual))
out.write("Quality range: {} - {}\n".format(self.minqual, self.maxqual))
out.write("Per-read quality range: {} - {}\n".format(self.minreadqual, self.maxreadqual))
out.write("Average error rate: {}\n".format(err_rate))
out.write("Expected number of errors: {:.2f}\n\n".format(self.nbases * err_rate))
out.flush()
def reportCSV(self, out):
avgqual = 1.0 * self.sumqual / self.nbases
err_rate = pow(10, -avgqual/10.0)
out.write("{}\t{}\t{}\t{:.3f}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\n".format(
self.filename, self.nreads, self.nbases, 1.0 * self.nbases / self.nreads, avgqual,
self.minqual, self.maxqual, self.minreadqual, self.maxreadqual, err_rate,
self.nbases * err_rate))
out.flush()
def collect(fastqfile, maxreads=0, T=None):
nr = 0
S = Stats(fastqfile)
with gzip.open(fastqfile, "rt") as f:
while True:
try:
read = f.readline()
if not read:
break
read = f.readline()
read = f.readline()
quals = f.readline().strip("\r\n")
S.add(quals)
if T:
T.add(quals)
nr += 1
if nr == maxreads:
break
except KeyboardInterrupt:
break
return S
# S.report()
class Main(Script.Script):
outfile = ""
maxreads = 0
fastqfiles = []
def run(self, args):
self.standardOpts(args)
self.parseArgs(args, "+n,+o")
self.outfile = self.getOpt("o")
self.maxreads = int(self.getOpt("n", 0))
self.fastqfiles = self.getArgs()
if self.fastqfiles:
T = None
q = []
if self.outfile:
if self.outfile == "-":
self.outfile = "/dev/stdout"
out = open(self.outfile, "w")
out.write("#Filename\tNumber of reads\tNumber of bases\tAvg read length\tOverall avg qual\tMin qual\tMax qual\tMin readqual\tMax readqual\tError rate\tExpected errors\n")
mode = "csv"
else:
out = sys.stdout
mode = "txt"
try:
if len(self.fastqfiles) > 1:
T = Stats("Total")
for fq in self.fastqfiles:
S = collect(fq, maxreads=self.maxreads, T=T)
if mode == "csv":
S.reportCSV(out)
else:
S.report(out)
if T:
if mode == "csv":
T.reportCSV(out)
else:
T.report(out)
finally:
out.close()
else:
usage()
def usage(what):
sys.stdout.write("""qualcheck.py - Compute quality stats on fastq files.
Usage: qualcheck.py [options] fastqfiles...
Options:
-n N | Examine the first N reads
-o O | Write results in tab-delimited format to file O
This program reads one or more fastq files, printing out the following information for each:
Number of reads
Number of bases
Average read length
Overall average quality
Range of quality scores
Per-read quality range
Average error rate
Expected number of errors
"Per-read quality" is the average quality score over the whole read.
"Average error rate" is the probability of error corresponding to the
overall average quality Q (= 10^(-Q/10)).
Processing can be interrupted with Ctrl-C, and the program will print
the statistics computed up to that point.
""")
if __name__ == "__main__":
args = sys.argv[1:]
M = Main("qualcheck.py", version="1.0", usage=usage)
M.run(args)