#!/usr/bin/env python
import sys, csv

# Utils

def getZS(line):
    """Return the ZS tag field from a SAM record (given as a list of fields), or "" if missing."""
    for v in line[11:]:
        if v[:2] == "ZS":
            return v
    return ""
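# Note: bsmap writes the read strand as an optional tag of the form ZS:Z:XY, where XY is one of
# ++, +-, -+ or -- (see the table at the bottom of this file). With that layout, character 5 of
# the tag string is the strand character, which processSAM() below uses to choose between the
# top- and bottom-strand checks.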
# Main class

class CHHFilter(object):
    fastafile = ""
    maxhits = 3
    ref = []

    def usage(self):
        sys.stdout.write("""CHHFilter.py - Filter aligned BS reads based on consecutive unconverted CHH sites

Usage: CHHFilter.py [options] reference.fa

Where reference.fa is the fasta file that the reads were aligned to. Options:

  -m M | Discard a read if M consecutive unconverted sites are found (default: {})

This program is designed to work as a filter, receiving SAM lines on standard input and writing filtered SAM
lines to standard output. To filter a BAM file to another BAM file, use:

  samtools view -h input.bam | CHHFilter.py reference.fa | samtools view -b > output.bam

When processing is done, the program writes the number of input reads and the number and percentage of
filtered reads to standard error.

Notes:
* This program requires reads to have the ZS tag, added by bsmap to indicate the read strand.
* The reference sequence should have a samtools .fai index (generated by samtools faidx reference.fa).
* Information on read pairing may not be correct after filtering: if a read is discarded and its mate
  is not, the mate will still be flagged as being properly paired. This can be fixed with the Picard
  FixMateInformation tool.
""".format(self.maxhits))
    def parseArgs(self, args):
        prev = ""
        for a in args:
            if prev == "-m":
                self.maxhits = int(a)
                prev = ""
            elif a in ["-m"]:
                prev = a
            else:
                self.fastafile = a
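    # For reference, a samtools .fai index is tab-delimited with one line per sequence:
    #   NAME  LENGTH  OFFSET  LINEBASES  LINEWIDTH
    # loadReference() below only needs LENGTH and OFFSET (the byte offset of the
    # sequence's first base in the fasta file).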
    def loadReference(self, chrom):
        """Load the sequence of chromosome `chrom' from the fasta file into self.ref."""
        chromlen = 0
        startpos = 0
        self.ref = []                      # reset so the caller can detect a missing chromosome
        fai = self.fastafile + ".fai"
        with open(fai, "r") as f:
            c = csv.reader(f, delimiter='\t')
            for line in c:
                if line[0] == chrom:
                    chromlen = int(line[1])
                    startpos = int(line[2])
                    break
        if not startpos:
            return
        self.ref = [''] * chromlen
        sys.stderr.write("Loading reference for {}...\n".format(chrom))
        with open(self.fastafile, "r") as f:
            f.seek(startpos)
            idx = 0
            while True:
                line = f.readline()
                if not line or line[0] == '>':   # end of file or start of the next sequence
                    break
                for ch in line.rstrip("\n"):
                    self.ref[idx] = ch.upper()
                    idx += 1
        return self.ref
    def findUnconvertedTop(self, gread, read):
        """Return True if `read' (aligned to the top strand) has maxhits consecutive unconverted CHH sites."""
        nchh = 0
        for i in range(len(gread) - 3):
            # CHH on the top strand: C not followed by G at either of the next two positions
            if gread[i] == 'C' and gread[i+1] != 'G' and gread[i+2] != 'G':
                if read[i] == 'C':         # unconverted cytosine
                    nchh += 1
                    if nchh == self.maxhits:
                        return True
                else:
                    nchh = 0
        return False
    def findUnconvertedBot(self, gread, read):
        """Return True if `read' (aligned to the bottom strand) has maxhits consecutive unconverted CHH sites."""
        nchh = 0
        for i in range(2, len(gread)):
            try:
                # CHH on the bottom strand appears as G not preceded by C at either of the two previous positions
                if gread[i] == 'G' and gread[i-1] != 'C' and gread[i-2] != 'C':
                    if read[i] == 'G':     # unconverted cytosine on the bottom strand
                        nchh += 1
                        if nchh == self.maxhits:
                            return True
                    else:
                        nchh = 0
            except IndexError:
                sys.stderr.write("Error:\n{}\n{}\n".format(gread, read))
        return False
    def processSAM(self):
        """Read SAM records from standard input, write the ones that pass the filter to standard output."""
        current = ""
        nr = 0
        nf = 0
        c = csv.reader(sys.stdin, delimiter="\t")
        for line in c:
            if line[0][0] == '@':          # header lines are passed through unchanged
                sys.stdout.write("\t".join(line) + "\n")
                continue
            chrom = line[2]
            pos = int(line[3]) - 1         # SAM positions are 1-based
            read = line[9]
            if chrom != current:
                self.loadReference(chrom)
                if not self.ref:
                    sys.stderr.write("Error: could not find reference sequence for `{}'\n".format(chrom))
                    break
                current = chrom
            gread = self.ref[pos:pos+len(read)]
            zs = getZS(line)               # the ZS tag (added by bsmap) is required
            nr += 1
            if zs[5] == '+':               # top strand (ZS:Z:++ or ZS:Z:+-)
                filt = self.findUnconvertedTop(gread, read)
            else:                          # bottom strand (ZS:Z:-+ or ZS:Z:--)
                filt = self.findUnconvertedBot(gread, read)
            if filt:
                nf += 1
            else:
                sys.stdout.write("\t".join(line) + "\n")
                #sys.stdout.write("{} {} {}\n{} {}\n\n".format(read, read.count("C"), read.count("G"), "".join(gread), getZS(line)))
        pct = 100.0 * nf / nr if nr else 0.0
        sys.stderr.write("{} reads, {} reads filtered ({:.2f}%)\n".format(nr, nf, pct))
if __name__ == "__main__":
    args = sys.argv[1:]
    F = CHHFilter()
    F.parseArgs(args)
    if F.fastafile:
        F.processSAM()
    else:
        F.usage()
# ZS tag strand values and the corresponding bisulfite conversion expected on the read:
# ++  C->T
# +-  C->T
# -+  G->A
# --  G->A
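
# Illustration only (not used by the filter): a minimal sketch of how the CHH check behaves on a
# made-up reference slice, assuming the default maxhits = 3. The sequences below are hypothetical
# and chosen so that the toy reference contains exactly three top-strand CHH sites.
#
#   f = CHHFilter()
#   f.maxhits = 3
#   ref_slice = "CACTCATTACG"   # CHH cytosines at positions 0, 2 and 4
#   read_unc  = "CACTCATTACG"   # all three CHH cytosines unconverted -> discarded
#   read_conv = "TATTTATTACG"   # CHH cytosines read as T (converted) -> kept
#   f.findUnconvertedTop(ref_slice, read_unc)    # -> True  (read would be filtered)
#   f.findUnconvertedTop(ref_slice, read_conv)   # -> False (read would be written out)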