-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpyTSOfilter.py
211 lines (186 loc) · 6.94 KB
/
pyTSOfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import os
import pysam
import argparse
import multiprocessing as mp
import regex
#from Bio import SeqIO
#make the lookup dictionary for reverse complementing
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
def reverse_complement(seq):
bases = list(seq)
bases = reversed([complement.get(base,base) for base in bases])
bases = ''.join(bases)
return bases
def complementseq(seq):
bases = list(seq)
bases = [complement.get(base,base) for base in bases]
bases = ''.join(bases)
return bases
def idx_bam(bam, threads):
threads = str(threads)
try:
pysam.index("-@"+threads,bam)
except:
outcome = 'idxerror'
else:
outcome = 'idxsuccess'
if outcome == 'idxerror':
print("indexing failed, trying to sort bam file...")
inbam = bam
bam = bam+".sorted.bam"
pysam.sort("-@"+threads,"-o", bam, inbam)
print("indexing bam file...")
pysam.index(bam)
return bam
def makeBAMheader(args, v):
bam = pysam.AlignmentFile(args.bam, 'rb')
hdr = bam.header.to_dict()
bam.close()
cmdlinecall = 'pyTSOfilter --bam '+args.bam+' --out '+args.out+' --fa '+args.fa+' --p '+str(args.p)+' --n_mismatch '+str(args.n_mismatch)
if args.ggg:
cmdlinecall = cmdlinecall+' --ggg'
pg = {'ID': 'pyTSOfilter', 'PN': 'pyTSOfilter',
'CL': cmdlinecall, 'VN': v}
if 'PG' in hdr:
pglines = hdr['PG']
pglines.append(pg)
else:
pglines = [pg]
hdr['PG'] = pglines
return hdr
def is_fiveprime_read(read):
fivep = False
if read.has_tag('UB'):
UMIseq = read.get_tag('UB')
if UMIseq != "":
fivep = True
return fivep
def get_gene(read):
if read.has_tag("GE"):
gene = read.get_tag("GE")
else:
if read.has_tag("GI"):
gene = read.get_tag("GI")
else:
gene = "none"
return gene
def collect_bam_chunks(inpath, chrs, outpath):
allpaths = [inpath+".tmp."+c+".bam" for c in chrs[:-1]]
allpaths.append(inpath+".tmp."+"unmapped"+".bam")
cat_args = ['-o', outpath]+allpaths
pysam.cat(*cat_args)
x = [os.remove(f) for f in allpaths]
def filterTSO(inpath, threads, fastapath, chr, mm, ggg, outheader):
fa = pysam.FastaFile(fastapath)
if chr == '*':
chrlabel = 'unmapped'
else:
chrlabel = chr
#reflen = fa.get_reference_length(chr)
#open in/out files
outpath = inpath+".tmp."+chrlabel+".bam"
inp = pysam.AlignmentFile(inpath, 'rb', threads = threads)
out = pysam.AlignmentFile(outpath, 'wb', header = outheader, threads = threads)
for read in inp.fetch(chr):
isfivep = is_fiveprime_read(read)
gene = get_gene(read)
if chr in fa.references and isfivep and gene != "none":
if read.is_reverse:
strand = "-"
rloffset = read.query_length
else:
strand = "+"
if read.is_paired:
readtype = "PE"
if read.is_read1:
startpos = read.reference_start
else:
if read.mate_is_unmapped:
continue
startpos = read.next_reference_start
if read.mate_is_reverse:
strand = "-"
rloffset = read.query_length-22
else:
strand = "+"
else:
startpos = read.reference_start
readtype = "SE"
UMIseq = read.get_tag('UB')
if ggg:
UMIseq = UMIseq+"GGG"
if strand == "-":
#if(startpos+rloffset+21 > reflen):
# upstream_seq = fa.fetch(chr, startpos+rloffset, reflen)
#else:
# upstream_seq = fa.fetch(chr, startpos+rloffset, startpos+rloffset+21)
#if the fetch is longer than reflen, it just returns empty string but no error
upstream_seq = fa.fetch(chr, startpos+rloffset, startpos+rloffset+21)
upstream_seq = reverse_complement(upstream_seq)
else:
if startpos-20 < 0:
startpos = startpos + 20-startpos
upstream_seq = fa.fetch(chr, startpos-20, startpos+1)
pattern = r"(" + UMIseq+ "){s<="+ str(mm) + "}"
m = regex.search(pattern, upstream_seq)
if m is not None:
continue
out.write(read)
inp.close()
out.close()
def main():
parser = argparse.ArgumentParser(add_help=True)
parser.add_argument('--bam', type=str, metavar='FILENAME',
help='Path to input BAM file', required = True)
parser.add_argument('--out', type=str, metavar='FILENAME',
help='Path to output bam file', required = True)
parser.add_argument('--fa', type=str, metavar='FILENAME',
help='Path to genome reference fasta', required = True)
parser.add_argument('--p', type=int, default = 10,
help='Number of processes to use')
parser.add_argument('--n_mismatch', type=int, default = 1,
help='Number of mismatches allowed when matching TSO')
parser.add_argument('--ggg', action = 'store_true',
help='Require template-switching Gs as part of sequence match')
args = parser.parse_args()
v = '0.1.2'
print("pyTSOfilter.py v"+v)
print("WARNING: This script only works correctly when using stranded gene assignment via zUMIs.")
bampath = args.bam
print("Loading reference sequence...")
#reffa = '/home/chris/projects/pyTSOfilter/hg38.primary_assembly.fa'
reffa = args.fa
#fa_dict = SeqIO.to_dict(SeqIO.parse(reffa, "fasta"))
#check if fasta file is indexed
try:
fa = pysam.FastaFile(reffa)
except ValueError:
print("Error: Reference fasta file is not indexed!")
print("Please run: samtools faidx "+args.fa)
quit()
fa.close()
if not os.path.exists(bampath+'.bai'):
print("input bam index not found, indexing...")
bampath = idx_bam(bampath,args.p)
chrs = pysam.idxstats(bampath).split('\n')
chrs = [c.split('\t')[0] for c in chrs[:-1]]
#chrs.append('*')
if args.p > 20:
pysam_workers = 2
n_jobs = int(args.p/2)
else:
pysam_workers = 1
n_jobs = args.p
#Construct the new bam header to work with
bamheader = makeBAMheader(args, v)
print("Working...")
pool = mp.Pool(n_jobs)
results = [pool.apply_async(filterTSO, (args.bam,pysam_workers,args.fa,chr,args.n_mismatch,args.ggg,bamheader, )) for chr in chrs]
x = [r.get() for r in results]
print("Creating final output .bam file...")
collect_bam_chunks(inpath = bampath, chrs = chrs, outpath = args.out)
print("Indexing final output .bam file...")
y = idx_bam(args.out,args.p)
print("Done!")
if __name__ == "__main__":
main()