Skip to content

Commit

Permalink
Read twice solution for finding read pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
IanSudbery committed May 28, 2016
1 parent 2b6a0b8 commit bbbe020
Showing 1 changed file with 54 additions and 7 deletions.
61 changes: 54 additions & 7 deletions umi_tools/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,53 @@ def remove_umis(adj_list, cluster, nodes):

return cluster - nodes_to_remove

class TwoPassPairWriter:

def __init__(self, infile, outfile):
self.infile = infile
self.outfile = outfile
self.read1s = set()
self.chrom = None

def write(self, read):

if not self.chrom == read.reference_id:
self.write_mates()
self.chrom = read.reference_id

key = read.query_name, read.next_reference_id, read.next_reference_start
self.read1s.add(key)
self.outfile.write(read)

def write_mates(self):

U.debug("Dumping %s mates" % self.chrom)
for read in self.infile.fetch(tid=self.chrom, multiple_iterators=True):
if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)):
continue

key = read.query_name, read.reference_id, read.reference_start
if key in self.read1s:
self.outfile.write(read)
self.read1s.remove(key)
U.debug("%i mates remaining" % len(self.read1s))

def close(self):

self.write_mates()
U.debug("Searching for mates for %i unmatched alignments" % len(self.read1s))

found = 0
for name, chrom, pos in self.read1s:
for read in self.outfile.fetch(start=pos, end=pos+1, tid=chrom):
if (read.query_name, read.pos) == (name, pos):
self.outfile.write(read)
found += 1
break

U.debug("%i mates never found" % (len(self.read1s) - found))
self.outfile.close()


class ClusterAndReducer:
'''A functor that clusters a bundle of reads,
Expand Down Expand Up @@ -842,6 +889,9 @@ def main(argv=None):
outfile = pysam.Samfile(out_name, out_mode,
template=infile)

if options.paired:
outfile = TwoPassPairWriter(infile, outfile)

nInput, nOutput = 0, 0

if options.detection_method:
Expand Down Expand Up @@ -926,13 +976,7 @@ def main(argv=None):
for read in reads:
outfile.write(read)
nOutput += 1
if options.paired:
# TS - write out paired end mate
try:
outfile.write(infile.mate(read))
except:
raise ValueError("Mate not found for read: %s" %
bundle[umi]["read"].query_name)

if options.stats:

# collect pre-dudupe stats
Expand All @@ -959,6 +1003,8 @@ def main(argv=None):
for c_type, count in nodes.most_common():
node_counts[c_type] += count

outfile.close()

if options.stats:

stats_pre_df = pd.DataFrame(stats_pre_df_dict)
Expand Down Expand Up @@ -1046,6 +1092,7 @@ def tallyCounts(binned_cluster, max_edit_distance):
x, y in node_counts.most_common()]) + "\n")

# write footer and output benchmark information.

U.info("Number of reads in: %i, Number of reads out: %i" %
(nInput, nOutput))
U.Stop()
Expand Down

0 comments on commit bbbe020

Please sign in to comment.