Read twice solution for finding read pairs

CGATOxford · May 28, 2016 · bbbe020 · bbbe020
1 parent 2b6a0b8
commit bbbe020
Showing 1 changed file with 54 additions and 7 deletions.
diff --git a/umi_tools/dedup.py b/umi_tools/dedup.py
@@ -233,6 +233,53 @@ def remove_umis(adj_list, cluster, nodes):
 
     return cluster - nodes_to_remove
 
+class TwoPassPairWriter:
+
+    def __init__(self, infile, outfile):
+        self.infile = infile
+        self.outfile = outfile
+        self.read1s = set()
+        self.chrom = None
+
+    def write(self, read):
+
+        if not self.chrom == read.reference_id:
+            self.write_mates()
+            self.chrom = read.reference_id
+
+        key = read.query_name, read.next_reference_id, read.next_reference_start
+        self.read1s.add(key)
+        self.outfile.write(read)
+
+    def write_mates(self):
+
+        U.debug("Dumping %s mates" % self.chrom)
+        for read in self.infile.fetch(tid=self.chrom, multiple_iterators=True):
+            if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)):
+                continue
+
+            key = read.query_name, read.reference_id, read.reference_start
+            if key in self.read1s:
+                self.outfile.write(read)
+                self.read1s.remove(key)
+        U.debug("%i mates remaining" % len(self.read1s))
+
+    def close(self):
+
+        self.write_mates()
+        U.debug("Searching for mates for %i unmatched alignments" % len(self.read1s))
+
+        found = 0
+        for name, chrom, pos in self.read1s:
+            for read in self.outfile.fetch(start=pos, end=pos+1, tid=chrom):
+                if (read.query_name, read.pos) == (name, pos):
+                    self.outfile.write(read)
+                    found += 1
+                    break
+
+        U.debug("%i mates never found" % (len(self.read1s) - found))
+        self.outfile.close()
+
 
 class ClusterAndReducer:
     '''A functor that clusters a bundle of reads,
@@ -842,6 +889,9 @@ def main(argv=None):
     outfile = pysam.Samfile(out_name, out_mode,
                             template=infile)
 
+    if options.paired:
+        outfile = TwoPassPairWriter(infile, outfile)
+
     nInput, nOutput = 0, 0
 
     if options.detection_method:
@@ -926,13 +976,7 @@ def main(argv=None):
             for read in reads:
                 outfile.write(read)
                 nOutput += 1
-                if options.paired:
-                    # TS - write out paired end mate
-                    try:
-                        outfile.write(infile.mate(read))
-                    except:
-                        raise ValueError("Mate not found for read: %s" %
-                                         bundle[umi]["read"].query_name)
+
             if options.stats:
 
                 # collect pre-dudupe stats
@@ -959,6 +1003,8 @@ def main(argv=None):
                     for c_type, count in nodes.most_common():
                         node_counts[c_type] += count
 
+    outfile.close()
+
     if options.stats:
 
         stats_pre_df = pd.DataFrame(stats_pre_df_dict)
@@ -1046,6 +1092,7 @@ def tallyCounts(binned_cluster, max_edit_distance):
                                x, y in node_counts.most_common()]) + "\n")
 
     # write footer and output benchmark information.
+
     U.info("Number of reads in: %i, Number of reads out: %i" %
            (nInput, nOutput))
     U.Stop()