PolyA filter became more sensitive.

stela2502 · Jan 19, 2024 · 3139ce9 · 3139ce9
1 parent a20e8bc
commit 3139ce9
Show file tree

Hide file tree

Showing 13 changed files with 126 additions and 57 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rustody"
-version = "1.2.2"
+version = "1.2.4"
 edition = "2021"
 license-file = "LICENSE"
 description = "Split BD rapsody fastq files into sample specific fastq files"

diff --git a/News.md b/News.md
@@ -1,3 +1,7 @@
+# 1.2.4
+
+PolyA containing R2 reads are now filtered out if a PolyA streatch of at least 15 A's is detected in the last 30 bp of a R2 read. 
+
 # 1.2.3
 
 Analysis of 10x data should now also be possible.

diff --git a/README.md b/README.md
@@ -580,3 +580,12 @@ zcat testData/output_one_cell/BD_Rhapsody_expression/features.tsv.gz | grep Cd3e
 This problem has been fixed. Cd3e is no longer "expressed" in this cell.
 
 And this problem re-surfaced later on after I spent so much time making the mapping more sensitive :-D - good I have a negative control here!
+
+
+## Zbtb16 expression
+
+Comparing my results to the results from the DB pipeline showed Zbtb16 as the gene with the lowest correlation between the tools.
+To debug this I selected two cells from the whole data which one had detected expression in both tools and one was only detected in Rustody.
+I had hoped my mapping was more sensitive, but it turned out it was these stragne PolA containing R2 reads (again).
+
+I have updated the PolyA detection in the newest version and also updated the report to be more phony of why a read was filtered out.
diff --git a/src/analysis.rs b/src/analysis.rs
@@ -38,6 +38,13 @@ use crate::ofiles::Ofiles;
 
 static EMPTY_VEC: Vec<String> = Vec::new();
 
+#[derive(Debug)]
+enum FilterError {
+    Length,
+    PolyA,
+    Ns,
+    Quality,
+}
 
 fn mean_u8( data:&[u8] ) -> f32 {
     let this = b"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
@@ -265,53 +272,60 @@ impl Analysis{
 	}
 
 
-    fn quality_control(   read1:&SequenceRecord, read2:&SequenceRecord, min_quality:f32, pos: &[usize;8], min_sizes: &[usize;2]  ) -> bool {
+    fn quality_control(   read1:&SequenceRecord, read2:&SequenceRecord, min_quality:f32, pos: &[usize;8], min_sizes: &[usize;2]  ) -> Result<(), FilterError> {
 
         // the quality measurements I get here do not make sense. I assume I would need to add lots of more work here.
         //println!("The read1 mean quality == {}", mean_u8( seqrec.qual().unwrap() ));
 
         if mean_u8( read1.qual().unwrap() ) < min_quality {
             //println!("filtered R1 by quality" );
-            return false
+            return Err(FilterError::Quality);
         }
         if mean_u8( read2.qual().unwrap() ) < min_quality {
             //println!("filtered R2 by quality" );
-            return false
+            return Err(FilterError::Quality);
         }
 
         // totally unusable sequence
         // as described in the BD rhapsody Bioinformatic Handbook
         // GMX_BD-Rhapsody-genomics-informatics_UG_EN.pdf (google should find this)
         if read1.seq().len() < min_sizes[0] {
         	//println!("R1 too short (< {}bp)\n", min_sizes[0] );
-            return false
+            return Err(FilterError::Length);
         }
         if read2.seq().len() < min_sizes[1] {
         	//println!("R2 too short (< {}bp)\n",  min_sizes[1]);
-            return false
+            return Err(FilterError::Length);
         }
         //let seq = seqrec.seq().into_owned();
         for nuc in &read2.seq()[pos[6]..pos[7]] {  
             if *nuc ==b'N'{
             	//println!("N nucs\n");
-                return false
+                return Err(FilterError::Ns);
             }
         }
         // I have huge problems with reds that contain a lot of ploy A in the end
         // None of them match using blast - so I think we can savely just dump them.
         let mut bad = 0;
 
-        for nuc in &read2.seq()[ (read2.seq().len()-30).. ] {  
+        let mut last_a = false;
+
+        for nuc in &read2.seq()[ (read2.seq().len()-30).. ] {
         	if *nuc ==b'A'{
-        		bad += 1;
+        		if last_a{
+        			bad += 1;
+        		}
+        		last_a = true;
+        	}else {
+        		last_a = false;
         	}
         }
-        if bad >20 {
+        if bad >15 {
         	//eprintln!("Sequence rejected due to high polyA content \n{:?}", String::from_utf8_lossy(&read2.seq()) );
-        	return false
+        	return Err(FilterError::PolyA)
         }
 
-        true
+        Ok(())
     }
 
     fn analyze_paralel( &self, data:&[(Vec<u8>, Vec<u8>)], report:&mut MappingInfo, pos: &[usize;8] ) -> SingleCellData{
@@ -515,12 +529,29 @@ impl Analysis{
 	                    continue 'main;
 	                }
 	            };
-        		if Self::quality_control( &read1, &read2, report.min_quality, pos, min_sizes){
-        			good_read_count +=1;
-        			good_reads.push( (read1.seq().to_vec(), read2.seq().to_vec() ) );
-        		}else {
-        			report.unknown +=1;
-        		}
+
+        		match Self::quality_control( &read1, &read2, report.min_quality, pos, min_sizes){
+        			Ok(()) => {
+        				good_read_count +=1;
+        				good_reads.push( (read1.seq().to_vec(), read2.seq().to_vec() ) );
+        			},
+        			Err(FilterError::PolyA)=> {
+        				report.poly_a +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Quality) => {
+        				report.quality +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Length) => {
+        				report.length +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Ns) => {
+        				report.n_s +=1;
+        				continue 'main;
+        			}
+        		};
     		}
     		else {
     			report.stop_file_io_time();
@@ -646,7 +677,7 @@ impl Analysis{
         	if let Some(record1) = readereads.next() {
         		report.total += 1;
 
-        		let seqrec = match record2{
+        		let seqrec2 = match record2{
         			Ok( res ) => res,
         			Err(err) => {
         				eprintln!("could not read from R2:\n{err}");
@@ -668,10 +699,27 @@ impl Analysis{
 
                 // the quality measurements I get here do not make sense. I assume I would need to add lots of more work here.
                 //println!("The read1 mean quality == {}", mean_u8( seqrec.qual().unwrap() ));
-                if ! Self::quality_control( &seqrec1, &seqrec, report.min_quality, pos, min_sizes){
-        			report.unknown +=1;
-        			continue 'main;
-        		}
+                match Self::quality_control( &seqrec1, &seqrec2, report.min_quality, pos, min_sizes){
+        			Ok(()) => {
+        				//good_read_count +=1;
+        			},
+        			Err(FilterError::PolyA)=> {
+        				report.poly_a +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Quality) => {
+        				report.quality +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Length) => {
+        				report.length +=1;
+        				continue 'main;
+        			},
+        			Err(FilterError::Ns) => {
+        				report.n_s +=1;
+        				continue 'main;
+        			}
+        		};
 
                 let mut ok: bool;
                 // first match the cell id - if that does not work the read is unusable
@@ -686,7 +734,7 @@ impl Analysis{
 		            	// or a mRNA match
 		            	// And of casue not a match at all
 
-		            	ok = match &self.antibodies.get_strict( &seqrec.seq(),  &mut tool ){
+		            	ok = match &self.antibodies.get_strict( &seqrec2.seq(),  &mut tool ){
 		            		Some(gene_id) =>{
 		            			report.iter_read_type( "antibody reads" );
 		                    	if gene_id.len() == 1 {
@@ -709,7 +757,7 @@ impl Analysis{
 		                };
 
 		                if ! ok{
-		                	ok = match &self.samples.get_strict( &seqrec.seq(),  &mut tool ){
+		                	ok = match &self.samples.get_strict( &seqrec2.seq(),  &mut tool ){
 		                		Some(gene_id) =>{
 		                			report.iter_read_type( "sample reads" );
 			                    	//println!("sample ({gene_id:?}) with {:?}",String::from_utf8_lossy(&seqrec.seq()) );
@@ -738,7 +786,7 @@ impl Analysis{
 			            if ! ok{
 			            	// multimapper are only allowed for expression reads - 
 			            	// if either sample ids or AB reads would have this it would be a library design fault!
-			            	match &self.genes.get( &seqrec.seq(),  &mut tool ){
+			            	match &self.genes.get( &seqrec2.seq(),  &mut tool ){
 			            		Some(gene_id) =>{
 			            			report.iter_read_type( "expression reads" );
 			            			if gene_id.len() == 1 {

diff --git a/src/bin/quantify_rhapsody.rs b/src/bin/quantify_rhapsody.rs
@@ -23,7 +23,7 @@ use rustody::ofiles::Ofiles;
 /// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
 
 #[derive(Parser)]
-#[clap(version = "1.2.3", author = "Stefan L. <stefan.lang@med.lu.se>")]
+#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
 struct Opts {
     /// the input R1 reads file
     #[clap(short, long)]

diff --git a/src/bin/quantify_rhapsody_multi.rs b/src/bin/quantify_rhapsody_multi.rs
@@ -23,7 +23,7 @@ use num_cpus;
 /// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
 
 #[derive(Parser)]
-#[clap(version = "1.2.3", author = "Stefan L. <stefan.lang@med.lu.se>")]
+#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
 struct Opts {
     /// the input R1 reads file
     #[clap(short, long)]

diff --git a/src/fast_mapper/fast_mapper.rs b/src/fast_mapper/fast_mapper.rs
@@ -756,17 +756,17 @@ impl FastMapper{
             return None
         }
 
-        /*let bad_gene = "Zbtb16";
+        let bad_gene = "Zbtb16";
         let bad = self.get_id( bad_gene.to_string()) ;
-        */
+
         // check if there is only one gene //
         if self.get_best_gene( &genes, &mut matching_geneids ){
             // Cd3e <- keep that to fiond this place back
             //println!("I have these genes: {genes:?} And am returning: {:?}",  matching_geneids);
-            /*if matching_geneids[0] == bad {
-                eprintln!("read mapping to {} - should not happen here!: {:?}\n{:?}", bad_gene, self.gene_names_for_ids( &matching_geneids ),String::from_utf8_lossy(seq) );
-                eprintln!("This is our total matching set: {:?}", genes);
-            }*/
+            if matching_geneids[0] == bad {
+                println!("read mapping to {} - should not happen here!: {:?}\n{:?}", bad_gene, self.gene_names_for_ids( &matching_geneids ),String::from_utf8_lossy(seq) );
+                println!("This is our total matching set: {:?}", genes);
+            }
             return Some( matching_geneids )
         }
         if matching_geneids.len() > 2{

diff --git a/src/mapping_info.rs b/src/mapping_info.rs
@@ -6,10 +6,15 @@ use std::collections::BTreeMap;
 use std::io::Write;
 use std::time::{Duration, SystemTime};
 
+
+
 /// MappingInfo captures all mapping data and is a way to easily copy this data over multiple analysis runs.
 pub struct MappingInfo{
 	/// reads that did not pass the filters
-	pub unknown:usize,
+	pub quality:usize,
+	pub length:usize,
+	pub n_s:usize,
+	pub poly_a:usize,
 	/// reads that had no cell id
     pub no_sample:usize,
     /// reads that have no match in the geneIds object
@@ -41,37 +46,30 @@ pub struct MappingInfo{
 
 impl MappingInfo{
 	pub fn new(log_writer:File, min_quality:f32, max_reads:usize, ofile:Ofiles, ) -> Self{
-		let unknown = 0;
-		let no_sample = 0;
-		let no_data = 0;
-		let ok_reads = 0;
-		let cellular_reads = 0;
-		let pcr_duplicates = 0;
-		let split = 1_000_000;
-		let log_iter = 0;
-		let local_dup = 0;
-		let total = 0;
 		let absolute_start = SystemTime::now();
 		let realtive_start = None;
 		let single_processor_time = Duration::new(0,0);
 		let multi_processor_time = Duration::new(0,0);
 		let file_io_time = Duration::new(0,0);
 		let reads_log = BTreeMap::new();
 		let mut this = Self{
-			unknown,
-			no_sample,
-			no_data,
-			ok_reads,
-			cellular_reads,
-			pcr_duplicates,
-			split,
-			log_iter,
+			quality: 0,
+		    length: 0,
+		    n_s: 0,
+		    poly_a: 0,
+			no_sample: 0,
+			no_data: 0,
+			ok_reads: 0,
+			cellular_reads: 0,
+			pcr_duplicates: 0,
+			split: 1_000_000,
+			log_iter: 0,
 			log_writer,
 			min_quality,
 			max_reads,
 			ofile,
-			local_dup,
-			total,
+			local_dup: 0,
+			total: 0,
 			absolute_start,
 			realtive_start,
 			single_processor_time,
@@ -195,11 +193,17 @@ impl MappingInfo{
 
 		let pcr_duplicates = self.cellular_reads - reads_genes - reads_ab - reads_samples;
 
+		let unknown = self.quality + self.length + self.n_s + self.poly_a;
 	    let mut result = "\nSummary:\n".to_owned()
 	    	+format!(     "cellular   reads  : {} reads ({:.2}% of total)\n", self.cellular_reads, (self.cellular_reads as f32 / self.total as f32) * 100.0 ).as_str()
 	    	+format!(     "no cell ID reads  : {} reads ({:.2}% of total)\n", self.no_sample.saturating_sub(self.no_data), (self.no_sample.saturating_sub(self.no_data) as f32 / self.total as f32) * 100.0).as_str()
 	    	+format!(     "no gene ID reads  : {} reads ({:.2}% of total)\n", self.no_data, (self.no_data as f32 / self.total as f32) * 100.0).as_str()
-	    	+format!(     "N's or too short  : {} reads ({:.2}% of total)\n", self.unknown, (self.unknown as f32 / self.total as f32) * 100.0).as_str()
+	    	+format!(     "filtered   reads  : {} reads ({:.2}% of total)\n", unknown, (unknown as f32 / self.total as f32) * 100.0).as_str()
+	    	+format!(     " ->        polyA  : {} reads ({:.2}% of total)\n", self.poly_a, ( self.poly_a as f32 / self.total as f32) * 100.0).as_str()
+	    	+format!(     " -> bad qualiity  : {} reads ({:.2}% of total)\n", self.quality, ( self.quality as f32 / self.total as f32) * 100.0).as_str()
+	    	+format!(     " ->    too short  : {} reads ({:.2}% of total)\n", self.length, ( self.length as f32 / self.total as f32) * 100.0).as_str()
+	    	+format!(     " ->          N's  : {} reads ({:.2}% of total)\n", self.n_s, ( self.n_s as f32 / self.total as f32) * 100.0).as_str()
+	    	+"\n"
 	    	+format!(     "total      reads  : {} reads\n", self.total ).as_str()
 	    	+"\ncollected read counts:\n"
 	    	+self.read_types_to_string(vec!["expression reads", "antibody reads", "sample reads"]).as_str()

diff --git a/src/singlecelldata/singlecelldata.rs b/src/singlecelldata/singlecelldata.rs
@@ -381,7 +381,11 @@ impl SingleCellData{
         }
 
         if genes.max_id  ==0 && ! names.is_empty() {
-            eprintln!( "None of the genes have data:\n{} ...", names[0..10].join( ", " ) );
+            let mut to = 10;
+            if names.len() < 10{
+                to = names.len() -1;
+            }
+            eprintln!( "None of the genes have data:\n{} ...", names[0..to].join( ", " ) );
         }
         //else { println!("{} genes requested and {} with data found", names.len(), genes.max_id); }
         if names.len() != genes.max_id{

diff --git a/testData/OneSingleCell.13151654.R1.fastq.gz b/testData/OneSingleCell.13151654.R1.fastq.gz
diff --git a/testData/OneSingleCell.13151654.R2.fastq.gz b/testData/OneSingleCell.13151654.R2.fastq.gz
diff --git a/testData/OneSingleCell.1360517.R1.fastq.gz b/testData/OneSingleCell.1360517.R1.fastq.gz
diff --git a/testData/OneSingleCell.1360517.R2.fastq.gz b/testData/OneSingleCell.1360517.R2.fastq.gz