Skip to content

Commit

Permalink
PolyA filter became more sensitive.
Browse files Browse the repository at this point in the history
  • Loading branch information
stela2502 committed Jan 19, 2024
1 parent a20e8bc commit 3139ce9
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rustody"
version = "1.2.2"
version = "1.2.4"
edition = "2021"
license-file = "LICENSE"
description = "Split BD rapsody fastq files into sample specific fastq files"
Expand Down
4 changes: 4 additions & 0 deletions News.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 1.2.4

PolyA containing R2 reads are now filtered out if a PolyA streatch of at least 15 A's is detected in the last 30 bp of a R2 read.

# 1.2.3

Analysis of 10x data should now also be possible.
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,12 @@ zcat testData/output_one_cell/BD_Rhapsody_expression/features.tsv.gz | grep Cd3e
This problem has been fixed. Cd3e is no longer "expressed" in this cell.

And this problem re-surfaced later on after I spent so much time making the mapping more sensitive :-D - good I have a negative control here!


## Zbtb16 expression

Comparing my results to the results from the DB pipeline showed Zbtb16 as the gene with the lowest correlation between the tools.
To debug this I selected two cells from the whole data which one had detected expression in both tools and one was only detected in Rustody.
I had hoped my mapping was more sensitive, but it turned out it was these stragne PolA containing R2 reads (again).

I have updated the PolyA detection in the newest version and also updated the report to be more phony of why a read was filtered out.
98 changes: 73 additions & 25 deletions src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ use crate::ofiles::Ofiles;

static EMPTY_VEC: Vec<String> = Vec::new();

#[derive(Debug)]
enum FilterError {
Length,
PolyA,
Ns,
Quality,
}

fn mean_u8( data:&[u8] ) -> f32 {
let this = b"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
Expand Down Expand Up @@ -265,53 +272,60 @@ impl Analysis{
}


fn quality_control( read1:&SequenceRecord, read2:&SequenceRecord, min_quality:f32, pos: &[usize;8], min_sizes: &[usize;2] ) -> bool {
fn quality_control( read1:&SequenceRecord, read2:&SequenceRecord, min_quality:f32, pos: &[usize;8], min_sizes: &[usize;2] ) -> Result<(), FilterError> {

// the quality measurements I get here do not make sense. I assume I would need to add lots of more work here.
//println!("The read1 mean quality == {}", mean_u8( seqrec.qual().unwrap() ));

if mean_u8( read1.qual().unwrap() ) < min_quality {
//println!("filtered R1 by quality" );
return false
return Err(FilterError::Quality);
}
if mean_u8( read2.qual().unwrap() ) < min_quality {
//println!("filtered R2 by quality" );
return false
return Err(FilterError::Quality);
}

// totally unusable sequence
// as described in the BD rhapsody Bioinformatic Handbook
// GMX_BD-Rhapsody-genomics-informatics_UG_EN.pdf (google should find this)
if read1.seq().len() < min_sizes[0] {
//println!("R1 too short (< {}bp)\n", min_sizes[0] );
return false
return Err(FilterError::Length);
}
if read2.seq().len() < min_sizes[1] {
//println!("R2 too short (< {}bp)\n", min_sizes[1]);
return false
return Err(FilterError::Length);
}
//let seq = seqrec.seq().into_owned();
for nuc in &read2.seq()[pos[6]..pos[7]] {
if *nuc ==b'N'{
//println!("N nucs\n");
return false
return Err(FilterError::Ns);
}
}
// I have huge problems with reds that contain a lot of ploy A in the end
// None of them match using blast - so I think we can savely just dump them.
let mut bad = 0;

for nuc in &read2.seq()[ (read2.seq().len()-30).. ] {
let mut last_a = false;

for nuc in &read2.seq()[ (read2.seq().len()-30).. ] {
if *nuc ==b'A'{
bad += 1;
if last_a{
bad += 1;
}
last_a = true;
}else {
last_a = false;
}
}
if bad >20 {
if bad >15 {
//eprintln!("Sequence rejected due to high polyA content \n{:?}", String::from_utf8_lossy(&read2.seq()) );
return false
return Err(FilterError::PolyA)
}

true
Ok(())
}

fn analyze_paralel( &self, data:&[(Vec<u8>, Vec<u8>)], report:&mut MappingInfo, pos: &[usize;8] ) -> SingleCellData{
Expand Down Expand Up @@ -515,12 +529,29 @@ impl Analysis{
continue 'main;
}
};
if Self::quality_control( &read1, &read2, report.min_quality, pos, min_sizes){
good_read_count +=1;
good_reads.push( (read1.seq().to_vec(), read2.seq().to_vec() ) );
}else {
report.unknown +=1;
}

match Self::quality_control( &read1, &read2, report.min_quality, pos, min_sizes){
Ok(()) => {
good_read_count +=1;
good_reads.push( (read1.seq().to_vec(), read2.seq().to_vec() ) );
},
Err(FilterError::PolyA)=> {
report.poly_a +=1;
continue 'main;
},
Err(FilterError::Quality) => {
report.quality +=1;
continue 'main;
},
Err(FilterError::Length) => {
report.length +=1;
continue 'main;
},
Err(FilterError::Ns) => {
report.n_s +=1;
continue 'main;
}
};
}
else {
report.stop_file_io_time();
Expand Down Expand Up @@ -646,7 +677,7 @@ impl Analysis{
if let Some(record1) = readereads.next() {
report.total += 1;

let seqrec = match record2{
let seqrec2 = match record2{
Ok( res ) => res,
Err(err) => {
eprintln!("could not read from R2:\n{err}");
Expand All @@ -668,10 +699,27 @@ impl Analysis{

// the quality measurements I get here do not make sense. I assume I would need to add lots of more work here.
//println!("The read1 mean quality == {}", mean_u8( seqrec.qual().unwrap() ));
if ! Self::quality_control( &seqrec1, &seqrec, report.min_quality, pos, min_sizes){
report.unknown +=1;
continue 'main;
}
match Self::quality_control( &seqrec1, &seqrec2, report.min_quality, pos, min_sizes){
Ok(()) => {
//good_read_count +=1;
},
Err(FilterError::PolyA)=> {
report.poly_a +=1;
continue 'main;
},
Err(FilterError::Quality) => {
report.quality +=1;
continue 'main;
},
Err(FilterError::Length) => {
report.length +=1;
continue 'main;
},
Err(FilterError::Ns) => {
report.n_s +=1;
continue 'main;
}
};

let mut ok: bool;
// first match the cell id - if that does not work the read is unusable
Expand All @@ -686,7 +734,7 @@ impl Analysis{
// or a mRNA match
// And of casue not a match at all

ok = match &self.antibodies.get_strict( &seqrec.seq(), &mut tool ){
ok = match &self.antibodies.get_strict( &seqrec2.seq(), &mut tool ){
Some(gene_id) =>{
report.iter_read_type( "antibody reads" );
if gene_id.len() == 1 {
Expand All @@ -709,7 +757,7 @@ impl Analysis{
};

if ! ok{
ok = match &self.samples.get_strict( &seqrec.seq(), &mut tool ){
ok = match &self.samples.get_strict( &seqrec2.seq(), &mut tool ){
Some(gene_id) =>{
report.iter_read_type( "sample reads" );
//println!("sample ({gene_id:?}) with {:?}",String::from_utf8_lossy(&seqrec.seq()) );
Expand Down Expand Up @@ -738,7 +786,7 @@ impl Analysis{
if ! ok{
// multimapper are only allowed for expression reads -
// if either sample ids or AB reads would have this it would be a library design fault!
match &self.genes.get( &seqrec.seq(), &mut tool ){
match &self.genes.get( &seqrec2.seq(), &mut tool ){
Some(gene_id) =>{
report.iter_read_type( "expression reads" );
if gene_id.len() == 1 {
Expand Down
2 changes: 1 addition & 1 deletion src/bin/quantify_rhapsody.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use rustody::ofiles::Ofiles;
/// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
#[derive(Parser)]
#[clap(version = "1.2.3", author = "Stefan L. <stefan.lang@med.lu.se>")]
#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
struct Opts {
/// the input R1 reads file
#[clap(short, long)]
Expand Down
2 changes: 1 addition & 1 deletion src/bin/quantify_rhapsody_multi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use num_cpus;
/// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
#[derive(Parser)]
#[clap(version = "1.2.3", author = "Stefan L. <stefan.lang@med.lu.se>")]
#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
struct Opts {
/// the input R1 reads file
#[clap(short, long)]
Expand Down
12 changes: 6 additions & 6 deletions src/fast_mapper/fast_mapper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -756,17 +756,17 @@ impl FastMapper{
return None
}

/*let bad_gene = "Zbtb16";
let bad_gene = "Zbtb16";
let bad = self.get_id( bad_gene.to_string()) ;
*/

// check if there is only one gene //
if self.get_best_gene( &genes, &mut matching_geneids ){
// Cd3e <- keep that to fiond this place back
//println!("I have these genes: {genes:?} And am returning: {:?}", matching_geneids);
/*if matching_geneids[0] == bad {
eprintln!("read mapping to {} - should not happen here!: {:?}\n{:?}", bad_gene, self.gene_names_for_ids( &matching_geneids ),String::from_utf8_lossy(seq) );
eprintln!("This is our total matching set: {:?}", genes);
}*/
if matching_geneids[0] == bad {
println!("read mapping to {} - should not happen here!: {:?}\n{:?}", bad_gene, self.gene_names_for_ids( &matching_geneids ),String::from_utf8_lossy(seq) );
println!("This is our total matching set: {:?}", genes);
}
return Some( matching_geneids )
}
if matching_geneids.len() > 2{
Expand Down
48 changes: 26 additions & 22 deletions src/mapping_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@ use std::collections::BTreeMap;
use std::io::Write;
use std::time::{Duration, SystemTime};



/// MappingInfo captures all mapping data and is a way to easily copy this data over multiple analysis runs.
pub struct MappingInfo{
/// reads that did not pass the filters
pub unknown:usize,
pub quality:usize,
pub length:usize,
pub n_s:usize,
pub poly_a:usize,
/// reads that had no cell id
pub no_sample:usize,
/// reads that have no match in the geneIds object
Expand Down Expand Up @@ -41,37 +46,30 @@ pub struct MappingInfo{

impl MappingInfo{
pub fn new(log_writer:File, min_quality:f32, max_reads:usize, ofile:Ofiles, ) -> Self{
let unknown = 0;
let no_sample = 0;
let no_data = 0;
let ok_reads = 0;
let cellular_reads = 0;
let pcr_duplicates = 0;
let split = 1_000_000;
let log_iter = 0;
let local_dup = 0;
let total = 0;
let absolute_start = SystemTime::now();
let realtive_start = None;
let single_processor_time = Duration::new(0,0);
let multi_processor_time = Duration::new(0,0);
let file_io_time = Duration::new(0,0);
let reads_log = BTreeMap::new();
let mut this = Self{
unknown,
no_sample,
no_data,
ok_reads,
cellular_reads,
pcr_duplicates,
split,
log_iter,
quality: 0,
length: 0,
n_s: 0,
poly_a: 0,
no_sample: 0,
no_data: 0,
ok_reads: 0,
cellular_reads: 0,
pcr_duplicates: 0,
split: 1_000_000,
log_iter: 0,
log_writer,
min_quality,
max_reads,
ofile,
local_dup,
total,
local_dup: 0,
total: 0,
absolute_start,
realtive_start,
single_processor_time,
Expand Down Expand Up @@ -195,11 +193,17 @@ impl MappingInfo{

let pcr_duplicates = self.cellular_reads - reads_genes - reads_ab - reads_samples;

let unknown = self.quality + self.length + self.n_s + self.poly_a;
let mut result = "\nSummary:\n".to_owned()
+format!( "cellular reads : {} reads ({:.2}% of total)\n", self.cellular_reads, (self.cellular_reads as f32 / self.total as f32) * 100.0 ).as_str()
+format!( "no cell ID reads : {} reads ({:.2}% of total)\n", self.no_sample.saturating_sub(self.no_data), (self.no_sample.saturating_sub(self.no_data) as f32 / self.total as f32) * 100.0).as_str()
+format!( "no gene ID reads : {} reads ({:.2}% of total)\n", self.no_data, (self.no_data as f32 / self.total as f32) * 100.0).as_str()
+format!( "N's or too short : {} reads ({:.2}% of total)\n", self.unknown, (self.unknown as f32 / self.total as f32) * 100.0).as_str()
+format!( "filtered reads : {} reads ({:.2}% of total)\n", unknown, (unknown as f32 / self.total as f32) * 100.0).as_str()
+format!( " -> polyA : {} reads ({:.2}% of total)\n", self.poly_a, ( self.poly_a as f32 / self.total as f32) * 100.0).as_str()
+format!( " -> bad qualiity : {} reads ({:.2}% of total)\n", self.quality, ( self.quality as f32 / self.total as f32) * 100.0).as_str()
+format!( " -> too short : {} reads ({:.2}% of total)\n", self.length, ( self.length as f32 / self.total as f32) * 100.0).as_str()
+format!( " -> N's : {} reads ({:.2}% of total)\n", self.n_s, ( self.n_s as f32 / self.total as f32) * 100.0).as_str()
+"\n"
+format!( "total reads : {} reads\n", self.total ).as_str()
+"\ncollected read counts:\n"
+self.read_types_to_string(vec!["expression reads", "antibody reads", "sample reads"]).as_str()
Expand Down
6 changes: 5 additions & 1 deletion src/singlecelldata/singlecelldata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,11 @@ impl SingleCellData{
}

if genes.max_id ==0 && ! names.is_empty() {
eprintln!( "None of the genes have data:\n{} ...", names[0..10].join( ", " ) );
let mut to = 10;
if names.len() < 10{
to = names.len() -1;
}
eprintln!( "None of the genes have data:\n{} ...", names[0..to].join( ", " ) );
}
//else { println!("{} genes requested and {} with data found", names.len(), genes.max_id); }
if names.len() != genes.max_id{
Expand Down
Binary file added testData/OneSingleCell.13151654.R1.fastq.gz
Binary file not shown.
Binary file added testData/OneSingleCell.13151654.R2.fastq.gz
Binary file not shown.
Binary file added testData/OneSingleCell.1360517.R1.fastq.gz
Binary file not shown.
Binary file added testData/OneSingleCell.1360517.R2.fastq.gz
Binary file not shown.

0 comments on commit 3139ce9

Please sign in to comment.