Skip to content

Commit

Permalink
The test index gives now comparable result to the fasta reference.
Browse files Browse the repository at this point in the history
Only slightly improved with unspliced transcripts detected.
  • Loading branch information
stela2502 committed Nov 13, 2023
1 parent c652550 commit 9c60fd2
Show file tree
Hide file tree
Showing 10 changed files with 267 additions and 196 deletions.
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,23 @@ OPTIONS:
```

```
./target/release/create_index -n 12 --transcript "gene_name" -f testData/mapperTest/Juan_genes.fa.gz -g testData/mapperTest/Juan_genes.fixed.gtf.gz -o testData/mapperTest/index > testData/mapperTest/mRNA.fa
./target/release/create_index -n 12 -f testData/mapperTest/Juan_genes.fa.gz -g testData/mapperTest/Juan_genes.fixed.gtf.gz -o testData/mapperTest/index > testData/mapperTest/mRNA.fa
```

And does that work?

```
target/release/quantify_rhapsody_multi -r testData/cells.1.Rhapsody_SV_index1_S1_R1_001.fastq.gz -f testData/cells.1.Rhapsody_SV_index1_S1_R2_001.fastq.gz -o testData/BD_results/Rustody_S1 -s mouse -e testData/2276_20220531_chang_to_rpl36a_amplicons.fasta -a testData/MyAbSeqPanel.fasta -m 200 -v v2.96
```

The same as the result of that?

```
target/release/quantify_rhapsody_multi -r testData/cells.1.Rhapsody_SV_index1_S1_R1_001.fastq.gz -f testData/cells.1.Rhapsody_SV_index1_S1_R2_001.fastq.gz -o testData/BD_results/Rustody_S1_index -s mouse -i testData/mapperTest/index/ -e testData/addOn.fa -a testData/MyAbSeqPanel.fasta -m 200 -v v2.96
```

In version 1.2.2 quantify_rhapsody_multi foudn 8 more cells using the index than using the fasta data only. This might be due to the \_int unprocessed transcripts being recoreded using the index. In total 31 of these internal transcript were recorded.

The program by default creates transcipt specific indices. Which is kind of counter intuitive and could be changed later on if necessary.

**Performance**
Expand Down Expand Up @@ -427,14 +441,15 @@ Little less than 2h. Let's check how much time BD's version does need...
## And BD software for the S2 sample


Repeated runs on my desktop did faile after up to 38h runtime.
Repeated runs on my desktop did fale after up to 38h runtime.
Therefore I report the seven bridges run time here: 7 hours, 39 minutes.
That is significantly faster than the 38h on my system, but it nevertheless is ~4x slower than my Rhapsody implementation on a single core.

Why single core? The system IO does affect the Rust implementation.
The splitting of large gzipped fastq files can easiliy take more than halve an hour and given the fast processing time
the additional work to implement mutiprocessor capabilities for Rustody seams unnecessary.

Actually single core is history - new multicore processing is working WAY faster now.

## The last run on my desktop:

Expand Down
5 changes: 4 additions & 1 deletion src/bin/create_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ fn process_lines ( gtf: &str, re_gene_name: &Regex,
// };
// }
}

genes
}

Expand Down Expand Up @@ -366,6 +367,7 @@ fn main() {

let genes_hash = process_lines( &opts.gtf, &re_gene_name, &re_gene_id, &re_transcript_id );
let genes: Vec<Gene> = genes_hash.into_values().collect();
report.stop_file_io_time();

let results:Vec<FastMapper> = genes.par_chunks(genes.len() / num_threads + 1) // Split the data into chunks for parallel processing
.map(|data_split| {
Expand Down Expand Up @@ -399,8 +401,9 @@ fn main() {
//report.merge( &gex.1 );
}

index.make_index_te_ready();
//index.make_index_te_ready();
report.stop_single_processor_time();

let (h,m,s,_ms) = MappingInfo::split_duration( report.absolute_start.elapsed().unwrap() );

eprintln!("For {} sequence regions we needed {} h {} min and {} sec to process.",max_dim, h, m, s );
Expand Down
2 changes: 1 addition & 1 deletion src/bin/create_index_te.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ fn process_lines ( lines:&&[String], index: &mut FastMapper ,seq_records: &HashM
parts[4].to_string(),
parts[6].to_string(),
transcript_id.to_string(), // this will be the id we add to the index
vec![gene_name.to_string(), family_name.to_string(), class_name.to_string()],
vec![transcript_id.to_string(), gene_name.to_string(), family_name.to_string(), class_name.to_string()],
);
gene.add_exon( parts[3].to_string(),parts[4].to_string());

Expand Down
7 changes: 7 additions & 0 deletions testData/addOn.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
>tdTomato|ENSMUST99999999999.7| site_id: user_seq_1, AMPLICON
CGCTGATCTACAAGGTGAAGATGCGCGGCACCAACTTCCCCCCCGACGGCCCCGTAATGC
AGAAGAAGACCATGGGCTGGGAGGCCTCCACCGAGCGCCTGTACCCCCGCGACGGCGTGC
TGAAGGGCGAGATCCACCAGGCCCTGAAGCTGAAGGACGGCGGCCACTACCTGGTGGAGT
TCAAGACCATCTACATGGCCAAGAAGCCCGTGCAACTGCCCGGCTACTACTACGTGGACA
CCAAGCTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAGCGCT
CCGAGGGCCGCCACCACCTGTTCCTGTACGGCATGGACGAGCTGTACAAGTAG
1 change: 1 addition & 0 deletions this/src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ impl Analysis{

pub fn write_index(&mut self, path:&String ){
self.genes.write_index( path.to_string() ).unwrap();
self.genes.write_index_txt( path.to_string() ).unwrap();
}


Expand Down
131 changes: 86 additions & 45 deletions this/src/fast_mapper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pub struct FastMapper{
pub names : BTreeMap<std::string::String, usize>, // gene name and gene id
pub names4sparse: BTreeMap<std::string::String, usize>, // gene name and gene id
pub names_store: Vec<String>, // store gene names as vector
pub names_count: Vec<usize>, // store the number of mappers for this gene
pub max_id: usize,// hope I get the ids right this way...
pub last_count: usize,
pub with_data: usize,
Expand All @@ -72,6 +73,7 @@ impl FastMapper{
let names = BTreeMap::<std::string::String, usize>::new();
let names4sparse = BTreeMap::<std::string::String, usize>::new();
let names_store = Vec::with_capacity( allocate );
let names_count = Vec::with_capacity( allocate );
let max_id = 0;
let last_count = 0;
let with_data = 0;
Expand All @@ -93,6 +95,7 @@ impl FastMapper{
names,
names4sparse,
names_store,
names_count,
max_id,
last_count,
with_data,
Expand All @@ -108,6 +111,7 @@ impl FastMapper{
self.last_count = new_start;
for _i in 0..new_start{
self.names_store.push("na".to_string());
self.names_count.push(0);
}
}

Expand All @@ -121,9 +125,10 @@ impl FastMapper{
for idx in 0..name_entry.classes.len(){
// Now I need to copy the gene names from the other object to my own.
let gene_names = other.gene_names_for_ids( &name_entry.classes[idx] );
let gene_name = other.gene_names_for_ids( &vec![ name_entry.data[idx].0 ]);
// And get my own ids for these names
//let gene_ids = self.ids_for_gene_names( &gene_names );
let _ = self.incorporate_match_combo( primary_id, *secondary_match as usize, gene_names[0].clone(), gene_names );
let _ = self.incorporate_match_combo( primary_id, *secondary_match as usize, gene_name[0].clone(), gene_names );
}
}
}
Expand All @@ -144,6 +149,7 @@ impl FastMapper{
if ! self.names.contains_key( name ){
self.names.insert( name.clone(), self.max_id );
self.names_store.push( name.clone() );
self.names_count.push( 0 );
self.max_id += 1;
}
let id = self.get_id( name.to_string() ).clone();
Expand All @@ -157,6 +163,7 @@ impl FastMapper{
if ! self.names.contains_key( &name ){
self.names.insert( name.clone(), self.max_id );
self.names_store.push( name.clone() );
self.names_count.push( 0 );
self.max_id += 1;
}
let gene_id = self.get_id( name.to_string() ).clone();
Expand All @@ -165,13 +172,12 @@ impl FastMapper{
// will add in the next step so
self.with_data +=1;
}
match self.mapper[initial_match].add( secondary_match.try_into().unwrap(), ( gene_id, 0), classes ){
Ok(_) => self.pos += 1,
Err(_) => {
//eprintln!("incorporate_match_combo - gene seq was already registered {}", name);
self.neg +=1
},
};
if self.mapper[initial_match].add( secondary_match.try_into().unwrap(), ( gene_id, 0), classes ){
self.pos += 1
}else{
self.neg +=1
}

Ok( () )
}

Expand All @@ -182,17 +188,17 @@ impl FastMapper{


pub fn add(&mut self, seq: &Vec<u8>, name: std::string::String, class_ids: Vec<String> ) -> usize{

//println!("fast_mapper adds this genes: {} of length {}", name, seq.len() );

if ! self.names.contains_key( &name ){
self.names.insert( name.clone(), self.max_id );
self.names_store.push( name.clone() );
self.names_count.push( 0 );
self.max_id += 1;
}

let classes = self.ids_for_gene_names( &class_ids );
let gene_id = self.get_id( name.to_string() );
//eprintln!("fast_mapper: I have {} -> {}", name, gene_id);

//let mut long:u64;
//let mut short:u16;
Expand All @@ -211,7 +217,7 @@ impl FastMapper{
let mut i =0;
//let mut short = String::from("");
//let mut long = String::from("");
let mut n =20;
let mut n = 50;

while let Some(entries) = self.tool.next(){
n -=1;
Expand All @@ -229,25 +235,35 @@ impl FastMapper{
// too short second entry
break;
}

if entries.0 > 65534{
continue; // this would be TTTTTTTT and hence not use that!
}
if ! self.mapper[entries.0 as usize].has_data() {
// will add in the next step so
self.with_data +=1;
}
// if entries.2 < self.kmer_len{
// println!("adding a smaller than expected sequence for gene {name}/{gene_id} ({})", entries.2);
// }
match self.mapper[entries.0 as usize].add( entries.1, (gene_id, 0), classes.clone() ){
Ok(_) => self.pos += 1,
Err(_) => self.neg +=1,
};
i+=1;
if entries.2 < self.kmer_len{
//eprintln!("Too small sequence");
continue;
}
if self.mapper[entries.0 as usize].add( entries.1, (gene_id, 0), classes.clone() ){
//eprintln!("I have added a sequence!");
self.pos += 1;
self.names_count[gene_id] +=1;
i+=1;
}else {
//eprintln!("NOT - I have added a sequence!");
self.neg +=1
}

}

//eprintln!("fast_mapper adds this genes: {} of length {} ({}, {})", name, seq.len(), self.pos, self.neg );

//println!("I have now {i} mappers for the gene {name}");
if i == 0 {
eprint!(".");
//eprintln!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short? {} bp", &name.as_str(), classes, seq.len() );
//eprint!(".");
eprintln!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short? {} bp", &name.as_str(), classes, seq.len() );
}
/*else {
println!("I added {i} mappper entries for gene {name}");
Expand All @@ -269,8 +285,8 @@ impl FastMapper{
/// and only the contents of the least class will be used. The original gene name will also be used - that does not need to be part of the ids.
pub fn make_index_te_ready( &mut self ) {

//println!("Before make_index_te_ready:");
//self.print();
eprintln!("Before make_index_te_ready:");
self.eprint();
for mapper_entry in self.mapper.iter_mut() {
if mapper_entry.has_data() {
mapper_entry.collapse_classes();
Expand All @@ -284,8 +300,8 @@ impl FastMapper{
}
}

//println!("\nAfter:");
//self.print();
eprintln!("\nAfter:");
self.eprint();
}
/// [entries with values, total second level entries, total single gene first/second pairs, total >1 first/second level pairs]
pub fn info( &self ) -> [usize;4]{
Expand Down Expand Up @@ -339,10 +355,22 @@ impl FastMapper{

fn get_best_gene( &self, genes:&HashMap::<(usize, usize), usize>, ret: &mut Vec::<usize> ) -> bool{
ret.clear();
//eprintln!("I suppose we have a problem with the level being 1 and not 0 here? {genes:?}");

// let mut report = false;
// if genes.len() > 2{
// report = true;
// eprintln!("Lots of genes matched here?: {genes:?}");
// for (gene_id, _level) in genes.keys(){
// eprint!(" {}", self.names_store[*gene_id] );
// }
// eprint!("\n");
// }
if genes.len() == 1 {
if let Some((key, _)) = genes.iter().next() {
ret.push(key.0.clone());
// if report {
// eprintln!("1 This was selected as good: {} or {}\n",key.0, self.names_store[key.0] )
// }
return true
}
}
Expand Down Expand Up @@ -371,15 +399,24 @@ impl FastMapper{
}
if good.len() ==1 {
ret.push( good[0] );
// if report {
// eprintln!("2 This was selected as good: {} or {}\n",good[0], self.names_store[good[0]] )
// }
return true
}
if genes_with_best_matches == 1{
ret.push( best_gene );
// if report {
// eprintln!("3 This was selected as good: {} or {}\n",best_gene, self.names_store[best_gene] )
// }
return true
}


}
// if report {
// eprintln!("! No good gene identified!\n" )
// }
return false

}
Expand Down Expand Up @@ -718,6 +755,24 @@ impl FastMapper{
self.with_data = u64::from_le_bytes( buff_u64 ) as usize;
eprintln!("reading {} entries",self.with_data );

eprintln!("Reading the gene names\n");
for name in ifile.buff2.lines() {
//println!("I read this name: {name:?}");
//let mut buff = [0_u8 ;8 ];
match name {
Ok(n) => {
//eprintln!("the name of the gene: {n}\n");
if ! self.names.contains_key( &n ){
self.names.insert( n.clone(), self.max_id );
self.names_store.push( n.clone() );
self.names_count.push(0); // will be populated later.
self.max_id += 1;
}
},
Err(e) => panic!("I could not read from the genes table {e:?}"),
};
}

for i in 0..self.with_data {
// read in the single mapper elements
// first 2 - kmer position
Expand Down Expand Up @@ -750,29 +805,15 @@ impl FastMapper{
// ifile.buff1.read_exact(&mut buff_u64).unwrap();
// sig_bits= usize::from_le_bytes( buff_u64 );
// this should never throw an error.
let _ = self.mapper[idx].add( kmer, (gene_id, gene_level), EMPTY_VEC.clone() );
if self.mapper[idx].add( kmer, (gene_id, gene_level), EMPTY_VEC.clone() ){
self.names_count[gene_id] +=1;
}

}
}
}

eprintln!("Reading the gene names\n");
for name in ifile.buff2.lines() {
//println!("I read this name: {name:?}");
//let mut buff = [0_u8 ;8 ];
match name {
Ok(n) => {
//eprintln!("the name of the gene: {n}\n");
if ! self.names.contains_key( &n ){
self.names.insert( n.clone(), self.max_id );
self.names_store.push( n.clone() );
self.max_id += 1;
}
},
Err(e) => panic!("I could not read from the genes table {e:?}"),
};
}


if self.with_data == 0 {
panic!("No data read!");
}
Expand Down
Loading

0 comments on commit 9c60fd2

Please sign in to comment.