Skip to content

Commit

Permalink
The mapper has gotten a huge upgrade.
Browse files Browse the repository at this point in the history
From 100% match to needleman wunsch inspired global alignment.
It became significantly slower - but that part is multi processor enabled.
  • Loading branch information
stela2502 committed Jan 22, 2024
1 parent 3139ce9 commit f7e65f9
Show file tree
Hide file tree
Showing 14 changed files with 6,500 additions and 98 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rustody"
version = "1.2.4"
version = "1.2.5"
edition = "2021"
license-file = "LICENSE"
description = "Split BD rapsody fastq files into sample specific fastq files"
Expand Down
615 changes: 615 additions & 0 deletions hamming_dist_Ighm_reads.txt

Large diffs are not rendered by default.

5,668 changes: 5,668 additions & 0 deletions needleman_Ighm_reads.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ impl Analysis{
return Err(FilterError::Ns);
}
}

/*
// I have huge problems with reds that contain a lot of ploy A in the end
// None of them match using blast - so I think we can savely just dump them.
let mut bad = 0;
Expand All @@ -324,6 +326,7 @@ impl Analysis{
//eprintln!("Sequence rejected due to high polyA content \n{:?}", String::from_utf8_lossy(&read2.seq()) );
return Err(FilterError::PolyA)
}
*/

Ok(())
}
Expand Down
63 changes: 54 additions & 9 deletions src/bin/map_one_sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ static EMPTY_VEC: Vec<String> = Vec::new();
#[derive(Parser)]
#[clap(version = "0.0.3", author = "Stefan L. <stefan.lang@med.lu.se>")]
struct Opts {
/// the input R1 reads file
/// the dna to seach for
#[clap(short, long)]
sequence: String,
dna: String,
/// the fasta database containing the genes
#[clap(short, long)]
expression: Option<String>,
Expand Down Expand Up @@ -135,18 +135,63 @@ fn main() {
}

}
let mut id = 0;
samples.change_start_id( antibodies.last_count );
if opts.specie.eq("human") {
// get all the human sample IDs into this.
// GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG
// b"ATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG"
// Seams they have introduced new sample ids, too....
let sequences = [ b"ATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG", b"TGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG",
b"CGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT", b"ATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT",
b"CTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG", b"TTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC",
b"TGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT", b"CCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT",
b"GTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG", b"GCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC",
b"CGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC", b"GCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG" ];

for seq in sequences{
//seq.reverse();
let mut seq_ext = b"GTTGTCAAGATGCTACCGTTCAGAG".to_vec();
seq_ext.extend_from_slice( seq );
samples.add_small( &seq_ext, format!("Sample{id}"),EMPTY_VEC.clone() );
//sample_names.push( format!("Sample{id}") );
id +=1;
}
}
else if opts.specie.eq("mouse") {
// and the mouse ones
let sequences = [b"AAGAGTCGACTGCCATGTCCCCTCCGCGGGTCCGTGCCCCCCAAG", b"ACCGATTAGGTGCGAGGCGCTATAGTCGTACGTCGTTGCCGTGCC",
b"AGGAGGCCCCGCGTGAGAGTGATCAATCCAGGATACATTCCCGTC", b"TTAACCGAGGCGTGAGTTTGGAGCGTACCGGCTTTGCGCAGGGCT",
b"GGCAAGGTGTCACATTGGGCTACCGCGGGAGGTCGACCAGATCCT", b"GCGGGCACAGCGGCTAGGGTGTTCCGGGTGGACCATGGTTCAGGC",
b"ACCGGAGGCGTGTGTACGTGCGTTTCGAATTCCTGTAAGCCCACC", b"TCGCTGCCGTGCTTCATTGTCGCCGTTCTAACCTCCGATGTCTCG",
b"GCCTACCCGCTATGCTCGTCGGCTGGTTAGAGTTTACTGCACGCC", b"TCCCATTCGAATCACGAGGCCGGGTGCGTTCTCCTATGCAATCCC",
b"GGTTGGCTCAGAGGCCCCAGGCTGCGGACGTCGTCGGACTCGCGT", b"CTGGGTGCCTGGTCGGGTTACGTCGGCCCTCGGGTCGCGAAGGTC"];

for seq in sequences{
//seq.reverse();
//let mut seq_ext = b"GTTGTCAAGATGCTACCGTTCAGAG".to_vec();
//seq_ext.extend_from_slice( seq );
//samples.add_small( &seq_ext, format!("Sample{id}"),EMPTY_VEC.clone() );
samples.add_small( &seq.to_vec(), format!("Sample{id}"),EMPTY_VEC.clone() );
//sample_names.push( format!("Sample{id}") );
id +=1;
}

} else {
println!("Sorry, but I have no primers for species {}", &opts.specie);
std::process::exit(1)
}

// to understand the mapping a little better I now need the index written to a file
let _ = genes.write_index_txt( format!("{}/genes.index.txt", &opts.outpath) );
let _ = antibodies.write_index_txt( format!("{}/antibodies.index.txt", &opts.outpath) );
let _ = samples.write_index_txt( format!("{}/samples.index.txt", &opts.outpath) );
let _ = genes.write_index_txt( format!("{}/genes_index", &opts.outpath) );
let _ = antibodies.write_index_txt( format!("{}/antibodies_index", &opts.outpath) );
let _ = samples.write_index_txt( format!("{}/samples_index", &opts.outpath) );

eprintln!("Ill check this sequence: '{}'", &opts.sequence );
eprintln!("Ill check this dna: '{}'", &opts.dna );

let mut collected:HashMap::<usize, usize>= HashMap::new();

let mut ok = match antibodies.get( &opts.sequence.as_bytes(), &mut tool ){
let mut ok = match antibodies.get( &opts.dna.as_bytes(), &mut tool ){
Some(gene_ids) =>{
//eprintln!("gene id {gene_id:?} seq {:?}", String::from_utf8_lossy(&data[i].1) );
//eprintln!("I got an ab id {gene_id}");
Expand All @@ -171,7 +216,7 @@ fn main() {
}
};
if ! ok{
ok = match samples.get( &opts.sequence.as_bytes(), &mut tool ){
ok = match samples.get( &opts.dna.as_bytes(), &mut tool ){
Some(gene_ids) =>{
//eprintln!("gene id {gene_id:?} seq {:?}", String::from_utf8_lossy(&data[i].1) );
//eprintln!("I got an ab id {gene_id}");
Expand All @@ -198,7 +243,7 @@ fn main() {
}

if ! ok{
let _ = match genes.get( &opts.sequence.as_bytes(), &mut tool ){
let _ = match genes.get( &opts.dna.as_bytes(), &mut tool ){
Some(gene_ids) =>{
//eprintln!("gene id {gene_id:?} seq {:?}", String::from_utf8_lossy(&data[i].1) );
//eprintln!("I got an ab id {gene_id}");
Expand Down
2 changes: 1 addition & 1 deletion src/bin/quantify_rhapsody.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use rustody::ofiles::Ofiles;
/// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
#[derive(Parser)]
#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
#[clap(version = "1.2.5", author = "Stefan L. <stefan.lang@med.lu.se>")]
struct Opts {
/// the input R1 reads file
#[clap(short, long)]
Expand Down
2 changes: 1 addition & 1 deletion src/bin/quantify_rhapsody_multi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use num_cpus;
/// You need quite long R1 and R2 reads for this! (>70R1 and >70R2 \[v1\] and 52 bp reads for v2.96 and v2.384)
#[derive(Parser)]
#[clap(version = "1.2.4", author = "Stefan L. <stefan.lang@med.lu.se>")]
#[clap(version = "1.2.5", author = "Stefan L. <stefan.lang@med.lu.se>")]
struct Opts {
/// the input R1 reads file
#[clap(short, long)]
Expand Down
58 changes: 33 additions & 25 deletions src/fast_mapper/fast_mapper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ impl FastMapper{
//println!("I have now {i} mappers for the gene {name}");
if i == 0 {
//eprint!(".");
eprintln!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short? {} bp", &name.as_str(), classes, seq.len() );
println!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short? {} bp", &name.as_str(), classes, seq.len() );
}
/*else {
println!("I added {i} mappper entries for gene {name}");
Expand Down Expand Up @@ -339,14 +339,14 @@ impl FastMapper{
}

if self.mapper[entries.0 as usize].add( entries.1, (gene_id, 0), classes.clone() ){
//eprintln!("I have added a sequence! {:#b}+{:?} -> {gene_id} & {classes:?} ",entries.0, entries.1 );
//println!("I have added a sequence! {:#b}+{} -> {gene_id} & {classes:?} ",entries.0, entries.1 );
self.pos += 1;
self.names_count[gene_id] +=1;
if i < self.names_count[gene_id]{
i = self.names_count[gene_id];
}
}else {
//eprintln!("NOT - I have added a sequence!");
//println!("NOT added this sequence! {:#b}+{} -> {gene_id} & {classes:?} ",entries.0, entries.1);
self.neg +=1
}

Expand All @@ -357,7 +357,7 @@ impl FastMapper{
//println!("I have now {i} mappers for the gene {name}");
if i == 0 {
//eprint!(".");
eprintln!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short or a duplicate? {} bp", &name.as_str(), classes, seq.len() );
println!("gene {} ({:?}) does not get an entry in the fast_mapper object! - too short or a duplicate? {} bp", &name.as_str(), classes, seq.len() );
}
/*else {
println!("I added {i} mappper entries for gene {name}");
Expand Down Expand Up @@ -457,15 +457,16 @@ impl FastMapper{
return false
}

// let mut report = false;
// if genes.len() > 2{
// report = true;
// eprintln!("Lots of genes matched here?: {genes:?}");
// for (gene_id, _level) in genes.keys(){
// eprint!(" {}", self.names_store[*gene_id] );
// }
// eprint!("\n");
// }
/*let mut report = false;
if genes.len() > 2{
report = true;
eprintln!("Lots of genes matched here?: {genes:?}");
for (gene_id, _level) in genes.keys(){
eprint!(" {}", self.names_store[*gene_id] );
}
eprint!("\n");
}*/

let most_matches = genes.values().max().unwrap_or(&0);
//eprintln!("I got this as most matches {most_matches}");

Expand Down Expand Up @@ -661,6 +662,7 @@ impl FastMapper{
let mut matching_geneids = Vec::< usize>::with_capacity(10);

// entries is a Option<(u16, u64, usize)

/*let mut i = 0;
let mut small_seq :String = Default::default();
let mut large_seq: String = Default::default();*/
Expand All @@ -671,13 +673,18 @@ impl FastMapper{
// the 8bp bit is a match

//eprintln!("We are at iteration {i}");
if &entries.1.1 < &20_u8{
//will never map as the mapper fails every sequence below 20 bp - too many false positives!
continue
}

/*small_seq.clear();
large_seq.clear();
tool.u16_to_str( 8, &entries.0, &mut small_seq);
tool.u64_to_str( entries.1.1.into(), &entries.1.0, &mut large_seq);
eprintln!("We are on iteration {i} with seq {:?}-{:?})", small_seq, large_seq);
*/
println!("We are on iteration {i} with seq {}-{} ( {}-{})", small_seq, large_seq, &entries.0, &entries.1);
i +=1;*/


// if matching_geneids.len() == 1 && i > 3 {
// //eprintln!("we have one best gene identified! {}",matching_geneids[0]);
Expand All @@ -690,19 +697,19 @@ impl FastMapper{
match &self.mapper[entries.0 as usize].get( &entries.1 ){

Some( gene_ids ) => {
//eprintln!("Got some gene ids (one?): {:?}", gene_ids);
//println!("Got some gene ids (one?): {:?}", gene_ids);
for name_entry in gene_ids {
for gid in &name_entry.data{
match genes.get_mut( &gid) {
Some(gene_count) => {
//eprintln!( "Adding to existsing {} with count {gene_count}+1", gid.0);
//println!( "Adding to existsing {} with count {gene_count}+1", gid.0);
*gene_count +=1;
if *gene_count == 4 && genes.len() ==1 {
break 'main;
}
},
None => {
//eprintln!( "Adding a new gene {} with count 1 here!", gid.0);
//println!( "Adding a new gene {} with count 1 here!", gid.0);
genes.insert( gid.clone(), 1);
},
};
Expand All @@ -712,7 +719,7 @@ impl FastMapper{
},
None => {

//eprintln!("Got one no gene id in the first run:");
//eprintln!("Got no gene id in the first run get() run -> find() instead:");
match self.mapper[entries.0 as usize].find( &entries.1 ){
Some( gene_ids ) => {
//eprintln!("But in the second I got one: {gene_ids:?}");
Expand All @@ -735,7 +742,8 @@ impl FastMapper{
}
}
},
None => {
None => {
//eprintln!("And none in the find() either.");
//no_32bp_match +=1;
},
}
Expand All @@ -756,17 +764,17 @@ impl FastMapper{
return None
}

let bad_gene = "Zbtb16";
let bad = self.get_id( bad_gene.to_string()) ;
/*let bad_gene = "Ighm";
let bad = self.get_id( bad_gene.to_string()) ;*/

// check if there is only one gene //
if self.get_best_gene( &genes, &mut matching_geneids ){
// Cd3e <- keep that to fiond this place back
//println!("I have these genes: {genes:?} And am returning: {:?}", matching_geneids);
if matching_geneids[0] == bad {
/*if matching_geneids[0] == bad {
println!("read mapping to {} - should not happen here!: {:?}\n{:?}", bad_gene, self.gene_names_for_ids( &matching_geneids ),String::from_utf8_lossy(seq) );
println!("This is our total matching set: {:?}", genes);
}
}*/
return Some( matching_geneids )
}
if matching_geneids.len() > 2{
Expand Down Expand Up @@ -1080,7 +1088,7 @@ impl FastMapper{
nucl.clear();
self.tool.u64_to_str( 8, &(i as u64), &mut nucl );
if self.mapper[idx].has_data(){
match write!(ofile.buff1, "\ni: {} {} ", i, nucl ){
match write!(ofile.buff1, "\ni: {} ({:016b}) {} ", i, i,nucl ){
Ok(_) => (),
//Ok(_) => println!("8bp mapper: {:b} binary -> {:?} bytes",i, &i.to_le_bytes() ) ,
Err(_err) => return Err::<(), &str>("i could not be written"),
Expand Down
Loading

0 comments on commit f7e65f9

Please sign in to comment.