Start of an ambient RNA identification process.

stela2502 · Jan 25, 2024 · 270a83f · 270a83f
1 parent 552e5e9
commit 270a83f
Show file tree

Hide file tree

Showing 8 changed files with 33,695 additions and 2,408 deletions.
diff --git a/src/analysis.rs b/src/analysis.rs
@@ -423,14 +423,15 @@ impl Analysis{
 			                        	report.pcr_duplicates += 1 
 			                        }
 			                    }else {
-			                    	if ! gex.try_insert_multimapper(
+			                    	panic!("Multimapper have been deactivated!");
+			                    	/*if ! gex.try_insert_multimapper(
 				                    		&(*cell_id as u64),
 				                    		gene_id,
 				                    		umi,
 				                    		report
 				                    	) {
 			                    		report.pcr_duplicates += 1 
-			                    	}
+			                    	}*/
 			                    }
 
 		                    },
@@ -801,12 +802,13 @@ impl Analysis{
 				            			);
 				            		}
 				            		else {
-			            				self.gex.try_insert_multimapper(
+				            			panic!("Multimapper? - I have removed this option!");
+			            				/*self.gex.try_insert_multimapper(
 				                    		&(*cell_id as u64),
 				                    		gene_id,
 				                    		umi,
 				                    		report
-				                    	);
+				                    	);*/
 				            		}
 			                        //println!("R2 {}",String::from_utf8_lossy(&seqrec.id()).to_owned() );
 			                        //println!("R1 {}",String::from_utf8_lossy(&seqrec1.id()).to_owned() );

diff --git a/src/cellids.rs b/src/cellids.rs
@@ -8,6 +8,8 @@ use std::collections::HashSet;
 use crate::int_to_str::IntToStr;
 //use crate::sampleids::SampleIds;
 use crate::traits::CellIndex;
+use crate::fast_mapper::mapper_entries::second_seq::SecondSeq;
+
 //use std::thread;
 
 //mod cellIDsError;
@@ -83,7 +85,7 @@ impl CellIndex for CellIds{
                             good.push( (i, dist ) );
                         }
                     }
-                    if let Some(c1) = Self::best_entry( good ){
+                    if let Some(c1) = Self::best_entry( good, fuzziness as usize ){
                         ok = true;
                         c1 * max * max
                     }else {
@@ -110,7 +112,7 @@ impl CellIndex for CellIds{
                             good.push( (i, dist)  );
                         }
                     }
-                    if let Some(c2) = Self::best_entry( good ){
+                    if let Some(c2) = Self::best_entry( good, fuzziness as usize ){
                         ok = true;
                         c2 * max 
                     }else {
@@ -137,7 +139,7 @@ impl CellIndex for CellIds{
                             good.push( (i, dist ) );
                         }
                     }
-                    if let Some(c3) = Self::best_entry( good ){
+                    if let Some(c3) = Self::best_entry( good, fuzziness as usize ){
                         ok = true;
                         c3 
                     }else {
@@ -148,6 +150,14 @@ impl CellIndex for CellIds{
             if ok {
                 let tool =IntToStr::new( r1[(self.umi.0+add)..(self.umi.1+add)].to_vec(), 31 );
                 let umi:u64 = tool.into_u64();
+                fn in_list( this:u64 )-> bool{
+                    ((this == 52638_u64) | ( this == 29170_u64))| (this == 156688_u64)
+                }
+                if cell_id +1 == 7243072{
+                    //if !( (in_list(km1) | in_list(km2)) | in_list(km3) ) {
+                        println!("Cell_id 7243072 from this seq: {}/{} {}/{} {}/{}",&km1, tool.u64_to_string( 9, &km1 ), &km2, tool.u64_to_string( 9, &km2 ), &km3, tool.u64_to_string( 9, &km3 ) )
+                    //}
+                }
                 return Ok( (cell_id +1, umi) ); 
             }
             // ok no match for shift add == iteration of this loop
@@ -160,6 +170,16 @@ impl CellIndex for CellIds{
 // here the functions
 impl CellIds{
 
+    /// convert the cellid's sequences into one SecondSeq sequence of length 27 to
+    /// be able to later on compare them rather quickly.
+    pub fn as_second_seq( &self, id:u32 ) -> Option<SecondSeq> {
+        let seqs:Vec::<u64> = self.to_sequence( id );
+        let larger_seq:u64 = (seqs[0] << 18) | (seqs[1] << 9) | seqs[2];
+        let ret = SecondSeq( larger_seq, 27 );
+        println!("I created a seq {ret}");
+        return Some(ret)
+    }
+
     /// calculate the base flips between two u64 sequences
     /// stops after having detected 4 different bases.
     pub fn hamming_distance(&self, this:&u64, other: &u64) -> u32 {
@@ -594,15 +614,15 @@ impl CellIds{
 
     /// returns the first entry in the tuple that has the lowest second entry
     /// and only if there is only one tuple with the lowest second entry 
-    fn best_entry( data:Vec<(usize, u32)> ) -> Option<u32>{
+    fn best_entry( data:Vec<(usize, u32)> , max_val:usize) -> Option<u32>{
         if data.len() == 0 {
             return None
         }
         if data.len() == 1 {
             return Some(data[0].0 as u32)
         }
         else {
-            let mut counter = vec![0_u32,0_u32,0_u32];
+            let mut counter = vec![0_u32; max_val];
             let mut min = u32::MAX;
             for ( _i, mismatch ) in &data{
                 if min > *mismatch{
@@ -614,6 +634,9 @@ impl CellIds{
                 // this is great - we have a minimum overlap and that has exactly one match!
                 for ( i, mismatch ) in &data{
                     if *mismatch == min{
+                        if ( (data[*i].0 == 122688) | (data[*i].0 == 69584)  ) | (data[*i].0 == 3600) {
+                            println!("Sample ID search I have a mismatch of {min} and am returning id {}", data[*i].0);
+                        }
                         return Some( data[*i].0 as u32 )
                     }
                 }

diff --git a/src/int_to_str.rs b/src/int_to_str.rs
@@ -477,6 +477,17 @@ impl IntToStr {
 		self.shifted = 0;
 	}
 
+	/// <unsigned_int>_to_str functions do exactly that.
+	/// they regenerate the initial utf8 encoded String from a IntToStr encoded integer.
+	/// u64_to_str - convert a IntToStr encoded u64 to utf8 Vec::<u8> with a length of up to 32bp
+	pub fn u64_to_string ( &self, kmer_size:usize, km:&u64) ->String {
+
+	    let array = km.to_le_bytes();
+	    let mut data: String = "".to_string();
+
+	    self.u8_array_to_str( kmer_size, (&array).to_vec(), &mut data);
+	    return data
+	}
 
 	/// <unsigned_int>_to_str functions do exactly that.
 	/// they regenerate the initial utf8 encoded String from a IntToStr encoded integer.

diff --git a/src/singlecelldata/cell_data.rs → src/singlecelldata/cell_data/cell_data.rs b/src/singlecelldata/cell_data.rs → src/singlecelldata/cell_data/cell_data.rs
@@ -2,6 +2,7 @@ use std::collections::BTreeMap;
 use std::collections::HashSet;
 
 use std::collections::hash_map::DefaultHasher;
+use crate::singlecelldata::cell_data::GeneUmiHash;
 use std::hash::Hash;
 use std::hash::Hasher;
 
@@ -14,7 +15,7 @@ use crate::fast_mapper::FastMapper;
 pub struct CellData{
     //pub kmer_size: usize,
     pub name: u64,
-    pub genes: BTreeMap<usize, HashSet<u64>>, // I want to know how many times I got the same UMI
+    pub genes: BTreeMap<GeneUmiHash, usize>, // I want to know how many times I got the same UMI
     pub genes_with_data: HashSet<String>, // when exporting genes it is helpfule to know which of the possible genomic genes actually have an expression reported...
     pub total_reads: BTreeMap<usize, usize>, // instead of the per umi counting
     //pub cell_counts: BTreeMap<usize, usize>,
@@ -70,7 +71,7 @@ impl CellData{
     }
 
     pub fn deep_clone(&self) -> CellData {
-        let cloned_genes: BTreeMap<usize, HashSet<u64>> = self
+        let cloned_genes: BTreeMap<GeneUmiHash , usize> = self
             .genes
             .iter()
             .map(|(k, v)| (*k, v.clone())) // Clone each HashSet within the BTreeMap
@@ -95,67 +96,67 @@ impl CellData{
 
     /// adds the other values into this object
     pub fn merge(&mut self, other:&CellData ){
-        let mut too_much:usize;
+
         self.total_umis += other.total_umis;
-        for (gene_id, umis) in &other.genes {
-            too_much = 0;
-            for umi in umis {
-                self.add( *gene_id, *umi );
-                too_much += 1;
-            }
-            match self.total_reads.get_mut( &gene_id ){
-                Some( val ) => {
-                    let add = other.total_reads.get( &gene_id ).unwrap_or(&too_much) - too_much;
-                    *val += add
+        let mut too_much = BTreeMap::<usize, usize>::new();
+
+        for (gene_umi_combo, counts) in &other.genes {
+            match self.genes.get_mut( gene_umi_combo ) {
+                Some(count) => {
+                    *count += counts;
+                    let counter = too_much.entry(gene_umi_combo.0).or_insert(0);
+                    *counter += 1;
                 },
-                None => panic!("merge could not copy the total gene values for gene {gene_id}",  ),
-            };
+                None => {
+                    self.genes.insert( *gene_umi_combo, 1);
+                }
+            }
+        }
+        for (gene_id, count) in &other.total_reads {
+            let double_counts = *too_much.entry(*gene_id).or_insert_with(|| 0);
+            let mine = self.total_reads.entry(*gene_id).or_insert(0);
+            *mine += count - double_counts;
         }
-        for (hash, (umis, ids) ) in &other.multimapper {
-            too_much = 0;
+        /*for (hash, (umis, ids) ) in &other.multimapper {
+            let double_counts = 0;
             for umi in umis {
                 self.add_multimapper( ids.clone(), *umi );
-                too_much += 1;
+                double_counts += 1;
             }
             match self.total_reads.get_mut( &(*hash as usize) ){
                 Some( val ) => {
-                    let add = other.total_reads.get( &(*hash as usize) ).unwrap_or(&too_much) - too_much;
+                    let add = other.total_reads.get( &(*hash as usize) ).unwrap_or(&double_counts) - double_counts;
                     *val += add
                 },
                 None => panic!("merge could not copy the total gene values for multimapper hash {hash}",  ),
             };
-        }
+        }*/
     }
 
     // returns false if the gene/umi combo has already been recorded!
     pub fn add(&mut self, geneid: usize, umi:u64 ) -> bool{
         //println!("adding gene id {}", geneid );
-        return match self.genes.get_mut( &geneid ) {
+        let gh = GeneUmiHash( geneid, umi);
+        return match self.genes.get_mut( &gh ) {
             Some( gene ) => {
-                if gene.insert( umi ){
-                    match self.total_reads.get_mut( &geneid ){
-                        Some( count ) => *count += 1,
-                        None => panic!("This must not happen - libraray error"),
-                    }
-                    //eprintln!("Good data");
-                    self.total_umis += 1;
-                    return true
-                }
-                //eprintln!("UMI already known {umi}");
+                *gene +=1;
                 false
             }, 
             None => {
-                let mut gc:HashSet<u64> = HashSet::new(); //to store the umis
-                gc.insert( umi );
-                self.total_reads.insert( geneid, 1 );
-                self.genes.insert( geneid, gc );
-                //eprintln!("Good data2");
+                self.genes.insert(gh, 1);
+                match self.total_reads.get_mut( &geneid ){
+                    Some( count ) => *count += 1,
+                    None => {
+                        self.total_reads.insert( geneid, 1 );
+                    }
+                };
                 self.total_umis += 1;
                 true
             }
         }
     }
 
+    /*
     // returns false if the gene/umi combo has already been recorded!
     pub fn add_multimapper(&mut self, geneids: Vec::<usize>, umi:u64 ) -> bool{
         //println!("adding gene id {}", geneid );
@@ -187,7 +188,7 @@ impl CellData{
                 true
             }
         }
-    }
+    }*/
 
     pub fn n_umi( &self, _gene_info:&FastMapper, _gnames: &Vec<String> ) -> usize {
         return self.total_umis;
@@ -222,6 +223,7 @@ impl CellData{
         n
     }
 
+    /*
     pub fn fix_multimappers( &mut self ){
         for (umis, geneids ) in self.multimapper.values(){
             eprintln!("I have a multimapper group: {:?}", geneids);
@@ -293,23 +295,14 @@ impl CellData{
             };
         }
         self.multimapper.clear();
-    }
+    }*/
 
     pub fn n_umi_4_gene( &self, gene_info:&FastMapper, gname:&String) -> usize {
-        let mut n = 0;
-
         let id = match gene_info.names.get( gname ){
             Some(g_id) => g_id,
-            None => panic!("I could not resolve the gene name {gname}" ),
+            None => panic!("Programming error: I could not resolve the gene name {gname}" ),
         };
-        n += match self.genes.get( id  ){
-            Some( map ) => {
-                map.len()
-            }
-            None => 0
-        };
-        //if n > 0 { println!("I got {} umis for gene {}", n, gname ); }
-        n
+        *self.total_reads.get( id  ).unwrap_or(&0)
     }
 
 

diff --git a/src/singlecelldata/cell_data/gene_umi_hash.rs b/src/singlecelldata/cell_data/gene_umi_hash.rs
@@ -0,0 +1,49 @@
+use std::hash::{Hash, Hasher};
+use core::fmt;
+use std::cmp::{max, Ord};
+
+#[derive(Debug, Copy, Clone)]
+pub struct GeneUmiHash( pub usize, pub u64);
+impl PartialEq for GeneUmiHash {
+    fn eq(&self, other: &Self) -> bool {
+    	return (self.0 == other.0) && (self.1 == other.1)
+    }
+}
+
+
+impl Ord for GeneUmiHash {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // Compare based on the first field (usize) first
+        match self.0.cmp(&other.0) {
+            std::cmp::Ordering::Equal => {
+                // If the first fields are equal, compare based on the second field (u64)
+                self.1.cmp(&other.1)
+            }
+            ordering => ordering,
+        }
+    }
+}
+
+impl PartialOrd for GeneUmiHash {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Eq for GeneUmiHash {}
+
+impl Hash for GeneUmiHash {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.hash(state);
+        self.1.hash(state);
+    }
+}
+
+// Implementing Display trait for GeneUmiHash
+impl fmt::Display for GeneUmiHash {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "GeneUmiHash (gene_id: {}, umi: {})", self.0, self.1 )
+    }
+}
+
+
diff --git a/src/singlecelldata/cell_data/mod.rs b/src/singlecelldata/cell_data/mod.rs
@@ -0,0 +1,8 @@
+// singlecelldata/cell_data/mod.rs
+
+pub mod cell_data;
+pub mod gene_umi_hash;
+
+pub use cell_data::CellData as CellData;
+pub use gene_umi_hash::GeneUmiHash as GeneUmiHash;
+