From c5f1bf094b3d069750812756176d7837c6421ba9 Mon Sep 17 00:00:00 2001 From: Stefan Lang Date: Tue, 10 Dec 2024 10:24:41 +0100 Subject: [PATCH] indexed_genes has become a stand alone index replacement Used in the quantify_gtf package. --- src/singlecelldata/cell_data/cell_data.rs | 32 +++++++++++++++++++++++ src/singlecelldata/indexed_genes.rs | 22 +++++++++++++--- src/singlecelldata/singlecelldata.rs | 31 ++++++++++++++++++++++ 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/singlecelldata/cell_data/cell_data.rs b/src/singlecelldata/cell_data/cell_data.rs index 9745d2b..5248e90 100644 --- a/src/singlecelldata/cell_data/cell_data.rs +++ b/src/singlecelldata/cell_data/cell_data.rs @@ -137,6 +137,38 @@ impl CellData{ *mine += count - double_counts; } } + + /// adds the other values into this object + pub fn merge_re_id_genes(&mut self, other: &CellData, other_genes: &Vec:: ) { + self.total_umis += other.total_umis; + let mut too_much = BTreeMap::::new(); + + for (gene_umi_combo, counts) in &other.genes { + // re-id the UMI count touple + let re_ided_umi_combo= GeneUmiHash( + other_genes[gene_umi_combo.0], + gene_umi_combo.1 + ); + match self.genes.entry(re_ided_umi_combo) { + std::collections::btree_map::Entry::Occupied(mut entry) => { + *entry.get_mut() += counts; + let counter = too_much.entry(re_ided_umi_combo.0).or_insert(0); + *counter += 1; + } + std::collections::btree_map::Entry::Vacant(entry) => { + entry.insert(*counts); + } + } + } + + //let other_total_reads = std::mem::take(&mut other.total_reads); + for (gene_id, count) in &other.total_reads { + let double_counts = too_much.get(&gene_id).unwrap_or(&0); + let mine = self.total_reads.entry(*gene_id).or_insert(0); + *mine += count - double_counts; + } + } + /* my old function pub fn merge(&mut self, other:&CellData ){ diff --git a/src/singlecelldata/indexed_genes.rs b/src/singlecelldata/indexed_genes.rs index 8652fdd..d64f78a 100644 --- a/src/singlecelldata/indexed_genes.rs +++ b/src/singlecelldata/indexed_genes.rs @@ -25,13 +25,25 @@ impl IndexedGenes{ Self{ names: BTreeMap::new(), ids_to_name: Vec::with_capacity( 80_000 ), - offset : match( off ){ + offset : match off { Some(o) => o, None => 0, }, } } + + + /// merges the other list into this list and returns a Vec of new ids for the other data + pub fn merge (&mut self, other: &IndexedGenes) -> Vec{ + let mut rec = Vec::::with_capacity( other.len() ); + for gene in &other.ids_to_name { + rec.push ( self.get_gene_id( gene ) ) + } + rec + } + + /// Returns the gene ID for the given gene name. /// If the gene does not exist, assigns a new ID. pub fn get_gene_id(&mut self, gene: &str) -> usize { @@ -51,11 +63,15 @@ impl IndexedGenes{ new_id } + pub fn len( &self ) -> usize{ + self.names.len() + } + pub fn new ( data: &BTreeMap, offset: usize ) -> Self { let mut ids_to_name = vec![String::new(); data.len() + offset ]; let mut names = BTreeMap::new(); let mut max_entry = 0_usize; - let _ =data.iter().for_each(| (_,val) | if *val > max_entry {max_entry = *val}); + let _ = data.iter().for_each(| (_,val) | if *val > max_entry {max_entry = *val}); println!("I have gotten a max extry of {max_entry} and have a vec with {} available spaces",ids_to_name.len() ); if max_entry > (data.len() + offset) { panic!("This is a library error - why is my largest id {max_entry} when it should be {}?!?", data.len() + offset); @@ -73,7 +89,7 @@ impl IndexedGenes{ } /// returns the external id - the ID used in the single_cell_data class. - pub fn ids_for_gene_names(&self, names:&Vec ) -> Vec{ + pub fn ids_for_gene_names(&self, names:&Vec:: ) -> Vec{ let mut ret = Vec::::with_capacity( names.len() ); for name in names{ if let Some(id) = self.names.get( name ){ diff --git a/src/singlecelldata/singlecelldata.rs b/src/singlecelldata/singlecelldata.rs index 40be9d4..2b4b866 100644 --- a/src/singlecelldata/singlecelldata.rs +++ b/src/singlecelldata/singlecelldata.rs @@ -159,6 +159,37 @@ impl SingleCellData{ } } } + + /// merge two SingleCellData objects - with a different gene list!! + pub fn merge_re_id_genes(&mut self, other: SingleCellData, other_ids: &Vec:: ) { + if ! other.is_empty() { + // Reset all internal measurements + self.checked = false; + self.passing = 0; + self.genes_with_data.clear(); + + for other_cell in other.data.iter().flat_map(|map| map.values()) { + let index = self.to_key(&other_cell.name); // Extracting the first u8 of the u64 key + match self.data[index].entry(other_cell.name) { + std::collections::btree_map::Entry::Occupied(mut entry) => { + // If cell exists, merge with existing cell + //println!( "merge cell {} {}", other_cell.name, other_cell); + let cell = entry.get_mut(); + cell.merge_re_id_genes(other_cell, &other_ids); + } + std::collections::btree_map::Entry::Vacant(entry) => { + // If cell doesn't exist, insert new cell + //println!( "steel cell {} {}", other_cell.name, other_cell); + let mut cell = CellData::new( other_cell.name.clone() ); + cell.merge_re_id_genes( other_cell, &other_ids ); + entry.insert( cell ); // Move ownership + } + } + } + + } + } + /* // my old fucntion: pub fn merge( &mut self, other:&SingleCellData ){ // reset all internal measurements