Skip to content

Commit

Permalink
Start of an ambient RNA identification process.
Browse files Browse the repository at this point in the history
  • Loading branch information
stela2502 committed Jan 25, 2024
1 parent 552e5e9 commit 270a83f
Show file tree
Hide file tree
Showing 8 changed files with 33,695 additions and 2,408 deletions.
10 changes: 6 additions & 4 deletions src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -423,14 +423,15 @@ impl Analysis{
report.pcr_duplicates += 1
}
}else {
if ! gex.try_insert_multimapper(
panic!("Multimapper have been deactivated!");
/*if ! gex.try_insert_multimapper(
&(*cell_id as u64),
gene_id,
umi,
report
) {
report.pcr_duplicates += 1
}
}*/
}

},
Expand Down Expand Up @@ -801,12 +802,13 @@ impl Analysis{
);
}
else {
self.gex.try_insert_multimapper(
panic!("Multimapper? - I have removed this option!");
/*self.gex.try_insert_multimapper(
&(*cell_id as u64),
gene_id,
umi,
report
);
);*/
}
//println!("R2 {}",String::from_utf8_lossy(&seqrec.id()).to_owned() );
//println!("R1 {}",String::from_utf8_lossy(&seqrec1.id()).to_owned() );
Expand Down
33 changes: 28 additions & 5 deletions src/cellids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use std::collections::HashSet;
use crate::int_to_str::IntToStr;
//use crate::sampleids::SampleIds;
use crate::traits::CellIndex;
use crate::fast_mapper::mapper_entries::second_seq::SecondSeq;

//use std::thread;

//mod cellIDsError;
Expand Down Expand Up @@ -83,7 +85,7 @@ impl CellIndex for CellIds{
good.push( (i, dist ) );
}
}
if let Some(c1) = Self::best_entry( good ){
if let Some(c1) = Self::best_entry( good, fuzziness as usize ){
ok = true;
c1 * max * max
}else {
Expand All @@ -110,7 +112,7 @@ impl CellIndex for CellIds{
good.push( (i, dist) );
}
}
if let Some(c2) = Self::best_entry( good ){
if let Some(c2) = Self::best_entry( good, fuzziness as usize ){
ok = true;
c2 * max
}else {
Expand All @@ -137,7 +139,7 @@ impl CellIndex for CellIds{
good.push( (i, dist ) );
}
}
if let Some(c3) = Self::best_entry( good ){
if let Some(c3) = Self::best_entry( good, fuzziness as usize ){
ok = true;
c3
}else {
Expand All @@ -148,6 +150,14 @@ impl CellIndex for CellIds{
if ok {
let tool =IntToStr::new( r1[(self.umi.0+add)..(self.umi.1+add)].to_vec(), 31 );
let umi:u64 = tool.into_u64();
fn in_list( this:u64 )-> bool{
((this == 52638_u64) | ( this == 29170_u64))| (this == 156688_u64)
}
if cell_id +1 == 7243072{
//if !( (in_list(km1) | in_list(km2)) | in_list(km3) ) {
println!("Cell_id 7243072 from this seq: {}/{} {}/{} {}/{}",&km1, tool.u64_to_string( 9, &km1 ), &km2, tool.u64_to_string( 9, &km2 ), &km3, tool.u64_to_string( 9, &km3 ) )
//}
}
return Ok( (cell_id +1, umi) );
}
// ok no match for shift add == iteration of this loop
Expand All @@ -160,6 +170,16 @@ impl CellIndex for CellIds{
// here the functions
impl CellIds{

/// convert the cellid's sequences into one SecondSeq sequence of length 27 to
/// be able to later on compare them rather quickly.
pub fn as_second_seq( &self, id:u32 ) -> Option<SecondSeq> {
let seqs:Vec::<u64> = self.to_sequence( id );
let larger_seq:u64 = (seqs[0] << 18) | (seqs[1] << 9) | seqs[2];
let ret = SecondSeq( larger_seq, 27 );
println!("I created a seq {ret}");
return Some(ret)
}

/// calculate the base flips between two u64 sequences
/// stops after having detected 4 different bases.
pub fn hamming_distance(&self, this:&u64, other: &u64) -> u32 {
Expand Down Expand Up @@ -594,15 +614,15 @@ impl CellIds{

/// returns the first entry in the tuple that has the lowest second entry
/// and only if there is only one tuple with the lowest second entry
fn best_entry( data:Vec<(usize, u32)> ) -> Option<u32>{
fn best_entry( data:Vec<(usize, u32)> , max_val:usize) -> Option<u32>{
if data.len() == 0 {
return None
}
if data.len() == 1 {
return Some(data[0].0 as u32)
}
else {
let mut counter = vec![0_u32,0_u32,0_u32];
let mut counter = vec![0_u32; max_val];
let mut min = u32::MAX;
for ( _i, mismatch ) in &data{
if min > *mismatch{
Expand All @@ -614,6 +634,9 @@ impl CellIds{
// this is great - we have a minimum overlap and that has exactly one match!
for ( i, mismatch ) in &data{
if *mismatch == min{
if ( (data[*i].0 == 122688) | (data[*i].0 == 69584) ) | (data[*i].0 == 3600) {
println!("Sample ID search I have a mismatch of {min} and am returning id {}", data[*i].0);
}
return Some( data[*i].0 as u32 )
}
}
Expand Down
11 changes: 11 additions & 0 deletions src/int_to_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,17 @@ impl IntToStr {
self.shifted = 0;
}

/// <unsigned_int>_to_str functions do exactly that.
/// they regenerate the initial utf8 encoded String from a IntToStr encoded integer.
/// u64_to_str - convert a IntToStr encoded u64 to utf8 Vec::<u8> with a length of up to 32bp
pub fn u64_to_string ( &self, kmer_size:usize, km:&u64) ->String {

let array = km.to_le_bytes();
let mut data: String = "".to_string();

self.u8_array_to_str( kmer_size, (&array).to_vec(), &mut data);
return data
}

/// <unsigned_int>_to_str functions do exactly that.
/// they regenerate the initial utf8 encoded String from a IntToStr encoded integer.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::collections::BTreeMap;
use std::collections::HashSet;

use std::collections::hash_map::DefaultHasher;
use crate::singlecelldata::cell_data::GeneUmiHash;
use std::hash::Hash;
use std::hash::Hasher;

Expand All @@ -14,7 +15,7 @@ use crate::fast_mapper::FastMapper;
pub struct CellData{
//pub kmer_size: usize,
pub name: u64,
pub genes: BTreeMap<usize, HashSet<u64>>, // I want to know how many times I got the same UMI
pub genes: BTreeMap<GeneUmiHash, usize>, // I want to know how many times I got the same UMI
pub genes_with_data: HashSet<String>, // when exporting genes it is helpfule to know which of the possible genomic genes actually have an expression reported...
pub total_reads: BTreeMap<usize, usize>, // instead of the per umi counting
//pub cell_counts: BTreeMap<usize, usize>,
Expand Down Expand Up @@ -70,7 +71,7 @@ impl CellData{
}

pub fn deep_clone(&self) -> CellData {
let cloned_genes: BTreeMap<usize, HashSet<u64>> = self
let cloned_genes: BTreeMap<GeneUmiHash , usize> = self
.genes
.iter()
.map(|(k, v)| (*k, v.clone())) // Clone each HashSet within the BTreeMap
Expand All @@ -95,67 +96,67 @@ impl CellData{

/// adds the other values into this object
pub fn merge(&mut self, other:&CellData ){
let mut too_much:usize;

self.total_umis += other.total_umis;
for (gene_id, umis) in &other.genes {
too_much = 0;
for umi in umis {
self.add( *gene_id, *umi );
too_much += 1;
}
match self.total_reads.get_mut( &gene_id ){
Some( val ) => {
let add = other.total_reads.get( &gene_id ).unwrap_or(&too_much) - too_much;
*val += add
let mut too_much = BTreeMap::<usize, usize>::new();

for (gene_umi_combo, counts) in &other.genes {
match self.genes.get_mut( gene_umi_combo ) {
Some(count) => {
*count += counts;
let counter = too_much.entry(gene_umi_combo.0).or_insert(0);
*counter += 1;
},
None => panic!("merge could not copy the total gene values for gene {gene_id}", ),
};
None => {
self.genes.insert( *gene_umi_combo, 1);
}
}
}
for (gene_id, count) in &other.total_reads {
let double_counts = *too_much.entry(*gene_id).or_insert_with(|| 0);
let mine = self.total_reads.entry(*gene_id).or_insert(0);
*mine += count - double_counts;
}
for (hash, (umis, ids) ) in &other.multimapper {
too_much = 0;
/*for (hash, (umis, ids) ) in &other.multimapper {
let double_counts = 0;
for umi in umis {
self.add_multimapper( ids.clone(), *umi );
too_much += 1;
double_counts += 1;
}
match self.total_reads.get_mut( &(*hash as usize) ){
Some( val ) => {
let add = other.total_reads.get( &(*hash as usize) ).unwrap_or(&too_much) - too_much;
let add = other.total_reads.get( &(*hash as usize) ).unwrap_or(&double_counts) - double_counts;
*val += add
},
None => panic!("merge could not copy the total gene values for multimapper hash {hash}", ),
};
}
}*/
}

// returns false if the gene/umi combo has already been recorded!
pub fn add(&mut self, geneid: usize, umi:u64 ) -> bool{
//println!("adding gene id {}", geneid );
return match self.genes.get_mut( &geneid ) {
let gh = GeneUmiHash( geneid, umi);
return match self.genes.get_mut( &gh ) {
Some( gene ) => {
if gene.insert( umi ){
match self.total_reads.get_mut( &geneid ){
Some( count ) => *count += 1,
None => panic!("This must not happen - libraray error"),
}
//eprintln!("Good data");
self.total_umis += 1;
return true
}
//eprintln!("UMI already known {umi}");
*gene +=1;
false
},
None => {
let mut gc:HashSet<u64> = HashSet::new(); //to store the umis
gc.insert( umi );
self.total_reads.insert( geneid, 1 );
self.genes.insert( geneid, gc );
//eprintln!("Good data2");
self.genes.insert(gh, 1);
match self.total_reads.get_mut( &geneid ){
Some( count ) => *count += 1,
None => {
self.total_reads.insert( geneid, 1 );
}
};
self.total_umis += 1;
true
}
}
}

/*
// returns false if the gene/umi combo has already been recorded!
pub fn add_multimapper(&mut self, geneids: Vec::<usize>, umi:u64 ) -> bool{
//println!("adding gene id {}", geneid );
Expand Down Expand Up @@ -187,7 +188,7 @@ impl CellData{
true
}
}
}
}*/

pub fn n_umi( &self, _gene_info:&FastMapper, _gnames: &Vec<String> ) -> usize {
return self.total_umis;
Expand Down Expand Up @@ -222,6 +223,7 @@ impl CellData{
n
}

/*
pub fn fix_multimappers( &mut self ){
for (umis, geneids ) in self.multimapper.values(){
eprintln!("I have a multimapper group: {:?}", geneids);
Expand Down Expand Up @@ -293,23 +295,14 @@ impl CellData{
};
}
self.multimapper.clear();
}
}*/

pub fn n_umi_4_gene( &self, gene_info:&FastMapper, gname:&String) -> usize {
let mut n = 0;

let id = match gene_info.names.get( gname ){
Some(g_id) => g_id,
None => panic!("I could not resolve the gene name {gname}" ),
None => panic!("Programming error: I could not resolve the gene name {gname}" ),
};
n += match self.genes.get( id ){
Some( map ) => {
map.len()
}
None => 0
};
//if n > 0 { println!("I got {} umis for gene {}", n, gname ); }
n
*self.total_reads.get( id ).unwrap_or(&0)
}


Expand Down
49 changes: 49 additions & 0 deletions src/singlecelldata/cell_data/gene_umi_hash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use std::hash::{Hash, Hasher};
use core::fmt;
use std::cmp::{max, Ord};

#[derive(Debug, Copy, Clone)]
pub struct GeneUmiHash( pub usize, pub u64);
impl PartialEq for GeneUmiHash {
fn eq(&self, other: &Self) -> bool {
return (self.0 == other.0) && (self.1 == other.1)
}
}


impl Ord for GeneUmiHash {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// Compare based on the first field (usize) first
match self.0.cmp(&other.0) {
std::cmp::Ordering::Equal => {
// If the first fields are equal, compare based on the second field (u64)
self.1.cmp(&other.1)
}
ordering => ordering,
}
}
}

impl PartialOrd for GeneUmiHash {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}

impl Eq for GeneUmiHash {}

impl Hash for GeneUmiHash {
fn hash<H: Hasher>(&self, state: &mut H) {
self.0.hash(state);
self.1.hash(state);
}
}

// Implementing Display trait for GeneUmiHash
impl fmt::Display for GeneUmiHash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "GeneUmiHash (gene_id: {}, umi: {})", self.0, self.1 )
}
}


8 changes: 8 additions & 0 deletions src/singlecelldata/cell_data/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// singlecelldata/cell_data/mod.rs

pub mod cell_data;
pub mod gene_umi_hash;

pub use cell_data::CellData as CellData;
pub use gene_umi_hash::GeneUmiHash as GeneUmiHash;

Loading

0 comments on commit 270a83f

Please sign in to comment.