Skip to content

Commit

Permalink
[MRG] speed up SeqToHashes translate (#1946)
Browse files Browse the repository at this point in the history
* lightning fast translate ⚡

* rust clippy

* in-line comments for translate
  • Loading branch information
mr-eyes authored Apr 13, 2022
1 parent 01119a2 commit 3fcf2db
Showing 1 changed file with 33 additions and 20 deletions.
53 changes: 33 additions & 20 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
//!
//! A signature is a collection of sketches for a genomic dataset.

use std::collections::VecDeque;
use std::fs::File;
use std::io;
use std::iter::Iterator;
Expand Down Expand Up @@ -170,7 +169,7 @@ pub struct SeqToHashes {
is_protein: bool,
hash_function: HashFunctions,
seed: u64,
hashes_buffer: VecDeque<u64>,
hashes_buffer: Vec<u64>,

dna_configured: bool,
dna_rc: Vec<u8>,
Expand All @@ -180,6 +179,7 @@ pub struct SeqToHashes {

prot_configured: bool,
aa_seq: Vec<u8>,
translate_iter_step: usize,
}

impl SeqToHashes {
Expand Down Expand Up @@ -215,25 +215,34 @@ impl SeqToHashes {
is_protein,
hash_function,
seed,
hashes_buffer: VecDeque::with_capacity(1000),
hashes_buffer: Vec::with_capacity(1000),
dna_configured: false,
dna_rc: Vec::with_capacity(1000),
dna_ksize: 0,
dna_len: 0,
dna_last_position_check: 0,
prot_configured: false,
aa_seq: Vec::new(),
translate_iter_step: 0,
}
}
}

/*
Iterator that returns a kmer hash for all modes except translate.
In translate mode:
- all the frames are processed at once and converted to hashes.
- all the hashes are stored in `hashes_buffer`
- after processing all the kmers, `translate_iter_step` is incremented
per iteration to iterate over all the indices of the `hashes_buffer`.
- the iterator will die once `translate_iter_step` == length(hashes_buffer)
More info https://github.com/sourmash-bio/sourmash/pull/1946
*/

impl Iterator for SeqToHashes {
type Item = Result<u64, Error>;

fn next(&mut self) -> Option<Self::Item> {
// TODO: Remove the hashes buffer
// Priority for flushing the hashes buffer

if (self.kmer_index < self.max_index) || !self.hashes_buffer.is_empty() {
// Processing DNA or Translated DNA
if !self.is_protein {
Expand Down Expand Up @@ -291,18 +300,17 @@ impl Iterator for SeqToHashes {
let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed);
self.kmer_index += 1;
Some(Ok(hash))
} else if self.hashes_buffer.is_empty() {
} else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 {
// Processing protein by translating DNA
// TODO: make it a real iterator not a buffer
// TODO: Implement iterator over frames instead of hashes_buffer.

// Three frames
for i in 0..3 {
for frame_number in 0..3 {
let substr: Vec<u8> = self
.sequence
.iter()
.cloned()
.skip(i)
.take(self.sequence.len() - i)
.skip(frame_number)
.take(self.sequence.len() - frame_number)
.collect();

let aa = to_aa(
Expand All @@ -314,15 +322,15 @@ impl Iterator for SeqToHashes {

aa.windows(self.k_size as usize).for_each(|n| {
let hash = crate::_hash_murmur(n, self.seed);
self.hashes_buffer.push_back(hash);
self.hashes_buffer.push(hash);
});

let rc_substr: Vec<u8> = self
.dna_rc
.iter()
.cloned()
.skip(i)
.take(self.dna_rc.len() - i)
.skip(frame_number)
.take(self.dna_rc.len() - frame_number)
.collect();
let aa_rc = to_aa(
&rc_substr,
Expand All @@ -333,14 +341,19 @@ impl Iterator for SeqToHashes {

aa_rc.windows(self.k_size as usize).for_each(|n| {
let hash = crate::_hash_murmur(n, self.seed);
self.hashes_buffer.push_back(hash);
self.hashes_buffer.push(hash);
});
}
self.kmer_index = self.max_index;
Some(Ok(self.hashes_buffer.remove(0).unwrap()))
Some(Ok(0))
} else {
let first_element: u64 = self.hashes_buffer.pop_front().unwrap();
Some(Ok(first_element))
if self.translate_iter_step == self.hashes_buffer.len() {
self.hashes_buffer.clear();
self.kmer_index = self.max_index;
return Some(Ok(0));
}
let curr_idx = self.translate_iter_step;
self.translate_iter_step += 1;
Some(Ok(self.hashes_buffer[curr_idx]))
}
} else {
// Processing protein
Expand Down

0 comments on commit 3fcf2db

Please sign in to comment.