Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: report correct md5sum #45

Merged
merged 10 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "pyo3-branchwater"
version = "0.5.1"
version = "0.6.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
177 changes: 75 additions & 102 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// TODO:
// * md5sum output by search and countergather are of modified/downsampled,
// not orig. This is different from sourmash...
/// Rust code for pyo3_branchwater.

use pyo3::prelude::*;

Expand All @@ -17,7 +15,7 @@ use std::collections::BinaryHeap;

use std::cmp::{PartialOrd, Ordering};

use anyhow::{Context, Result, anyhow};
use anyhow::{Result, anyhow};

#[macro_use]
extern crate simple_error;
Expand All @@ -39,6 +37,7 @@ struct SmallSignature {

struct PrefetchResult {
name: String,
md5sum: String,
minhash: KmerMinHash,
overlap: u64,
}
Expand Down Expand Up @@ -99,34 +98,45 @@ fn check_compatible_downsample(
Ok(())
}

/// Given a search Signature containing one or more sketches, and a template
/// Sketch, return a compatible (& now downsampled) Sketch from the search
/// Signature.

/// Given a vec of search Signatures, each containing one or more sketches,
/// and a template Sketch, return a compatible (& now downsampled)
/// Sketch from the search Signatures..
///
/// CTB note: this will return the first acceptable match, I think, ignoring
/// all others.

fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option<KmerMinHash> {
let mut search_mh = None;

// find exact match for template?
if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) {
search_mh = Some(mh.clone());
} else {
// no - try to find one that can be downsampled
if let Sketch::MinHash(template_mh) = template {
for sketch in search_sig.sketches() {
if let Sketch::MinHash(ref_mh) = sketch {
if check_compatible_downsample(&ref_mh, template_mh).is_ok() {
let max_hash = max_hash_for_scaled(template_mh.scaled());
let mh = ref_mh.downsample_max_hash(max_hash).unwrap();
return Some(mh);

fn prepare_query(search_sigs: &[Signature], template: &Sketch) -> Option<SmallSignature> {

for search_sig in search_sigs.iter() {
// find exact match for template?
if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) {
return Some(SmallSignature {
name: search_sig.name(),
md5sum: mh.md5sum(),
minhash: mh.clone()
});
} else {
// no - try to find one that can be downsampled
if let Sketch::MinHash(template_mh) = template {
for sketch in search_sig.sketches() {
if let Sketch::MinHash(ref_mh) = sketch {
if check_compatible_downsample(&ref_mh, template_mh).is_ok() {
let max_hash = max_hash_for_scaled(template_mh.scaled());
let mh = ref_mh.downsample_max_hash(max_hash).unwrap();
return Some(SmallSignature {
name: search_sig.name(),
md5sum: ref_mh.md5sum(), // original
minhash: mh, // downsampled
});
}
}
}
}
}
}
search_mh
None
}

/// Search many queries against a list of signatures.
Expand Down Expand Up @@ -223,48 +233,29 @@ fn manysearch<P: AsRef<Path>>(
info!("Processed {} search sigs", i);
}

let mut search_mh = None;
let mut search_sig = None;
let mut results = vec![];

// load search signature from path:
let search_sigs = Signature::from_path(filename);
if search_sigs.is_ok() {
let search_sigs = search_sigs.unwrap();

for ss in search_sigs.iter() {
if let Some(mh) = prepare_query(ss, &template) {
search_sig = Some(ss);
search_mh = Some(mh);
break;
}
}
// make sure it is compatible etc.

if !search_mh.is_some() {
eprintln!("WARNING: no compatible sketches in path '{}'",
filename.display());
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}

if search_mh.is_some() {
let search_mh = search_mh.unwrap();

if let Ok(search_sigs) = Signature::from_path(filename) {
if let Some(search_sm) = prepare_query(&search_sigs, &template) {
// search for matches & save containment.
for q in queries.iter() {
let overlap = q.minhash.count_common(&search_mh, false).unwrap() as f64;
let overlap = q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64;
let size = q.minhash.size() as f64;

let containment = overlap / size;
if containment > threshold {
let search_sig = search_sig.unwrap();
results.push((q.name.clone(),
q.minhash.md5sum(),
search_sig.name(),
search_mh.md5sum(),
q.md5sum.clone(),
search_sm.name.clone(),
search_sm.md5sum.clone(),
overlap))
}
}
} else {
eprintln!("WARNING: no compatible sketches in path '{}'",
filename.display());
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
} else {
let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
Expand Down Expand Up @@ -350,11 +341,11 @@ fn write_prefetch<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display + Clone>(
None => Box::new(std::io::stdout()),
};
let mut writer = BufWriter::new(prefetch_out);
writeln!(&mut writer, "query_file,match,match_md5sum,overlap").ok();
writeln!(&mut writer, "query_file,match,match_md5,overlap").ok();

for m in matchlist.iter() {
writeln!(&mut writer, "{},\"{}\",{},{}", query_label,
m.name, m.minhash.md5sum(), m.overlap).ok();
m.name, m.md5sum, m.overlap).ok();
}

Ok(())
Expand Down Expand Up @@ -403,18 +394,10 @@ fn load_sketches(sketchlist_paths: Vec<PathBuf>, template: &Sketch) ->
let filename = m.display();

if let Ok(sigs) = Signature::from_path(m) {
for sig in &sigs {
if let Some(mh) = prepare_query(sig, template) {
sm = Some(SmallSignature {
name: sig.name(),
md5sum: mh.md5sum(),
minhash: mh,
});
} else {
// track number of paths that have no matching sigs
// @CTB: print error?
let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
sm = prepare_query(&sigs, template);
if sm.is_none() {
// track number of paths that have no matching sigs
let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
} else {
// failed to load from this path - print error & track.
Expand Down Expand Up @@ -453,24 +436,24 @@ fn load_sketches_above_threshold(
match sigs {
Ok(sigs) => {
let mut mm = None;
for sig in &sigs {
if let Some(mh) = prepare_query(sig, template) {
if let Ok(overlap) = mh.count_common(query, false) {
if overlap >= threshold_hashes {
let result = PrefetchResult {
name: sig.name(),
minhash: mh,
overlap,
};
mm = Some(result);
break;
}

if let Some(sm) = prepare_query(&sigs, template) {
let mh = sm.minhash;
if let Ok(overlap) = mh.count_common(query, false) {
if overlap >= threshold_hashes {
let result = PrefetchResult {
name: sm.name,
md5sum: sm.md5sum,
minhash: mh,
overlap,
};
mm = Some(result);
}
} else {
eprintln!("WARNING: no compatible sketches in path '{}'",
m.display());
let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
} else {
eprintln!("WARNING: no compatible sketches in path '{}'",
m.display());
let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
mm
}
Expand Down Expand Up @@ -507,7 +490,7 @@ fn consume_query_by_gather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display
None => Box::new(std::io::stdout()),
};
let mut writer = BufWriter::new(gather_out);
writeln!(&mut writer, "query_file,rank,match,match_md5sum,overlap").ok();
writeln!(&mut writer, "query_file,rank,match,match_md5,overlap").ok();

let mut matching_sketches = matchlist;
let mut rank = 0;
Expand All @@ -521,7 +504,7 @@ fn consume_query_by_gather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display
query.remove_from(&best_element.minhash)?;

writeln!(&mut writer, "{},{},\"{}\",{},{}", query_label, rank,
best_element.name, best_element.minhash.md5sum(),
best_element.name, best_element.md5sum,
best_element.overlap).ok();

// recalculate remaining overlaps between query and all sketches.
Expand Down Expand Up @@ -555,16 +538,9 @@ fn countergather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display + Clone>(
let query_label = query_filename.to_string();
eprintln!("Loading query from '{}'", query_label);
let query = {
let mut mm = None;
let sigs = Signature::from_path(query_filename)?;

for sig in &sigs {
if let Some(mh) = prepare_query(sig, &template) {
mm = Some(mh.clone());
break;
}
};
mm
prepare_query(&sigs, &template)
};

// did we find anything matching the desired template?
Expand Down Expand Up @@ -596,7 +572,7 @@ fn countergather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display + Clone>(
// load a set of sketches, filtering for those with overlaps > threshold
let result = load_sketches_above_threshold(matchlist_paths,
&template,
&query,
&query.minhash,
threshold_hashes)?;
let matchlist = result.0;
let skipped_paths = result.1;
Expand All @@ -619,7 +595,7 @@ fn countergather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display + Clone>(
write_prefetch(query_label.clone(), prefetch_output, &matchlist).ok();

// run the gather!
consume_query_by_gather(query, matchlist, threshold_hashes,
consume_query_by_gather(query.minhash, matchlist, threshold_hashes,
gather_output, query_label).ok();
Ok(())
}
Expand Down Expand Up @@ -698,12 +674,8 @@ fn multigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
// load query from q
let mut mm = None;
if let Ok(sigs) = Signature::from_path(dbg!(q)) {
for sig in &sigs {
if let Some(mh) = prepare_query(sig, &template) {
mm = Some(mh);
break;
}
}
mm = prepare_query(&sigs, &template);

if mm.is_none() {
eprintln!("WARNING: no compatible sketches in path '{}'",
q.display());
Expand All @@ -723,10 +695,11 @@ fn multigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
.filter_map(|sm| {
let mut mm = None;

if let Ok(overlap) = sm.minhash.count_common(&query, false) {
if let Ok(overlap) = sm.minhash.count_common(&query.minhash, false) {
if overlap >= threshold_hashes {
let result = PrefetchResult {
name: sm.name.clone(),
md5sum: sm.md5sum.clone(),
minhash: sm.minhash.clone(),
overlap,
};
Expand All @@ -746,7 +719,7 @@ fn multigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
&matchlist).ok();

// now, do the gather!
consume_query_by_gather(query, matchlist, threshold_hashes,
consume_query_by_gather(query.minhash, matchlist, threshold_hashes,
Some(gather_output), query_label).ok();
} else {
println!("No matches to '{}'", query_label);
Expand Down
Loading