Skip to content

Commit

Permalink
MRG: fix RocksDB-based gather & other rust-based infelicities reveale…
Browse files Browse the repository at this point in the history
…d by plugins (#3193)

This PR fixes a bug in `disk_revindex.rs::RevIndexOps::gather` where
RocksDB-based `gather` was
incorrectly subtracting hashes multiple times from the counter in
situations of high redundancy.

For example, consider this Venn diagram of the 3-way intersection
between three sketches:


![image](https://github.com/sourmash-bio/sourmash/assets/51016/f98bf6a0-acc4-415f-bf39-1c48a599996a)

When a metagenome contains the union of all three of these sketches, the
broken implementation would subtract the `10` at the center multiple
times. This was caused by removing hashes from the matches, rather than
the intersection, each pass through the counter.

Of note, this made RocksDB-based `fastmultigather` return incorrect
results, ref
sourmash-bio/sourmash_plugin_branchwater#322;
first discovered in
#3138 (comment).

The PR fixes this, and adds a more complete pair of tests, based on
`test_gather_metagenome_num_results` in the Python tests for sourmash.

This PR also adjusts the hash function string for DNA sketches in Rust
to be uppercase `DNA` rather than lowercase `dna`, ref
sourmash-bio/sourmash_plugin_directsketch#49

And remember, it's not *just* the destination - it's the friends you
make along the way, like `env_logger`.

* Fixes #3139
* Fixes
sourmash-bio/sourmash_plugin_directsketch#49

For consideration:

Right now we are calculating the intersection twice, once in
`disk_revindex.rs` and once in `calculate_gather_stats` in
`revindex/mod.rs`. This is unnecessary, of course. But the function
signature for `calculate_gather_stats` would need to change to take the
intersection as an argument. We could:
* keep calculating it twice, just for simplicity;
* change `calculate_gather_stats` to take an _optional_ intersection,
and calculate it if not provided;
* change `calculate_gather_stats` to require an intersection.

TODO:
- [x] add at least one explicit test for the moltype fix

Other notes:

* there is a discrepancy between the Python (`sourmash gather`) and Rust
(`sourmash_plugin_branchwater` results) calculations for `remaining_bp`.
It seems to me like the Python one is definitely wrong; not yet sure
about Rust. Viz #3194.
  • Loading branch information
ctb committed Jun 9, 2024
1 parent a133e68 commit 8c6b58c
Show file tree
Hide file tree
Showing 8 changed files with 200 additions and 28 deletions.
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.13.1"
version = "0.14.0"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ impl std::fmt::Display for HashFunctions {
f,
"{}",
match self {
HashFunctions::Murmur64Dna => "dna",
HashFunctions::Murmur64Dna => "DNA",
HashFunctions::Murmur64Protein => "protein",
HashFunctions::Murmur64Dayhoff => "dayhoff",
HashFunctions::Murmur64Hp => "hp",
Expand Down
14 changes: 11 additions & 3 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub mod search;
use std::path::Path;

use getset::{CopyGetters, Getters, Setters};
use log::trace;
use serde::{Deserialize, Serialize};
use stats::{median, stddev};
use typed_builder::TypedBuilder;
Expand Down Expand Up @@ -220,8 +221,15 @@ pub fn calculate_gather_stats(
) -> Result<GatherResult> {
// get match_mh
let match_mh = match_sig.minhash().unwrap();

// calculate intersection
let isect = match_mh.intersection(&query)?;
let isect_size = isect.0.len();
trace!("isect_size: {}", isect_size);
trace!("query.size: {}", query.size());

//bp remaining in subtracted query
let remaining_bp = (query.size() - match_size) * query.scaled() as usize;
let remaining_bp = (query.size() - isect_size) * query.scaled() as usize;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
Expand All @@ -231,8 +239,8 @@ pub fn calculate_gather_stats(

// stats for this match vs current (subtracted) query
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let f_unique_to_query = match_size as f64 / orig_query.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * isect_size;
let f_unique_to_query = isect_size as f64 / orig_query.size() as f64;

// // get ANI values
let ksize = match_mh.ksize() as f64;
Expand Down
19 changes: 12 additions & 7 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,14 @@ impl RevIndexOps for RevIndex {
// just calculate essentials here
let gather_result_rank = matches.len();

let query_mh = KmerMinHash::from(query.clone());

// grab the specific intersection:
let isect = match_mh.intersection(&query_mh)?;
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// Calculate stats
let gather_result = calculate_gather_stats(
&orig_query_ds,
Expand All @@ -371,8 +379,9 @@ impl RevIndexOps for RevIndex {

// TODO: Use HashesToColors here instead. If not initialized,
// build it.
match_mh
.iter_mins()
isect
.0
.iter()
.filter_map(|hash| hash_to_color.get(hash))
.flat_map(|color| {
// TODO: remove this clone
Expand All @@ -381,11 +390,7 @@ impl RevIndexOps for RevIndex {
.for_each(|dataset| {
// TODO: collect the flat_map into a Counter, and remove more
// than one at a time...
counter.entry(dataset).and_modify(|e| {
if *e > 0 {
*e -= 1
}
});
counter.entry(dataset).and_modify(|e| { *e -= 1 });
});

counter.remove(&dataset_id);
Expand Down
152 changes: 140 additions & 12 deletions src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,14 +644,13 @@ mod test {
counter,
query_colors,
hash_to_color,
0,
5, // 50kb threshold
&query,
Some(selection),
)?;

// should be 11, based on test_gather_metagenome_num_results @CTB.
// see sourmash#3139 and sourmash_plugin_branchwater#322.
assert_eq!(matches.len(), 6);
// should be 11, based on test_gather_metagenome_num_results
assert_eq!(matches.len(), 11);

fn round5(a: f64) -> f64 {
(a * 1e5).round() / 1e5
Expand Down Expand Up @@ -691,18 +690,147 @@ mod test {
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_009486.1");
assert_eq!(round5(match_.f_match()), round5(0.4842105));
assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));

let match_ = &matches[6];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_006905.1");
assert_eq!(round5(match_.f_match()), round5(0.161016949152542));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0518417462482947)
);

let match_ = &matches[7];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011080.1");
assert_eq!(round5(match_.f_match()), round5(0.125799573560768));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.04024556616643930)
);

let match_ = &matches[8];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011274.1");
assert_eq!(round5(match_.f_match()), round5(0.0919037199124727));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0286493860845839)
);

let match_ = &matches[9];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_006511.1");
assert_eq!(round5(match_.f_match()), round5(0.0725995316159251));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.021145975443383400)
);

let match_ = &matches[10];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011294.1");
assert_eq!(round5(match_.f_match()), round5(0.0148619957537155));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0047748976807639800)
);

Ok(())
}

#[test]
// a more detailed/focused version of revindex_load_and_gather_2,
// added in sourmash#3193 for debugging purposes.
fn revindex_load_and_gather_3() -> Result<()> {
let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
basedir.push("../../tests/test-data/gather/");

let against = vec![
"GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig",
"GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig",
"GCF_000008545.1_ASM854v1_genomic.fna.gz.sig",
];
let against: Vec<_> = against
.iter()
.map(|sig| {
let mut filename = basedir.clone();
filename.push(sig);
filename
})
.collect();

// build 'against' sketches into a revindex
let selection = Selection::builder().ksize(21).scaled(10000).build();
let output = TempDir::new()?;

// @CTB this fails: 0.43158 != 0.48421
// assert_eq!(round5(match_.f_match()), round5(0.4842105));
let collection = Collection::from_paths(&against)?.select(&selection)?;
let _index = RevIndex::create(output.path(), collection.try_into()?, false);

let index = RevIndex::open(output.path(), true, None)?;

let mut query = None;
let mut query_filename = basedir.clone();
query_filename.push("combined.sig");
let query_sig = Signature::from_path(query_filename)?
.swap_remove(0)
.select(&selection)?;

if let Some(q) = prepare_query(query_sig, &selection) {
query = Some(q);
}
let query = query.unwrap();

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

let matches = index.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection),
)?;

// @CTB this fails: 0.05593 != 0.06276
// assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));
// should be 3.
// see sourmash#3193.
assert_eq!(matches.len(), 3);

// @CTB fails
// assert_eq!(match_.unique_intersect_bp, 820000);
fn round5(a: f64) -> f64 {
(a * 1e5).round() / 1e5
}

// @CTB fails
// assert_eq!(match_.remaining_bp, 2170000);
let match_ = &matches[0];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_000853.1");
assert_eq!(match_.f_match(), 1.0);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.13096862));
assert_eq!(match_.unique_intersect_bp, 1920000);
assert_eq!(match_.remaining_bp, 12740000);

let match_ = &matches[1];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011978.1");
assert_eq!(match_.f_match(), 0.898936170212766);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.115279));
assert_eq!(match_.unique_intersect_bp, 1690000);
assert_eq!(match_.remaining_bp, 11050000);

let match_ = &matches[2];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_009486.1");
assert_eq!(round5(match_.f_match()), round5(0.4842105));
assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));
assert_eq!(match_.unique_intersect_bp, 920000);
assert_eq!(match_.remaining_bp, 10130000);

Ok(())
}
Expand Down
31 changes: 31 additions & 0 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,37 @@ mod test {
}
}

#[test]
fn manifest_to_writer_moltype_dna() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

let test_sigs = vec![PathBuf::from("../../tests/test-data/47.fa.sig")];

let full_paths: Vec<PathBuf> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

let manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice

let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");

let filename = utf8_output.join("sigs.manifest.csv");
let mut wtr = File::create(&filename).expect("Failed to create file");

manifest.to_writer(&mut wtr).unwrap();

// check that we can reopen the file as a manifest + properly check abund
let infile = File::open(&filename).expect("Failed to open file");
let m2 = Manifest::from_reader(&infile).unwrap();
for record in m2.iter() {
eprintln!("{:?} {}", record.name(), record.moltype());
assert_eq!(record.moltype().to_string(), "DNA");
}
}

#[test]
fn manifest_selection() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
Expand Down
2 changes: 1 addition & 1 deletion tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def test_protein_override_bad_rust_foo():
with pytest.raises(ValueError) as exc:
sig.add_protein(record.sequence)

assert 'Invalid hash function: "dna"' in str(exc)
assert 'Invalid hash function: "DNA"' in str(exc)


def test_dayhoff_defaults():
Expand Down

0 comments on commit 8c6b58c

Please sign in to comment.