From 8c6b58c44ea126a38df2c7012ed4bb492e5ccdf5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Jun 2024 13:05:04 -0700 Subject: [PATCH] MRG: fix RocksDB-based gather & other rust-based infelicities revealed by plugins (#3193) This PR fixes a bug in `disk_revindex.rs::RevIndexOps::gather` where RocksDB-based `gather` was incorrectly subtracting hashes multiple times from the counter in situations of high redundancy. For example, consider this Venn diagram of the 3-way intersection between three sketches: ![image](https://github.com/sourmash-bio/sourmash/assets/51016/f98bf6a0-acc4-415f-bf39-1c48a599996a) When a metagenome contains the union of all three of these sketches, the broken implementation would subtract the `10` at the center multiple times. This was caused by removing hashes from the matches, rather than the intersection, each pass through the counter. Of note, this made RocksDB-based `fastmultigather` return incorrect results, ref https://github.com/sourmash-bio/sourmash_plugin_branchwater/issues/322; first discovered in https://github.com/sourmash-bio/sourmash/pull/3138#issuecomment-2104889751. The PR fixes this, and adds a more complete pair of tests, based on `test_gather_metagenome_num_results` in the Python tests for sourmash. This PR also adjusts the hash function string for DNA sketches in Rust to be uppercase `DNA` rather than lowercase `dna`, ref https://github.com/sourmash-bio/sourmash_plugin_directsketch/issues/49 And remember, it's not *just* the destination - it's the friends you make along the way, like `env_logger`. * Fixes https://github.com/sourmash-bio/sourmash/issues/3139 * Fixes https://github.com/sourmash-bio/sourmash_plugin_directsketch/issues/49 For consideration: Right now we are calculating the intersection twice, once in `disk_revindex.rs` and once in `calculate_gather_stats` in `revindex/mod.rs`. This is unnecessary, of course. But the function signature for `calculate_gather_stats` would need to change to take the intersection as an argument. We could: * keep calculating it twice, just for simplicity; * change `calculate_gather_stats` to take an _optional_ intersection, and calculate it if not provided; * change `calculate_gather_stats` to require an intersection. TODO: - [x] add at least one explicit test for the moltype fix Other notes: * there is a discrepancy between the Python (`sourmash gather`) and Rust (`sourmash_plugin_branchwater` results) calculations for `remaining_bp`. It seems to me like the Python one is definitely wrong; not yet sure about Rust. Viz https://github.com/sourmash-bio/sourmash/issues/3194. --- Cargo.lock | 6 +- src/core/Cargo.toml | 2 +- src/core/src/encodings.rs | 2 +- src/core/src/index/mod.rs | 14 +- src/core/src/index/revindex/disk_revindex.rs | 19 ++- src/core/src/index/revindex/mod.rs | 152 +++++++++++++++++-- src/core/src/manifest.rs | 31 ++++ tests/test_sourmash_sketch.py | 2 +- 8 files changed, 200 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbbdc8865..cb1913615 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.0" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "approx" @@ -1615,7 +1615,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.13.1" +version = "0.14.0" dependencies = [ "az", "byteorder", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 91b76d625..e6e58331e 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.13.1" +version = "0.14.0" authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index f8934596d..733831e49 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -58,7 +58,7 @@ impl std::fmt::Display for HashFunctions { f, "{}", match self { - HashFunctions::Murmur64Dna => "dna", + HashFunctions::Murmur64Dna => "DNA", HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 4363c5ebe..3ce265280 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -14,6 +14,7 @@ pub mod search; use std::path::Path; use getset::{CopyGetters, Getters, Setters}; +use log::trace; use serde::{Deserialize, Serialize}; use stats::{median, stddev}; use typed_builder::TypedBuilder; @@ -220,8 +221,15 @@ pub fn calculate_gather_stats( ) -> Result { // get match_mh let match_mh = match_sig.minhash().unwrap(); + + // calculate intersection + let isect = match_mh.intersection(&query)?; + let isect_size = isect.0.len(); + trace!("isect_size: {}", isect_size); + trace!("query.size: {}", query.size()); + //bp remaining in subtracted query - let remaining_bp = (query.size() - match_size) * query.scaled() as usize; + let remaining_bp = (query.size() - isect_size) * query.scaled() as usize; // stats for this match vs original query let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap(); @@ -231,8 +239,8 @@ pub fn calculate_gather_stats( // stats for this match vs current (subtracted) query let f_match = match_size as f64 / match_mh.size() as f64; - let unique_intersect_bp = match_mh.scaled() as usize * match_size; - let f_unique_to_query = match_size as f64 / orig_query.size() as f64; + let unique_intersect_bp = match_mh.scaled() as usize * isect_size; + let f_unique_to_query = isect_size as f64 / orig_query.size() as f64; // // get ANI values let ksize = match_mh.ksize() as f64; diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index d8287d1bf..d3603acb5 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -346,6 +346,14 @@ impl RevIndexOps for RevIndex { // just calculate essentials here let gather_result_rank = matches.len(); + let query_mh = KmerMinHash::from(query.clone()); + + // grab the specific intersection: + let isect = match_mh.intersection(&query_mh)?; + let mut isect_mh = match_mh.clone(); + isect_mh.clear(); + isect_mh.add_many(&isect.0)?; + // Calculate stats let gather_result = calculate_gather_stats( &orig_query_ds, @@ -371,8 +379,9 @@ impl RevIndexOps for RevIndex { // TODO: Use HashesToColors here instead. If not initialized, // build it. - match_mh - .iter_mins() + isect + .0 + .iter() .filter_map(|hash| hash_to_color.get(hash)) .flat_map(|color| { // TODO: remove this clone @@ -381,11 +390,7 @@ impl RevIndexOps for RevIndex { .for_each(|dataset| { // TODO: collect the flat_map into a Counter, and remove more // than one at a time... - counter.entry(dataset).and_modify(|e| { - if *e > 0 { - *e -= 1 - } - }); + counter.entry(dataset).and_modify(|e| { *e -= 1 }); }); counter.remove(&dataset_id); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 0e3e443bd..fc0389638 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -644,14 +644,13 @@ mod test { counter, query_colors, hash_to_color, - 0, + 5, // 50kb threshold &query, Some(selection), )?; - // should be 11, based on test_gather_metagenome_num_results @CTB. - // see sourmash#3139 and sourmash_plugin_branchwater#322. - assert_eq!(matches.len(), 6); + // should be 11, based on test_gather_metagenome_num_results + assert_eq!(matches.len(), 11); fn round5(a: f64) -> f64 { (a * 1e5).round() / 1e5 @@ -691,18 +690,147 @@ mod test { dbg!(match_); let names: Vec<&str> = match_.name().split(' ').take(1).collect(); assert_eq!(names[0], "NC_009486.1"); + assert_eq!(round5(match_.f_match()), round5(0.4842105)); + assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557)); + + let match_ = &matches[6]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_006905.1"); + assert_eq!(round5(match_.f_match()), round5(0.161016949152542)); + assert_eq!( + round5(match_.f_unique_to_query()), + round5(0.0518417462482947) + ); + + let match_ = &matches[7]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_011080.1"); + assert_eq!(round5(match_.f_match()), round5(0.125799573560768)); + assert_eq!( + round5(match_.f_unique_to_query()), + round5(0.04024556616643930) + ); + + let match_ = &matches[8]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_011274.1"); + assert_eq!(round5(match_.f_match()), round5(0.0919037199124727)); + assert_eq!( + round5(match_.f_unique_to_query()), + round5(0.0286493860845839) + ); + + let match_ = &matches[9]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_006511.1"); + assert_eq!(round5(match_.f_match()), round5(0.0725995316159251)); + assert_eq!( + round5(match_.f_unique_to_query()), + round5(0.021145975443383400) + ); + + let match_ = &matches[10]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_011294.1"); + assert_eq!(round5(match_.f_match()), round5(0.0148619957537155)); + assert_eq!( + round5(match_.f_unique_to_query()), + round5(0.0047748976807639800) + ); + + Ok(()) + } + + #[test] + // a more detailed/focused version of revindex_load_and_gather_2, + // added in sourmash#3193 for debugging purposes. + fn revindex_load_and_gather_3() -> Result<()> { + let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + basedir.push("../../tests/test-data/gather/"); + + let against = vec![ + "GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig", + "GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig", + "GCF_000008545.1_ASM854v1_genomic.fna.gz.sig", + ]; + let against: Vec<_> = against + .iter() + .map(|sig| { + let mut filename = basedir.clone(); + filename.push(sig); + filename + }) + .collect(); + + // build 'against' sketches into a revindex + let selection = Selection::builder().ksize(21).scaled(10000).build(); + let output = TempDir::new()?; - // @CTB this fails: 0.43158 != 0.48421 - // assert_eq!(round5(match_.f_match()), round5(0.4842105)); + let collection = Collection::from_paths(&against)?.select(&selection)?; + let _index = RevIndex::create(output.path(), collection.try_into()?, false); + + let index = RevIndex::open(output.path(), true, None)?; + + let mut query = None; + let mut query_filename = basedir.clone(); + query_filename.push("combined.sig"); + let query_sig = Signature::from_path(query_filename)? + .swap_remove(0) + .select(&selection)?; + + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); + } + let query = query.unwrap(); + + let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query); + + let matches = index.gather( + counter, + query_colors, + hash_to_color, + 0, + &query, + Some(selection), + )?; - // @CTB this fails: 0.05593 != 0.06276 - // assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557)); + // should be 3. + // see sourmash#3193. + assert_eq!(matches.len(), 3); - // @CTB fails - // assert_eq!(match_.unique_intersect_bp, 820000); + fn round5(a: f64) -> f64 { + (a * 1e5).round() / 1e5 + } - // @CTB fails - // assert_eq!(match_.remaining_bp, 2170000); + let match_ = &matches[0]; + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_000853.1"); + assert_eq!(match_.f_match(), 1.0); + assert_eq!(round5(match_.f_unique_to_query()), round5(0.13096862)); + assert_eq!(match_.unique_intersect_bp, 1920000); + assert_eq!(match_.remaining_bp, 12740000); + + let match_ = &matches[1]; + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_011978.1"); + assert_eq!(match_.f_match(), 0.898936170212766); + assert_eq!(round5(match_.f_unique_to_query()), round5(0.115279)); + assert_eq!(match_.unique_intersect_bp, 1690000); + assert_eq!(match_.remaining_bp, 11050000); + + let match_ = &matches[2]; + dbg!(match_); + let names: Vec<&str> = match_.name().split(' ').take(1).collect(); + assert_eq!(names[0], "NC_009486.1"); + assert_eq!(round5(match_.f_match()), round5(0.4842105)); + assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557)); + assert_eq!(match_.unique_intersect_bp, 920000); + assert_eq!(match_.remaining_bp, 10130000); Ok(()) } diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index 565bd230d..b5325189b 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -448,6 +448,37 @@ mod test { } } + #[test] + fn manifest_to_writer_moltype_dna() { + let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + + let test_sigs = vec![PathBuf::from("../../tests/test-data/47.fa.sig")]; + + let full_paths: Vec = test_sigs + .into_iter() + .map(|sig| base_path.join(sig)) + .collect(); + + let manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice + + let temp_dir = TempDir::new().unwrap(); + let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf()) + .expect("Path should be valid UTF-8"); + + let filename = utf8_output.join("sigs.manifest.csv"); + let mut wtr = File::create(&filename).expect("Failed to create file"); + + manifest.to_writer(&mut wtr).unwrap(); + + // check that we can reopen the file as a manifest + properly check abund + let infile = File::open(&filename).expect("Failed to open file"); + let m2 = Manifest::from_reader(&infile).unwrap(); + for record in m2.iter() { + eprintln!("{:?} {}", record.name(), record.moltype()); + assert_eq!(record.moltype().to_string(), "DNA"); + } + } + #[test] fn manifest_selection() { let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 550b24a94..6537383b7 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -353,7 +353,7 @@ def test_protein_override_bad_rust_foo(): with pytest.raises(ValueError) as exc: sig.add_protein(record.sequence) - assert 'Invalid hash function: "dna"' in str(exc) + assert 'Invalid hash function: "DNA"' in str(exc) def test_dayhoff_defaults():