From ba581ea8c6f046b7e19da274155f326d2bdbc8e7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 27 Aug 2023 15:01:49 +0000 Subject: [PATCH] Remove BIGSI and SBT code (#2732) (spun off #2230, might help with #2665) BIGSI and SBT are prototype-level code (and some of the first Rust code I wrote...), and mostly make it harder to change/refactor other parts of the codebase. Can bring it back later in the future if needed, but `mastiff` covers many of the same use cases. --- src/core/Cargo.toml | 4 - src/core/benches/index.rs | 83 ---- src/core/src/index/bigsi.rs | 218 -------- src/core/src/index/mod.rs | 36 +- src/core/src/index/sbt/mhbt.rs | 361 -------------- src/core/src/index/sbt/mhmt.rs | 227 --------- src/core/src/index/sbt/mod.rs | 878 --------------------------------- 7 files changed, 20 insertions(+), 1787 deletions(-) delete mode 100644 src/core/benches/index.rs delete mode 100644 src/core/src/index/bigsi.rs delete mode 100644 src/core/src/index/sbt/mhbt.rs delete mode 100644 src/core/src/index/sbt/mhmt.rs delete mode 100644 src/core/src/index/sbt/mod.rs diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 61f0556bcf..cbc897b28b 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -60,10 +60,6 @@ proptest = { version = "1.2.0", default-features = false, features = ["std"]} rand = "0.8.2" tempfile = "3.7.1" -[[bench]] -name = "index" -harness = false - [[bench]] name = "compute" harness = false diff --git a/src/core/benches/index.rs b/src/core/benches/index.rs deleted file mode 100644 index d3d4b54118..0000000000 --- a/src/core/benches/index.rs +++ /dev/null @@ -1,83 +0,0 @@ -#[macro_use] -extern crate criterion; - -use std::path::PathBuf; - -use criterion::{Bencher, Criterion, Fun}; -use sourmash::index::bigsi::BIGSI; -use sourmash::index::linear::LinearIndex; -use sourmash::index::Index; -use sourmash::index::MHBT; -use sourmash::signature::Signature; - -fn find_small_bench(c: &mut Criterion) { - let mut filename = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_small", functions, leaf); -} - -fn find_subset_bench(c: &mut Criterion) { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/subset.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = 
vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_subset", functions, leaf); -} - -criterion_group!(benches, find_small_bench, find_subset_bench); -criterion_main!(benches); diff --git a/src/core/src/index/bigsi.rs b/src/core/src/index/bigsi.rs deleted file mode 100644 index 0e45348fc7..0000000000 --- a/src/core/src/index/bigsi.rs +++ /dev/null @@ -1,218 +0,0 @@ -use std::collections::HashMap; -use std::path::Path; - -use fixedbitset::FixedBitSet; -use thiserror::Error; -use typed_builder::TypedBuilder; - -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::Error; -use crate::HashIntoType; - -#[derive(Clone, TypedBuilder)] -pub struct BIGSI { - matrix: Vec, - ksize: usize, - datasets: Vec, - //#[builder(setter(skip))] - //storage: Rc, -} - -#[derive(Debug, Error)] -pub enum BIGSIError { - #[error("BIGSI doesn't support this method")] - MethodDisabled, -} - -impl BIGSI { - pub fn new(bf_size: usize, ksize: usize) -> BIGSI { - let mut matrix = Vec::with_capacity(bf_size); - for _ in 0..bf_size { - // TODO: figure initial capacity for each row - matrix.push(FixedBitSet::with_capacity(100)); - } - - BIGSI { - matrix, - ksize, - datasets: Vec::new(), - } - } -} - -impl BIGSI { - pub fn add(&mut self, dataset: Signature) { - let mut ng = Nodegraph::new(&[self.matrix.len()], self.ksize); - - // TODO: select correct minhash - if let Sketch::MinHash(mh) = &dataset.signatures[0] { - for h in mh.mins() { - ng.count(h); - } - } else { - // TODO: what if it is not a mh? 
- unimplemented!() - } - - self.datasets.push(dataset); - let col = self.datasets.len() - 1; - - let bs = ng.into_bitsets(); - for pos in bs[0].ones() { - let bs = &mut self.matrix[pos]; - if bs.len() == col { - bs.grow(col + col / 2); - } - bs.insert(col); - } - } - - pub fn query(&self, hash: HashIntoType) -> impl Iterator + '_ { - let pos = hash as usize % self.matrix.len(); - let bs = &self.matrix[pos]; - bs.ones() - } - - pub fn query_datasets(&self, hash: HashIntoType) -> impl Iterator + '_ { - self.query(hash).map(move |pos| self.datasets[pos].clone()) - } -} - -impl<'a> Index<'a> for BIGSI { - type Item = Signature; - //type SignatureIterator = std::slice::Iter<'a, Self::Item>; - - fn search( - &self, - sig: &Self::Item, - threshold: f64, - containment: bool, - ) -> Result, Error> { - let mut results = Vec::new(); - - //TODO: still assuming one mh in the signature! - if let Sketch::MinHash(hashes) = &sig.signatures[0] { - let mut counter: HashMap = HashMap::with_capacity(hashes.size()); - - for hash in hashes.mins() { - self.query(hash).for_each(|dataset_idx| { - let idx = counter.entry(dataset_idx).or_insert(0); - *idx += 1; - }); - } - - for (idx, count) in counter { - let match_sig = &self.datasets[idx]; - //TODO: still assuming one mh in the signature! - let match_mh = match_sig.signatures[0].size(); - - let score = if containment { - count as f64 / hashes.size() as f64 - } else { - count as f64 / (hashes.size() + match_mh - count) as f64 - }; - - if score >= threshold { - results.push(match_sig) - } - } - - Ok(results) - } else { - // TODO: what if it is not a minhash? 
- unimplemented!() - } - } - - fn insert(&mut self, node: Self::Item) -> Result<(), Error> { - self.add(node); - Ok(()) - } - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - unimplemented!() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - unimplemented!() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.datasets.iter() - } - */ -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::BufReader; - use std::path::PathBuf; - - use super::BIGSI; - - use crate::index::SigStore; - use crate::index::{Index, MHBT}; - use crate::signature::Signature; - - #[test] - fn bigsi_sbt_oracle() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut bigsi = BIGSI::new(10000, 10); - let datasets = sbt.signatures(); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - for l in datasets { - bigsi.insert(l).expect("insertion error!"); - } - - let results_sbt = sbt.search(&leaf, 0.5, false).unwrap(); - assert_eq!(results_sbt.len(), 1); - - let data = leaf.data.get().unwrap(); - let results_bigsi = bigsi.search(data, 0.5, false).unwrap(); - assert_eq!(results_bigsi.len(), 1); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - - let results_sbt = sbt.search(&leaf, 0.1, false).unwrap(); - assert_eq!(results_sbt.len(), 2); - - let data = leaf.data.get().unwrap(); - let 
results_bigsi = bigsi.search(data, 0.1, false).unwrap(); - assert_eq!(results_bigsi.len(), 2); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - } -} diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 4e43074ebe..832fdf9091 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -3,10 +3,8 @@ //! An index organizes signatures to allow for fast similarity search. //! Some indices also support containment searches. -pub mod bigsi; pub mod linear; pub mod revindex; -pub mod sbt; pub mod search; @@ -18,27 +16,13 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::errors::ReadDataError; -use crate::index::sbt::{Node, SBT}; use crate::index::search::{search_minhashes, search_minhashes_containment}; use crate::prelude::*; use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; use crate::sketch::Sketch; use crate::storage::{InnerStorage, Storage}; use crate::Error; -pub type MHBT = SBT, Signature>; - -/* FIXME: bring back after MQF works on macOS and Windows -use cfg_if::cfg_if; -cfg_if! { - if #[cfg(not(target_arch = "wasm32"))] { - use mqf::MQF; - pub type MHMT = SBT, Signature>; - } -} -*/ - pub trait Index<'a> { type Item: Comparable; //type SignatureIterator: Iterator; @@ -188,6 +172,26 @@ impl ReadData for SigStore { } } +impl SigStore +where + T: ToWriter, +{ + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) 
+ } else { + unimplemented!() + } + } else { + unimplemented!() + } + } +} + impl SigStore { pub fn count_common(&self, other: &SigStore) -> u64 { let ng: &Signature = self.data().unwrap(); diff --git a/src/core/src/index/sbt/mhbt.rs b/src/core/src/index/sbt/mhbt.rs deleted file mode 100644 index 2d4ceb3fb8..0000000000 --- a/src/core/src/index/sbt/mhbt.rs +++ /dev/null @@ -1,361 +0,0 @@ -use std::collections::HashMap; -use std::io::Write; - -use crate::errors::ReadDataError; -use crate::index::sbt::{Factory, FromFactory, Node, SBT}; -use crate::prelude::*; -use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::storage::Storage; -use crate::Error; - -impl ToWriter for Nodegraph { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - self.save_to_writer(writer) - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, name: &str) -> Result, Error> { - match self.factory { - Factory::GraphFactory { args: (k, t, n) } => { - let n = Nodegraph::with_tables(t as usize, n as usize, k as usize); - - Ok(Node::builder() - .filename(name) - .name(name) - .metadata(HashMap::default()) - .storage(self.storage()) - .data(n) - .build()) - } - } - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, parent: &mut Node) -> Result<(), Error> { - // TODO: avoid copy here - let mut parent_data = parent.data()?.clone(); - - if let Sketch::MinHash(sig) = &self.signatures[0] { - for h in sig.mins() { - parent_data.count(h); - } - - let min_n_below = parent - .metadata - .entry("min_n_below".into()) - .or_insert(u64::max_value()); - - *min_n_below = u64::min(sig.size() as u64, *min_n_below); - if *min_n_below == 0 { - *min_n_below = 1 - } - } else { - //TODO what if it is not a minhash? 
- unimplemented!() - } - - parent.data = parent_data.into(); - - Ok(()) - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.similarity(ong) - } - - fn containment(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.containment(ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? 
- unimplemented!() - } - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&Nodegraph, Error> { - if let Some(storage) = &self.storage { - Ok(self.data.get_or_init(|| { - let raw = storage.load(&self.filename).unwrap(); - Nodegraph::from_reader(&mut &raw[..]).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - - use assert_matches::assert_matches; - - use super::Factory; - - use crate::index::linear::LinearIndex; - use crate::index::sbt::scaffold; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::{Index, SigStore, MHBT}; - use crate::prelude::*; - - #[test] - fn save_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let mut sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - sbt.save_file(tmpfile.path(), None).unwrap(); - - tmpfile.seek(SeekFrom::Start(0)).unwrap(); - } - - #[test] - fn load_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - 
- let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let leaf = sigs[0].clone(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - let datasets = linear.signatures(); - println!("linear leaves {:?} {:?}", datasets.len(), datasets); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - - #[test] - #[ignore] - fn roundtrip_sbt() -> Result<(), Box> { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename)?; - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 
100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename)?); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - )?; - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - let results = sbt.find(search_minhashes, &leaf, 0.5)?; - assert_eq!(results.len(), 1); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1)?; - assert_eq!(results.len(), 2); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - println!("sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut new_sbt: MHBT = MHBT::builder().storage(None).build(); - let datasets = sbt.signatures(); - for l in datasets { - new_sbt.insert(l)?; - } - - for (i, node) in &sbt.nodes { - assert_eq!(node.data().unwrap(), new_sbt.nodes[i].data().unwrap()); - } - - assert_eq!(new_sbt.signature_refs().len(), 7); - println!("new_sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("new_sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let results = new_sbt.find(search_minhashes, &leaf, 0.5)?; - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 1); - - let results = new_sbt.find(search_minhashes, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = new_sbt.find(search_minhashes_containment, &leaf, 0.5)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = 
new_sbt.find(search_minhashes_containment, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 4); - - Ok(()) - } - - #[test] - fn scaffold_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let new_sbt: MHBT = scaffold(sbt.leaves(), sbt.storage()); - - assert_eq!(new_sbt.signatures().len(), 7); - } - - #[test] - fn load_v4() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v4.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } - - #[test] - fn load_v5() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } -} diff --git a/src/core/src/index/sbt/mhmt.rs b/src/core/src/index/sbt/mhmt.rs deleted file mode 100644 index 5eeb8a09b3..0000000000 --- a/src/core/src/index/sbt/mhmt.rs +++ /dev/null @@ -1,227 +0,0 @@ -use std::io::{Read, Write}; - -use mqf::MQF; - -use crate::Error; -use crate::index::sbt::{FromFactory, Node, Update, SBT}; -use crate::index::storage::{ReadData, ReadDataError, ToWriter}; -use crate::index::Comparable; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::Sketch; - -impl ToWriter for MQF { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new()?; - self.serialize(tmpfile.path()).unwrap(); // TODO: convert this to a proper error - - let mut buffer = Vec::new(); - tmpfile.read_to_end(&mut buffer)?; - writer.write_all(&buffer)?; - - Ok(()) - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&MQF, Error> { - if let Some(storage) = &self.storage { - 
Ok(self.data.get_or_create(|| { - let raw = storage.load(&self.filename).unwrap(); - - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - tmpfile.write_all(&raw[..]).unwrap(); - - MQF::deserialize(tmpfile.path()).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, _name: &str) -> Result, Error> { - unimplemented!() - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.similarity(&ong) - } - - fn containment(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.containment(&ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| dbg!(ng.count_key(**h % u64::pow(2, 26))) > 0) - //.filter(|h| dbg!(ng.count_key(**h)) > 0) - .count(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... 
- if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| ng.count_key(**h % u64::pow(2, 26)) > 0) - //.filter(|h| ng.count_key(**h) > 0) - .count(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } -} - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(test)] -mod test { - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - use std::rc::Rc; - use tempfile; - - use assert_matches::assert_matches; - use lazy_init::Lazy; - - use super::{scaffold, Factory}; - - use crate::index::linear::LinearIndex; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::storage::ReadData; - use crate::index::{Index, SigStore, MHBT}; - use crate::signature::Signature; - - #[cfg(not(target_arch = "wasm32"))] - #[test] - fn load_mhmt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5_mhmt.sbt.json"); - - let mut sbt = crate::index::MHMT::from_path(filename).expect("Loading error"); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); - let sig_data = sigs[0].clone(); - - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = SigStore::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - //assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - 
println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - println!( - "linear leaves {:?} {:?}", - linear.datasets.len(), - linear.datasets - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - */ -} diff --git a/src/core/src/index/sbt/mod.rs b/src/core/src/index/sbt/mod.rs deleted file mode 100644 index 5245defe1f..0000000000 --- a/src/core/src/index/sbt/mod.rs +++ /dev/null @@ -1,878 +0,0 @@ -pub mod mhbt; - -/* FIXME: bring back after boomphf changes -pub mod ukhs; -*/ - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(not(target_arch = "wasm32"))] -pub mod mhmt; -*/ - -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::fmt::Debug; -use std::fs::File; -use std::hash::BuildHasherDefault; -use std::io::{BufReader, Read}; -use std::path::{Path, PathBuf}; - -use log::info; -use nohash_hasher::NoHashHasher; -use once_cell::sync::OnceCell; -use serde::{Deserialize, Serialize}; -use typed_builder::TypedBuilder; - -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; -use crate::prelude::*; -use crate::storage::{FSStorage, InnerStorage, StorageInfo}; -use 
crate::Error; - -#[derive(TypedBuilder)] -pub struct SBT { - #[builder(default = 2)] - d: u32, - - #[builder(default, setter(into))] - storage: Option, - - #[builder(default = Factory::GraphFactory { args: (1, 100000.0, 4) })] - factory: Factory, - - #[builder(default = HashMap::default())] - nodes: HashMap, - - #[builder(default = HashMap::default())] - leaves: HashMap>, -} - -const fn parent(pos: u64, d: u64) -> u64 { - (pos - 1) / d -} - -const fn child(parent: u64, pos: u64, d: u64) -> u64 { - d * parent + pos + 1 -} - -impl SBT -where - L: std::clone::Clone + Default, - N: Default, -{ - #[inline(always)] - fn parent(&self, pos: u64) -> Option { - if pos == 0 { - None - } else { - Some(parent(pos, u64::from(self.d))) - } - } - - #[inline(always)] - fn child(&self, parent: u64, pos: u64) -> u64 { - child(parent, pos, u64::from(self.d)) - } - - #[inline(always)] - fn children(&self, pos: u64) -> Vec { - (0..u64::from(self.d)).map(|c| self.child(pos, c)).collect() - } - - pub fn storage(&self) -> Option { - self.storage.clone() - } - - /* - fn fill_up(&mut self) -> Result<(), Error> { - let mut visited = HashSet::new(); - let mut queue: Vec<_> = self.leaves.keys().collect(); - - while !queue.is_empty() { - let pos = queue.pop().unwrap(); - - if !visited.contains(&pos) { - visited.insert(pos); - } - } - - Ok(()) - } - */ - - // combine -} - -impl SBT, T> -where - T: ToWriter + Clone, - U: ToWriter, - Node: ReadData, - SigStore: ReadData, -{ - fn parse_v4(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV4 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V4(sinfo)) - } - - fn parse_v5(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV5 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V5(sinfo)) - } - - pub fn from_reader(mut rdr: R, path: P) -> Result, T>, Error> - where - R: Read, - P: AsRef, - { - // TODO: I would love to do this, but I get an untagged enum error with - // SBTInfo... 
- //let sinfo: SBTInfo = serde_json::from_reader(rdr)?; - - let mut s = String::new(); - rdr.read_to_string(&mut s)?; - - let sinfo = - Self::parse_v5(&mut s.as_bytes()).or_else(|_| Self::parse_v4(&mut s.as_bytes()))?; - - // TODO: support other storages - let mut st: FSStorage = match sinfo { - SBTInfo::V4(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V5(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V6(ref sbt) => (&sbt.storage.args).into(), - }; - st.set_base(path.as_ref().to_str().unwrap()); - let storage = InnerStorage::new(st); - - let d = match sinfo { - SBTInfo::V4(ref sbt) => sbt.d, - SBTInfo::V5(ref sbt) => sbt.d, - SBTInfo::V6(ref sbt) => sbt.d, - }; - - let factory = match sinfo { - SBTInfo::V4(ref sbt) => sbt.factory.clone(), - SBTInfo::V5(ref sbt) => sbt.factory.clone(), - SBTInfo::V6(ref sbt) => sbt.factory.clone(), - }; - - let (nodes, leaves) = match sinfo { - SBTInfo::V6(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .signatures - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V5(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .leaves - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V4(sbt) => { - let nodes = sbt - .nodes - .iter() - .filter_map(|(n, x)| match x { - 
NodeInfoV4::Node(l) => Some(( - *n, - Node::builder() - .filename(l.filename.clone()) - .name(l.name.clone()) - .metadata(l.metadata.clone()) - .storage(Some(storage.clone())) - .build(), - )), - NodeInfoV4::Leaf(_) => None, - }) - .collect(); - - let leaves = sbt - .nodes - .into_iter() - .filter_map(|(n, x)| match x { - NodeInfoV4::Node(_) => None, - NodeInfoV4::Leaf(l) => Some(( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - )), - }) - .collect(); - - (nodes, leaves) - } - }; - - Ok(SBT { - d, - factory, - storage: Some(storage), - nodes, - leaves, - }) - } - - pub fn from_path>(path: P) -> Result, T>, Error> { - let file = File::open(&path)?; - let mut reader = BufReader::new(file); - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - // TODO: canonicalize doesn't work on wasm32-wasi - //basepath.canonicalize()?; - - let sbt = SBT::, T>::from_reader(&mut reader, basepath.parent().unwrap())?; - Ok(sbt) - } - - pub fn save_file>( - &mut self, - path: P, - storage: Option, - ) -> Result<(), Error> { - let ref_path = path.as_ref(); - let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); - if basename.ends_with(".sbt.json") { - basename = basename.replace(".sbt.json", ""); - } - let location = ref_path.parent().unwrap(); - - let storage = match storage { - Some(s) => s, - None => { - let subdir = format!(".sbt.{}", basename); - InnerStorage::new(FSStorage::new(location.to_str().unwrap(), &subdir)) - } - }; - - let args = storage.args(); - let storage_info = StorageInfo { - backend: "FSStorage".into(), - args, - }; - - let info: SBTInfoV5 = SBTInfoV5 { - d: self.d, - factory: self.factory.clone(), - storage: storage_info, - version: 5, - nodes: self - .nodes - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &U = 
(*l).data().expect("Couldn't load data"); - - // set storage to new one - l.storage = Some(storage.clone()); - - let filename = (*l).save(&l.filename).unwrap(); - let new_node = NodeInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - leaves: self - .leaves - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &T = (*l).data().unwrap(); - - // set storage to new one - l.storage = Some(storage.clone()); - - // TODO: this should be l.md5sum(), not l.filename - let filename = (*l).save(&l.filename).unwrap(); - let new_node = DatasetInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - }; - - let file = File::create(path)?; - serde_json::to_writer(file, &info)?; - - Ok(()) - } - - pub fn leaves(&self) -> Vec> { - self.leaves.values().cloned().collect() - } -} - -impl<'a, N, L> Index<'a> for SBT -where - N: Comparable + Comparable + Update + Debug + Default, - L: Comparable + Update + Clone + Debug + Default, - SBT: FromFactory, - SigStore: From + ReadData, -{ - type Item = L; - - fn find(&self, search_fn: F, sig: &L, threshold: f64) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - let mut matches = Vec::new(); - let mut visited = HashSet::new(); - let mut queue = vec![0u64]; - - while let Some(pos) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - if let Some(node) = self.nodes.get(&pos) { - if search_fn(&node, sig, threshold) { - for c in self.children(pos) { - queue.push(c); - } - } - } else if let Some(leaf) = self.leaves.get(&pos) { - let data = leaf.data().expect("Error reading data"); - if search_fn(data, sig, threshold) { - matches.push(data); - } - } - } - } - - Ok(matches) - } - - fn insert(&mut self, dataset: L) -> Result<(), Error> { - if self.leaves.is_empty() { - // in this case the tree is empty, - // just add the dataset to the first available leaf - 
self.leaves.entry(0).or_insert_with(|| dataset.into()); - return Ok(()); - } - - // we can unwrap here because the root node case - // only happens on an empty tree, and if we got - // to this point we have at least one leaf already. - // TODO: find position by similarity search - let pos = self.leaves.keys().max().unwrap() + 1; - let parent_pos = self.parent(pos).unwrap(); - let final_pos; - - if let Entry::Occupied(pnode) = self.leaves.entry(parent_pos) { - // Case 1: parent is a Leaf - // create a new internal node, add it to self.nodes[parent_pos] - - let (_, leaf) = pnode.remove_entry(); - - let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - - // for each children update the parent node - // TODO: write the update method - leaf.data.get().unwrap().update(&mut new_node)?; - dataset.update(&mut new_node)?; - - // node and parent are children of new internal node - let mut c_pos = self.children(parent_pos).into_iter().take(2); - let c1_pos = c_pos.next().unwrap(); - let c2_pos = c_pos.next().unwrap(); - - self.leaves.entry(c1_pos).or_insert(leaf); - self.leaves.entry(c2_pos).or_insert_with(|| dataset.into()); - final_pos = c2_pos; - - // add the new internal node to self.nodes[parent_pos) - // TODO check if it is really empty? - self.nodes.entry(parent_pos).or_insert(new_node); - } else { - // TODO: moved these two lines here to avoid borrow checker - // error E0502 in the Vacant case, but would love to avoid it! 
- let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - let c_pos = self.children(parent_pos)[0]; - - match self.nodes.entry(parent_pos) { - // Case 2: parent is a node and has an empty child spot available - // (if there isn't an empty spot, it was already covered by case 1) - Entry::Occupied(mut pnode) => { - dataset.update(pnode.get_mut())?; - self.leaves.entry(pos).or_insert_with(|| dataset.into()); - final_pos = pos; - } - - // Case 3: parent is None/empty - // this can happen with d != 2, need to create parent node - Entry::Vacant(pnode) => { - dataset.update(&mut new_node)?; - self.leaves.entry(c_pos).or_insert_with(|| dataset.into()); - final_pos = c_pos; - pnode.insert(new_node); - } - } - } - - let entry = &self.leaves[&final_pos]; - let data = entry.data.get().unwrap(); - - let mut parent_pos = parent_pos; - while let Some(ppos) = self.parent(parent_pos) { - if let Entry::Occupied(mut pnode) = self.nodes.entry(parent_pos) { - //TODO: use children for this node to update, instead of dragging - // dataset up to the root? It would be more generic, but this - // works for minhash, draff signatures and nodegraphs... 
- data.update(pnode.get_mut())?; - } - parent_pos = ppos; - } - - Ok(()) - } - - /* - fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { - self = scaffold(nodes, self.storage()); - Ok(()) - } - */ - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!(); - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - self.leaves - .values() - .map(|x| x.data().unwrap().clone()) - .collect() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - self.leaves.values().map(|x| x.data().unwrap()).collect() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.leaves.values() - } - */ -} - -/* -#[derive(TypedBuilder, Clone, Default, Serialize, Deserialize)] -pub struct Factory { - class: String, - args: Vec, -} -*/ - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "class")] -pub enum Factory { - GraphFactory { args: (u64, f64, u64) }, -} - -#[derive(TypedBuilder, Default, Clone)] -pub struct Node { - #[builder(setter(into))] - filename: String, - - #[builder(setter(into))] - name: String, - - metadata: HashMap, - - #[builder(default)] - storage: Option, - - #[builder(setter(into), default)] - data: OnceCell, -} - -impl Node -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) 
- } else { - // TODO throw error, data was not initialized - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl PartialEq for Node -where - T: PartialEq, - Node: ReadData, -{ - fn eq(&self, other: &Node) -> bool { - self.data().unwrap() == other.data().unwrap() - } -} - -impl SigStore -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) - } else { - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl std::fmt::Debug for Node -where - T: Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Node [name={}, filename={}, metadata: {:?}, data: {:?}]", - self.name, - self.filename, - self.metadata, - self.data.get().is_some() - ) - } -} - -#[derive(Serialize, Deserialize, Debug)] -struct NodeInfo { - filename: String, - name: String, - metadata: HashMap, -} - -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum NodeInfoV4 { - Node(NodeInfo), - Leaf(DatasetInfo), -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV4 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV5 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - leaves: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV6 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - signatures: HashMap, -} - -#[derive(Deserialize)] -#[serde(untagged)] -enum SBTInfo { - V6(SBTInfoV6), - V5(SBTInfoV5), - V4(SBTInfoV4), -} - -enum BinaryTree { - Empty, - Internal(Box>>>>), - Leaf(Box>>), -} - -struct TreeNode { - element: T, - left: BinaryTree, - right: BinaryTree, -} - -pub fn scaffold( - mut datasets: Vec>, - storage: Option, -) -> SBT, Signature> 
-where - N: Clone + Default, -{ - let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); - - let mut next_round = Vec::new(); - - // generate two bottom levels: - // - datasets - // - first level of internal nodes - info!("Start processing leaves"); - while let Some(next_leaf) = datasets.pop() { - let (simleaf_tree, in_common) = if datasets.is_empty() { - (BinaryTree::Empty, next_leaf.mins().into_iter().collect()) - } else { - let mut similar_leaf_pos = 0; - let mut current_max = 0; - for (pos, leaf) in datasets.iter().enumerate() { - let common = next_leaf.count_common(leaf); - if common > current_max { - current_max = common; - similar_leaf_pos = pos; - } - } - - let similar_leaf = datasets.remove(similar_leaf_pos); - - let in_common = next_leaf - .mins() - .into_iter() - .collect::>>>() - .union(&similar_leaf.mins().into_iter().collect()) - .cloned() - .collect(); - - let simleaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: similar_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - (simleaf_tree, in_common) - }; - - let leaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: next_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - - let tree = BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left: leaf_tree, - right: simleaf_tree, - })); - - next_round.push(tree); - - if next_round.len() % 100 == 0 { - info!("Processed {} leaves", next_round.len() * 2); - } - } - info!("Finished processing leaves"); - - // while we don't get to the root, generate intermediary levels - while next_round.len() != 1 { - next_round = BinaryTree::process_internal_level(next_round); - info!("Finished processing round {}", next_round.len()); - } - - // Convert from binary tree to nodes/leaves - let root = next_round.pop().unwrap(); - let mut visited = HashSet::new(); - let mut queue = vec![(0u64, root)]; - - while let Some((pos, cnode)) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - 
match cnode { - BinaryTree::Leaf(leaf) => { - leaves.insert(pos, leaf.element); - } - BinaryTree::Internal(mut node) => { - let left = std::mem::replace(&mut node.left, BinaryTree::Empty); - let right = std::mem::replace(&mut node.right, BinaryTree::Empty); - queue.push((2 * pos + 1, left)); - queue.push((2 * pos + 2, right)); - } - BinaryTree::Empty => (), - } - } - } - - SBT::builder() - .storage(storage) - .nodes(HashMap::default()) - .leaves(leaves) - .build() -} - -impl BinaryTree { - fn process_internal_level(mut current_round: Vec) -> Vec { - let mut next_round = Vec::with_capacity(current_round.len() + 1); - - while let Some(next_node) = current_round.pop() { - let similar_node = if current_round.is_empty() { - BinaryTree::Empty - } else { - let mut similar_node_pos = 0; - let mut current_max = 0; - for (pos, cmpe) in current_round.iter().enumerate() { - let common = BinaryTree::intersection_size(&next_node, cmpe); - if common > current_max { - current_max = common; - similar_node_pos = pos; - } - } - current_round.remove(similar_node_pos) - }; - - let tree = BinaryTree::new_tree(next_node, similar_node); - - next_round.push(tree); - } - next_round - } - - // Remove this when MSRV is >= 1.40 - #[allow(clippy::mem_replace_with_default)] - fn new_tree(mut left: BinaryTree, mut right: BinaryTree) -> BinaryTree { - let in_common = if let BinaryTree::Internal(ref mut el1) = left { - match right { - BinaryTree::Internal(ref mut el2) => { - let c1 = std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ); - let c2 = std::mem::replace( - &mut el2.element, - HashSet::>>::default(), - ); - c1.union(&c2).cloned().collect() - } - BinaryTree::Empty => std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ), - _ => panic!("Should not see a Leaf at this level"), - } - } else { - HashSet::>>::default() - }; - - BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left, - right, - })) - } - - fn intersection_size(n1: &BinaryTree, n2: 
&BinaryTree) -> usize { - if let BinaryTree::Internal(ref el1) = n1 { - if let BinaryTree::Internal(ref el2) = n2 { - return el1.element.intersection(&el2.element).count(); - } - }; - 0 - } -} - -/* -impl From> for SBT, Signature> -where - U: Default + Clone, -{ - fn from(other: LinearIndex) -> Self { - let storage = other.storage(); - scaffold(other.datasets, storage) - } -} -*/