Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: core: add scaled selection to manifest; add helper functions for collection and sig/sketch usage #2948

Merged
merged 19 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 219 additions & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ impl Collection {
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}

pub fn sig_from_record(&self, record: &Record) -> Result<SigStore> {
let match_path = record.internal_location().as_str();
let selection = Selection::from_record(record)?;
let sig = self.storage.load_sig(match_path)?.select(&selection)?;
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}
}

impl Select for Collection {
Expand All @@ -188,3 +196,214 @@ impl Select for Collection {
Ok(self)
}
}

#[cfg(test)]
mod test {
use camino::Utf8PathBuf as PathBuf;
use std::fs::File;
use std::io::BufReader;

use super::Collection;

use crate::encodings::HashFunctions;
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;

#[test]
fn sigstore_selection_with_downsample() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 6);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
}

#[test]
fn sigstore_selection_with_downsample_too_low() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_selection_scaled_handle_num_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 4);
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(1000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_selection_num() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs_copy = sigs.clone();
assert_eq!(sigs.len(), 4);
// create Selection object
let mut selection = Selection::default();
selection.set_num(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// all sigs should remain
assert_eq!(cl.len(), 4);
//now select diff num and none should remain
selection.set_num(100);
let cl2 = Collection::from_sigs(sigs_copy)
.unwrap()
.select(&selection)
.unwrap();
assert_eq!(cl2.len(), 0);
}

#[test]
fn sigstore_selection_num_handle_scaled_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 6);
// create Selection object
let mut selection = Selection::default();
selection.set_num(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_sig_from_record() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 6);
for (_idx, rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
}

#[test]
fn sigstore_selection_moltype_zip() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/prot/hp.zip");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(100);
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_zipfile(&filename)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 2);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
}
}

#[test]
fn sigstore_selection_moltype_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename
.push("../../tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 1);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
}
}
}
113 changes: 112 additions & 1 deletion src/core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::convert::TryInto;
use std::io::{Read, Write};
use std::fs::File;
use std::io::{BufRead, BufReader, Read, Write};
use std::ops::Deref;

use camino::Utf8PathBuf as PathBuf;
Expand Down Expand Up @@ -200,6 +201,17 @@ impl Select for Manifest {
} else {
valid
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled as u64
} else {
valid
};
valid = if let Some(num) = selection.num() {
valid && row.num == num
} else {
valid
};
valid
});

Expand Down Expand Up @@ -270,10 +282,109 @@ impl From<&[PathBuf]> for Manifest {
}
}

impl From<&PathBuf> for Manifest {
fn from(pathlist: &PathBuf) -> Self {
let file = File::open(pathlist).unwrap_or_else(|_| panic!("Failed to open {:?}", pathlist));
let reader = BufReader::new(file);

let paths: Vec<PathBuf> = reader
.lines()
.map(|line| line.unwrap_or_else(|_| panic!("Failed to read line from {:?}", pathlist)))
.map(PathBuf::from)
.collect();

paths.as_slice().into()
}
}

impl Deref for Manifest {
type Target = Vec<Record>;

fn deref(&self) -> &Self::Target {
&self.records
}
}

#[cfg(test)]
mod test {
use camino::Utf8PathBuf as PathBuf;
use std::fs::File;
use std::io::Write;
use tempfile::TempDir;

use super::Manifest;

#[test]
fn manifest_from_pathlist() {
let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");
let mut filename = utf8_output.join("sig-pathlist.txt");
//convert to camino utf8pathbuf
filename = PathBuf::from(filename);
// build sig filenames
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let test_sigs = vec![
"../../tests/test-data/47.fa.sig",
"../../tests/test-data/63.fa.sig",
];

let full_paths: Vec<_> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

// write a file in test directory with a filename on each line
let mut pathfile = File::create(&filename).unwrap();
for sigfile in &full_paths {
writeln!(pathfile, "{}", sigfile).unwrap();
}

// load into manifest
let manifest = Manifest::from(&filename);
assert_eq!(manifest.len(), 2);
}

#[test]
#[should_panic(expected = "Failed to open \"no-exist\"")]
fn manifest_from_pathlist_nonexistent_file() {
let filename = PathBuf::from("no-exist");
let _manifest = Manifest::from(&filename);
}

#[test]
#[should_panic]
fn manifest_from_pathlist_badfile() {
let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");
let mut filename = utf8_output.join("sig-pathlist.txt");
//convert to camino utf8pathbuf
filename = PathBuf::from(filename);

let mut pathfile = File::create(&filename).unwrap();
write!(pathfile, "Valid line\n").unwrap();
pathfile.write_all(&[0xED, 0xA0, 0x80]).unwrap(); // invalid UTF-8

// load into manifest
let _manifest = Manifest::from(&filename);
}

#[test]
#[should_panic]
fn manifest_from_paths_badpath() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let test_sigs = vec![
PathBuf::from("no-exist"),
PathBuf::from("../../tests/test-data/63.fa.sig"),
];

let full_paths: Vec<PathBuf> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

// load into manifest
let _manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice
}
}
Loading
Loading