Skip to content

Commit

Permalink
Merge branch 'ctb_misc2' into test_load_revindex
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Aug 24, 2024
2 parents c760aa8 + b3e5b81 commit 6534d2f
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 71 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,20 @@
use anyhow::Context;
use camino::Utf8PathBuf as PathBuf;
use log::debug;
use sourmash::index::revindex::RevIndex;
use sourmash::index::revindex::RevIndexOps;
use sourmash::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use crate::utils::{load_collection, ReportType};
use sourmash::collection::{Collection, CollectionSet};

pub fn index<P: AsRef<Path>>(
siglist: String,
selection: &Selection,
output: P,
colors: bool,
allow_failed_sigpaths: bool,
_allow_failed_sigpaths: bool,
use_internal_storage: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading siglist");
Expand Down
10 changes: 0 additions & 10 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,6 @@ pub fn multisearch(
let processed_cmp = AtomicUsize::new(0);
let ksize = selection.ksize().unwrap() as f64;

if queries.is_empty() {
eprintln!("No query sketches present. Exiting.");
return Err(anyhow::anyhow!("failed to load query sketches").into());
}

if against.is_empty() {
eprintln!("No search sketches present. Exiting.");
return Err(anyhow::anyhow!("failed to load search sketches").into());
}

let send = against
.par_iter()
.filter_map(|against| {
Expand Down
Binary file added src/python/tests/test-data/2.fa.k21.sig.gz
Binary file not shown.
60 changes: 59 additions & 1 deletion src/python/tests/test_multisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ def test_empty_query(runtmp, capfd):
# @CTB


def test_nomatch_query(runtmp, capfd, zip_query):
def test_nomatch_query_warn(runtmp, capfd, zip_query):
# test a non-matching (diff ksize) in query; do we get warning message?
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand Down Expand Up @@ -586,6 +586,64 @@ def test_nomatch_query(runtmp, capfd, zip_query):
assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err


def test_nomatch_query_exit(runtmp, capfd, zip_query):
    """multisearch must fail when none of the query sketches can be loaded.

    The query list holds a single sketch ('1.fa.k21.sig.gz') that is
    expected to be skipped as incompatible, leaving no query signatures.
    The command must then exit with an error, and stderr must carry both
    the "skipped" warning and the "No query signatures loaded" message.
    When `zip_query` is set, the query list is first packed into a zip file.
    """
    query_list = runtmp.output('query.txt')
    against_list = runtmp.output('against.txt')

    query_sigs = [get_test_data('1.fa.k21.sig.gz')]
    against_sigs = [
        get_test_data('2.fa.sig.gz'),
        get_test_data('47.fa.sig.gz'),
        get_test_data('63.fa.sig.gz'),
    ]

    make_file_list(query_list, query_sigs)
    make_file_list(against_list, against_sigs)

    output = runtmp.output('out.csv')

    if zip_query:
        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

    # The run is expected to fail: nothing usable is left on the query side.
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
                        '-o', output)

    stderr_text = capfd.readouterr().err
    print(stderr_text)

    assert 'WARNING: skipped 1 query paths - no compatible signatures' in stderr_text
    assert 'No query signatures loaded, exiting' in stderr_text


def test_nomatch_against(runtmp, capfd, zip_query):
    """multisearch must fail when none of the 'against' sketches can be loaded.

    The run requests '-k 21'. All three sketches in the against list are
    expected to be skipped as incompatible, so the command must exit with
    an error. Stderr must carry both the "skipped 3 search paths" warning
    and the "No search signatures loaded" message.
    When `zip_query` is set, the query list is first packed into a zip file.
    """
    query_list = runtmp.output('query.txt')
    against_list = runtmp.output('against.txt')

    k21_sig = get_test_data('1.fa.k21.sig.gz')
    other_sigs = [
        get_test_data('2.fa.sig.gz'),
        get_test_data('47.fa.sig.gz'),
        get_test_data('63.fa.sig.gz'),
    ]

    # The query list contains the three other sketches followed by the k21 one;
    # the against list contains only the three other sketches.
    make_file_list(query_list, other_sigs + [k21_sig])
    make_file_list(against_list, other_sigs)

    output = runtmp.output('out.csv')

    if zip_query:
        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
                        '-o', output, '-k', '21')

    stderr_text = capfd.readouterr().err
    print(stderr_text)

    assert 'WARNING: skipped 3 search paths - no compatible signatures' in stderr_text
    assert 'No search signatures loaded, exiting' in stderr_text


def test_load_only_one_bug(runtmp, capfd, zip_db):
# check that we behave properly when presented with multiple against
# sketches
Expand Down
33 changes: 29 additions & 4 deletions src/python/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def test_missing_query(runtmp, capfd, zip_db):



def test_empty_query(runtmp):
def test_empty_query(runtmp, capfd):
# test with an empty query list
query_list = runtmp.output('query.txt')

Expand All @@ -267,11 +267,11 @@ def test_empty_query(runtmp):
runtmp.sourmash('scripts', 'pairwise', query_list,
'-o', output)

print(runtmp.last_result.err)
# @CTB
captured = capfd.readouterr()
assert 'Error: No analysis signatures loaded, exiting.' in captured.err


def test_nomatch_query(runtmp, capfd, zip_query):
def test_nomatch_query_warn(runtmp, capfd, zip_query):
# test a non-matching (diff ksize) in query; do we get warning message?
query_list = runtmp.output('query.txt')

Expand All @@ -297,6 +297,31 @@ def test_nomatch_query(runtmp, capfd, zip_query):
assert 'WARNING: skipped 1 analysis paths - no compatible signatures' in captured.err


def test_nomatch_query_exit(runtmp, capfd, zip_query):
    """pairwise must fail when none of the listed sketches can be loaded.

    Both sketches in the list ('1.fa.k21.sig.gz' and '2.fa.k21.sig.gz') are
    expected to be skipped as incompatible, leaving nothing to analyze. The
    command must exit with an error, and stderr must carry both the
    "skipped 2 analysis paths" warning and the "No analysis signatures
    loaded" error.
    When `zip_query` is set, the list is first packed into a zip file.
    """
    query_list = runtmp.output('query.txt')

    k21_sigs = [
        get_test_data('1.fa.k21.sig.gz'),
        get_test_data('2.fa.k21.sig.gz'),
    ]
    make_file_list(query_list, k21_sigs)

    output = runtmp.output('out.csv')

    if zip_query:
        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

    # The run is expected to fail: every input sketch gets skipped.
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'pairwise', query_list,
                        '-o', output)

    stderr_text = capfd.readouterr().err
    print(stderr_text)

    assert 'WARNING: skipped 2 analysis paths - no compatible signatures' in stderr_text
    assert 'Error: No analysis signatures loaded, exiting.' in stderr_text


def test_load_only_one_bug(runtmp, capfd, zip_db):
# check that we behave properly when presented with multiple query
# sketches
Expand Down
63 changes: 11 additions & 52 deletions src/utils/multicollection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ impl MultiCollection {
.collect();

let (colls, n_failed) = MultiCollection::load_set_of_paths(lines);
let colls: Vec<_> = colls.into_iter().collect();

Ok((MultiCollection::new(colls, false), n_failed))
}
Expand All @@ -206,33 +205,30 @@ impl MultiCollection {
let val: usize = self.collections.iter().map(|c| c.len()).sum();
val
}

/// Report whether this `MultiCollection` holds no records at all.
///
/// Returns `true` when every contained collection has length zero, which
/// includes the case where there are no collections.
pub fn is_empty(&self) -> bool {
    self.collections.iter().all(|coll| coll.len() == 0)
}

/// Iterate over the contained collections by shared reference.
///
/// This walks `self.collections` itself; to visit the individual records
/// inside each collection, use `item_iter` or `par_iter` instead.
pub fn iter(&self) -> impl Iterator<Item = &Collection> {
self.collections.iter()
}

// iterate over tuples
/// Iterate sequentially over every record in every collection.
///
/// Yields `(collection, idx, record)` triples: for each collection in
/// `self.collections`, every `(idx, record)` pair produced by that
/// collection's own `iter()`, together with a reference to the collection
/// the record came from.
///
/// The returned iterator is lazy and only borrows from `self`. No
/// intermediate `Vec` is built, and neither collections nor records are
/// cloned -- only references (and the `Idx` value) are handed out.
pub fn item_iter(&self) -> impl Iterator<Item = (&Collection, Idx, &Record)> {
    self.collections
        .iter()
        // `move` copies the `&Collection` into the inner closure so each
        // triple can carry a reference to its owning collection.
        .flat_map(|coll| coll.iter().map(move |(idx, record)| (coll, idx, record)))
}

/// Iterate in parallel over every record in every collection.
///
/// Yields the same `(collection, idx, record)` triples as `item_iter`, in
/// the same order, but as a rayon parallel iterator.
///
/// The triples are first gathered into a `Vec` because the return type
/// promises an `IndexedParallelIterator`: a sequential iterator chain does
/// not provide that, whereas `Vec::into_par_iter` does. Only references
/// (and the `Idx` value) are stored in the `Vec`; records are not cloned.
pub fn par_iter(&self) -> impl IndexedParallelIterator<Item = (&Collection, Idx, &Record)> {
    // Step 1: materialize all (Collection, Idx, Record) triples.
    let mut triples = Vec::new();
    for coll in self.collections.iter() {
        for (idx, record) in coll.iter() {
            triples.push((coll, idx, record));
        }
    }

    // Step 2: hand the Vec to rayon as an indexed parallel iterator.
    triples.into_par_iter()
}

Expand Down Expand Up @@ -260,7 +256,6 @@ impl MultiCollection {
let minhash = selected_sig.minhash()?.clone();

Some(SmallSignature {
collection: coll.clone(), // @CTB
location: record.internal_location().to_string(),
name: sig.name(),
md5sum: sig.md5sum(),
Expand All @@ -279,58 +274,22 @@ impl MultiCollection {

Ok(sketchinfo)
}

// Load all signatures into memory.
// @CTB remove.
/// Load the signature behind every record, in parallel.
///
/// Each record yielded by `par_iter` is loaded through its collection's
/// `sig_from_record` and converted into a `Signature`. Records that fail
/// to load are reported on stderr and skipped rather than aborting the
/// whole load, so this function always returns `Ok`; the resulting `Vec`
/// may therefore be shorter than the number of records.
pub fn load_sigs(&self) -> Result<Vec<Signature>> {
    let loaded: Vec<Signature> = self
        .par_iter()
        .filter_map(|(coll, _idx, record)| {
            if let Ok(sigstore) = coll.sig_from_record(record) {
                Some(sigstore.into())
            } else {
                // Best-effort: note the failure and keep going.
                eprintln!(
                    "FAILED to load sketch from '{}'",
                    record.internal_location()
                );
                None
            }
        })
        .collect();

    Ok(loaded)
}
}

impl Select for MultiCollection {
fn select(mut self, selection: &Selection) -> Result<Self, SourmashError> {
// CTB: request review by Rust expert! Is the clone necessary?
self.collections = self
.iter()
.filter_map(|c| c.clone().select(selection).ok())
fn select(self, selection: &Selection) -> Result<Self, SourmashError> {
let collections = self
.collections
.into_iter()
.filter_map(|c| c.select(selection).ok())
.collect();
Ok(self)
}
}

/*
impl TryFrom<MultiCollection> for CollectionSet {
type Error = SourmashError;
fn try_from(multi: MultiCollection) -> Result<CollectionSet, SourmashError> {
// CTB: request review by Rust expert! Is the clone necessary?
// @CTB need to do something better than just getting the first CS! :sob:
// @CTB could fail if more than one?
let coll = multi.iter().next().unwrap().clone();
let cs: CollectionSet = coll.try_into()?;
Ok(cs)
Ok(MultiCollection::new(collections, self.contains_revindex))
}
}
*/

/// Track a name/minhash.
pub struct SmallSignature {
// CTB: request help - can we/should we use references & lifetimes here?
pub collection: Collection,
pub location: String,
pub name: String,
pub md5sum: String,
Expand Down

0 comments on commit 6534d2f

Please sign in to comment.