From f05e4bdab0b681a801873033b7e5597a78010cba Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 28 Mar 2022 16:36:34 -0700 Subject: [PATCH] [MRG] Add `StandaloneManifestIndex` class for direct loading of manifest CSVs (#1891) * add -d/--debug to various commands * initial implementation of StandaloneManifestIndex * support prefix if not abspath * clean up * some standalone manifests tests - incl CLI * iterate over internal locations instead * switch to picklist API * aaaaand swap out for load_file_as_index :tada: * remove unnecessary spaces * more tests * more better prefix test * remove unnec space * upgrade output error messages * fix SBT subdir loading error * add message about using --debug * doc etc * rationalize _signatures_with_internal * test describe and fileinfo on manifests * think through more manifest stuff * fix descr * rationalize _signatures_with_internal * fix docstring * add heading anchors config; fix napoleon package ref * pin versions for doc building * fix internal refs * fix one last ref target * add docs * clarify language * add docs * add more/better tests for lazy loading * clarify * a few more tests * update docs * add explicit test for lazy-loading prefetch on StandaloneManifestIndex * update comments/docstrings * Update doc/command-line.md Co-authored-by: Tessa Pierce Ward * update comments/docstrings Co-authored-by: Tessa Pierce Ward --- doc/command-line.md | 65 +++++++ src/sourmash/index/__init__.py | 150 +++++++++++++++- src/sourmash/lca/lca_db.py | 4 + src/sourmash/sourmash_args.py | 7 + tests/test-data/scaled/mf.csv | 17 ++ tests/test-data/scaled/pathlist.txt | 7 + tests/test_cmd_signature.py | 32 ++++ tests/test_cmd_signature_fileinfo.py | 34 ++++ tests/test_index.py | 260 ++++++++++++++++++++++++++- tests/test_sourmash.py | 55 ++++++ 10 files changed, 626 insertions(+), 5 deletions(-) create mode 100644 tests/test-data/scaled/mf.csv create mode 100644 tests/test-data/scaled/pathlist.txt diff --git a/doc/command-line.md 
b/doc/command-line.md index ecbd311840..54e323419e 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -1621,3 +1621,68 @@ sig` commands will output to stdout. So, for example, `sourmash sketch ... -o - | sourmash sig describe -` will describe the signatures that were just created. + +### Using manifests to explicitly refer to collections of files + +(sourmash v4.4.0 and later) + +Manifests are metadata catalogs of signatures that are used for +signature selection and loading. They are used extensively by sourmash +internals to speed up signature selection through picklists and +pattern matching. + +Manifests can _also_ be used externally (via the command-line), and +may be useful for organizing large collections of signatures. + +Suppose you have a large collection of signatures (`.sig` or `.sig.gz` +files) under a directory. You can create a manifest file for them like so: +``` +sourmash sig manifest <dir> -o <dir>/manifest.csv +``` +and then use the manifest directly for sourmash operations: +``` +sourmash sig fileinfo <dir>/manifest.csv +``` +This manifest can be used as a database target for most sourmash +operations - search, gather, etc. Note that manifests for directories +must be placed within (and loaded from) the directory from which the +manifest was generated; the specific manifest filename does not +matter. + +A more advanced and slightly tricky way to use explicit manifest files +is with lists of files. If you create a file with a path list +containing the locations of loadable sourmash collections, you can run +`sourmash sig manifest pathlist.txt -o mf.csv` to generate a manifest +of all of the files. The resulting manifest in `mf.csv` can then be +loaded directly. This is very handy when you have many sourmash +signatures, or large signature files. 
The tricky part in doing this +is that the manifest will store the same paths listed in the pathlist +file - whether they are relative or absolute paths - and these paths +must be resolvable by sourmash from the current working directory. +This makes explicit manifests built from pathlist files less portable +within or across systems than the other sourmash collections, which +are all relocatable. + +For example, if you create a pathlist file `paths.txt` containing the +following: +``` +/path/to/zipfile.zip +local_directory/some_signature.sig.gz +local_dir2/ +``` +and then run: +``` +sourmash sig manifest paths.txt -o mf.csv +``` +you will be able to use `mf.csv` as a database for `sourmash search` +and `sourmash gather` commands. But, because it contains two relative paths, +you will only be able to use it _from the directory that contains those +two relative paths_. + +**Our advice:** We suggest using zip file collections for most +situations; we primarily recommend using explicit manifests for +situations where you have a **very large** collection of signatures +(1000s or more), and don't want to make multiple copies of signatures +in the collection (as you would have to, with a zipfile). This can be +useful if you want to refer to different subsets of the collection +without making multiple copies in a zip file. diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index f126981a6d..97a9d35e17 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -25,10 +25,14 @@ ZipFileLinearIndex - simple on-disk storage of signatures. -class MultiIndex - in-memory storage and selection of signatures from multiple -index objects, using manifests. +MultiIndex - in-memory storage and selection of signatures from multiple +index objects, using manifests. All signatures are kept in memory. + +StandaloneManifestIndex - load manifests directly, and do lazy loading of +signatures on demand. No signatures are kept in memory. 
LazyLoadedIndex - selection on manifests with loading of index on demand. +(Consider using StandaloneManifestIndex instead.) CounterGather - an ancillary class returned by the 'counter_gather()' method. """ @@ -39,6 +43,7 @@ class MultiIndex - in-memory storage and selection of signatures from multiple from collections import namedtuple, Counter import csv from io import TextIOWrapper +from collections import defaultdict from ..search import make_jaccard_search_query, make_gather_query from ..manifest import CollectionManifest @@ -49,7 +54,12 @@ class MultiIndex - in-memory storage and selection of signatures from multiple IndexSearchResult = namedtuple('Result', 'score, signature, location') class Index(ABC): + # this will be removed soon; see sourmash#1894. is_database = False + + # 'manifest', when set, implies efficient selection and direct + # access to signatures. Signatures may be stored in the manifest + # or loaded on demand from disk depending on the class, however. manifest = None @abstractmethod @@ -933,6 +943,11 @@ def sigloc_iter(): # build manifest; note, signatures are stored in memory. # CTB: could do this on demand? + # CTB: should we use get_manifest functionality? + # CTB: note here that the manifest is created by iteration + # *even if it already exists.* This could be changed to be more + # efficient... but for now, use StandaloneManifestIndex if you + # want to avoid this when loading from multiple files. manifest = CollectionManifest.create_manifest(sigloc_iter()) # create! @@ -945,6 +960,8 @@ def load_from_directory(cls, pathname, *, force=False): Takes directory path plus optional boolean 'force'. Attempts to load all files ending in .sig or .sig.gz, by default; if 'force' is True, will attempt to load _all_ files, ignoring errors. + + Will not load anything other than JSON signature files. 
""" from ..sourmash_args import traverse_find_sigs @@ -1007,8 +1024,8 @@ def load_from_path(cls, pathname, force=False): def load_from_pathlist(cls, filename): """Create a MultiIndex from all files listed in a text file. - Note: this will load signatures from directories and databases, too, - if they are listed in the text file; it uses 'load_file_as_index' + Note: this will attempt to load signatures from each file, + including zip collections, etc; it uses 'load_file_as_index' underneath. """ from ..sourmash_args import (load_pathlist_from_file, @@ -1047,6 +1064,8 @@ class LazyLoadedIndex(Index): from disk every time they are needed (e.g. 'find(...)', 'signatures()'). Wrapper class; signatures dynamically loaded from disk; uses manifests. + + CTB: This may be redundant with StandaloneManifestIndex. """ def __init__(self, filename, manifest): "Create an Index with given filename and manifest." @@ -1139,3 +1158,126 @@ def select(self, **kwargs): new_manifest = manifest.select_to_manifest(**kwargs) return LazyLoadedIndex(self.filename, new_manifest) + + +class StandaloneManifestIndex(Index): + """Load a standalone manifest as an Index. + + This class is useful for the situation where you have a directory + with many signature collections underneath it, and you don't want to load + every collection each time you run sourmash. + + Instead, you can run 'sourmash sig manifest -o mf.csv' to + output a manifest and then use this class to load 'mf.csv' directly. + Sketch type selection, picklists, and pattern matching will all work + directly on the manifest and will load signatures only upon demand. + + One feature of this class is that absolute paths to sketches in + the 'internal_location' field of the manifests will be loaded properly. + This permits manifests to be constructed for various collections of + signatures that reside elsewhere, and not just below a single directory + prefix. + + StandaloneManifestIndex does _not_ store signatures in memory. 
+ + This class overlaps in concept with LazyLoadedIndex and behaves + identically when a manifest contains only rows from a single + on-disk Index object. However, unlike LazyLoadedIndex, this class + can be used to reference multiple on-disk Index objects. + + This class also overlaps in concept with MultiIndex when + MultiIndex.load_from_pathlist is used to load other Index + objects. However, this class does not store any signatures in + memory, unlike MultiIndex. + """ + is_database = True + + def __init__(self, manifest, location, *, prefix=None): + """Create object. 'location' is path of manifest file, 'prefix' is + prepended to signature paths when loading non-abspaths.""" + assert manifest is not None + self.manifest = manifest + self._location = location + self.prefix = prefix + + @classmethod + def load(cls, location, *, prefix=None): + """Load manifest file from given location. + + If prefix is None (default), it is automatically set from dirname. + Set prefix='' to avoid this, or provide an explicit prefix. + """ + if not os.path.isfile(location): + raise ValueError(f"provided manifest location '{location}' is not a file") + + with open(location, newline='') as fp: + m = CollectionManifest.load_from_csv(fp) + + if prefix is None: + prefix = os.path.dirname(location) + + return cls(m, location, prefix=prefix) + + @property + def location(self): + "Return the path to this manifest." + return self._location + + def signatures_with_location(self): + "Return an iterator over all signatures and their locations." + for ss, loc in self._signatures_with_internal(): + yield ss, loc + + def signatures(self): + "Return an iterator over all signatures." 
+ for ss, loc in self._signatures_with_internal(): + yield ss + + def _signatures_with_internal(self): + """Return an iterator over all sigs of (sig, internal_location) + + Note that this is implemented differently from most Index + objects in that it only lists subselected parts of the + manifest, and not the original manifest. This was done out of + convenience: we don't currently have access to the original + manifest in this class. + """ + # collect all internal locations + iloc_to_rows = defaultdict(list) + for row in self.manifest.rows: + iloc = row['internal_location'] + iloc_to_rows[iloc].append(row) + + # iterate over internal locations, selecting relevant sigs + for iloc, iloc_rows in iloc_to_rows.items(): + # prepend with prefix? + if not iloc.startswith('/') and self.prefix: + iloc = os.path.join(self.prefix, iloc) + + sub_mf = CollectionManifest(iloc_rows) + picklist = sub_mf.to_picklist() + + idx = sourmash.load_file_as_index(iloc) + idx = idx.select(picklist=picklist) + for ss in idx.signatures(): + yield ss, iloc + + def __len__(self): + "Number of signatures in this manifest (after any select)." + return len(self.manifest) + + def __bool__(self): + "Is this manifest empty?" + return bool(self.manifest) + + def save(self, *args): + raise NotImplementedError + + def insert(self, *args): + raise NotImplementedError + + def select(self, **kwargs): + "Run 'select' on the manifest." + new_manifest = self.manifest.select_to_manifest(**kwargs) + return StandaloneManifestIndex(new_manifest, self._location, + prefix=self.prefix) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 6b5eb09eaf..fb9119def4 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -58,6 +58,10 @@ class LCA_Database(Index): """ is_database = True + # we set manifest to None to avoid implication of fast on-disk access to + # sketches. This may be revisited later. 
+ manifest = None + def __init__(self, ksize, scaled, moltype='DNA'): self.ksize = int(ksize) self.scaled = int(scaled) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 25372cea02..36d9300e07 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -364,6 +364,12 @@ def _load_stdin(filename, **kwargs): return db +def _load_standalone_manifest(filename, **kwargs): + from sourmash.index import StandaloneManifestIndex + idx = StandaloneManifestIndex.load(filename) + return idx + + def _multiindex_load_from_pathlist(filename, **kwargs): "Load collection from a list of signature/database files" db = MultiIndex.load_from_pathlist(filename) @@ -416,6 +422,7 @@ def _load_zipfile(filename, **kwargs): # all loader functions, in order. _loader_functions = [ ("load from stdin", _load_stdin), + ("load from standalone manifest", _load_standalone_manifest), ("load from path (file or directory)", _multiindex_load_from_path), ("load from file list", _multiindex_load_from_pathlist), ("load SBT", _load_sbt), diff --git a/tests/test-data/scaled/mf.csv b/tests/test-data/scaled/mf.csv new file mode 100644 index 0000000000..e3ff4d09e7 --- /dev/null +++ b/tests/test-data/scaled/mf.csv @@ -0,0 +1,17 @@ +# SOURMASH-MANIFEST-VERSION: 1.0 +internal_location,md5,md5short,ksize,moltype,num,scaled,n_hashes,with_abundance,name,filename +all.lca.json,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,, +all.lca.json,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,, +all.lca.json,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,, +all.lca.json,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,, +all.lca.json,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,, +genome-s10+s11.fa.gz.sig,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,,../genome-s10+s11.fa.gz +genome-s11.fa.gz.sig,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,,../genome-s11.fa.gz 
+all.sbt.zip,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,,../genome-s10.fa.gz +all.sbt.zip,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,,../genome-s10-small.fa.gz +all.sbt.zip,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,,../genome-s11.fa.gz +all.sbt.zip,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,,../genome-s10+s11.fa.gz +all.sbt.zip,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,,../genome-s12.fa.gz +genome-s10-small.fa.gz.sig,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,,../genome-s10-small.fa.gz +genome-s12.fa.gz.sig,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,,../genome-s12.fa.gz +genome-s10.fa.gz.sig,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,,../genome-s10.fa.gz diff --git a/tests/test-data/scaled/pathlist.txt b/tests/test-data/scaled/pathlist.txt new file mode 100644 index 0000000000..32b8b3bacd --- /dev/null +++ b/tests/test-data/scaled/pathlist.txt @@ -0,0 +1,7 @@ +all.lca.json +all.sbt.zip +genome-s10+s11.fa.gz.sig +genome-s10-small.fa.gz.sig +genome-s10.fa.gz.sig +genome-s11.fa.gz.sig +genome-s12.fa.gz.sig diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 2168cd9555..d588171f20 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -3376,6 +3376,31 @@ def test_sig_describe_2_exclude_db_pattern(runtmp): assert line.strip() in out +def test_sig_describe_3_manifest_works(runtmp): + # test on a manifest with relative paths, in proper location + mf = utils.get_test_data('scaled/mf.csv') + runtmp.sourmash('sig', 'describe', mf, '--csv', 'out.csv') + + out = runtmp.last_result.out + print(out) + + with open(runtmp.output('out.csv'), newline='') as fp: + r = csv.reader(fp) + rows = list(r) + assert len(rows) == 16 # 15 signatures, plus head + + +def test_sig_describe_3_manifest_fails_when_moved(runtmp): + # test on a manifest with relative paths, when in wrong place; + # should 
fail, because actual signatures cannot be loaded now. + # note: this tests lazy loading. + mf = utils.get_test_data('scaled/mf.csv') + shutil.copyfile(mf, runtmp.output('mf.csv')) + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'describe', 'mf.csv') + + @utils.in_tempdir def test_sig_overlap(c): # get overlap details @@ -3566,6 +3591,13 @@ def test_sig_manifest_6_pathlist(runtmp): assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + # note: the manifest output for pathlists will contain the locations + # used in the pathlist. This is required by StandaloneManifestIndex. + for row in manifest.rows: + iloc = row['internal_location'] + print(iloc) + assert iloc.startswith('/'), iloc + def test_sig_manifest_does_not_exist(runtmp): with pytest.raises(SourmashCommandFailed): diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py index a0847fe97a..ee90fc7ba4 100644 --- a/tests/test_cmd_signature_fileinfo.py +++ b/tests/test_cmd_signature_fileinfo.py @@ -330,3 +330,37 @@ def test_sig_fileinfo_does_not_exist(runtmp): runtmp.run_sourmash('sig', 'fileinfo', 'does-not-exist') assert "Cannot open 'does-not-exist' as a sourmash signature collection" in runtmp.last_result.err + + +def test_sig_fileinfo_8_manifest_works(runtmp): + # test on a manifest with relative paths, in proper location + mf = utils.get_test_data('scaled/mf.csv') + runtmp.sourmash('sig', 'fileinfo', mf) + + out = runtmp.last_result.out + print(out) + + assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out + assert 'num signatures: 15' in out + assert 'has manifest? yes' in out + assert 'is database? 
yes' in out + assert 'path filetype: StandaloneManifestIndex' in out + + +def test_sig_fileinfo_8_manifest_works_when_moved(runtmp): + # test on a manifest with relative paths, when in wrong place + # note: this works, unlike 'describe', because all the necessary info + # for 'fileinfo' is in the manifest. + mf = utils.get_test_data('scaled/mf.csv') + shutil.copyfile(mf, runtmp.output('mf.csv')) + + runtmp.sourmash('sig', 'fileinfo', 'mf.csv') + + out = runtmp.last_result.out + print(out) + + assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out + assert 'num signatures: 15' in out + assert 'has manifest? yes' in out + assert 'is database? yes' in out + assert 'path filetype: StandaloneManifestIndex' in out diff --git a/tests/test_index.py b/tests/test_index.py index 95eebc6d34..d361517d59 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -13,7 +13,8 @@ from sourmash import load_one_signature, SourmashSignature from sourmash.index import (LinearIndex, ZipFileLinearIndex, make_jaccard_search_query, CounterGather, - LazyLinearIndex, MultiIndex) + LazyLinearIndex, MultiIndex, + StandaloneManifestIndex) from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory, Leaf from sourmash.sbtmh import SigLeaf @@ -21,6 +22,7 @@ from sourmash.search import JaccardSearch, SearchType from sourmash.picklist import SignaturePicklist, PickStyle from sourmash_tst_utils import SourmashCommandFailed +from sourmash.manifest import CollectionManifest import sourmash_tst_utils as utils @@ -2388,6 +2390,7 @@ def test_lazy_loaded_index_3_find(runtmp): x = list(x) assert len(x) == 0 + def test_revindex_index_search(): sig2 = utils.get_test_data("2.fa.sig") sig47 = utils.get_test_data("47.fa.sig") @@ -2485,3 +2488,258 @@ def is_found(ss, xx): assert not is_found(ss47, results) assert not is_found(ss2, results) assert is_found(ss63, results) + + +def test_standalone_manifest_signatures(runtmp): + # build a StandaloneManifestIndex and 
test 'signatures' method. + + ## first, build a manifest in memory using MultiIndex + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx1 = LinearIndex.load(sig47) + lidx2 = LinearIndex.load(sig63) + + mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "") + + ## got a manifest! ok, now test out StandaloneManifestIndex + mm = StandaloneManifestIndex(mi.manifest, None) + + siglist = [ ss for ss in mm.signatures() ] + assert len(siglist) == 2 + assert ss47 in siglist + assert ss63 in siglist + + +def test_standalone_manifest_signatures_prefix(runtmp): + # try out 'prefix' for StandaloneManifestIndex + + ## first, build a manifest in memory using MultiIndex + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx1 = LinearIndex.load(sig47) + lidx2 = LinearIndex.load(sig63) + mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "") + + # ok, now remove the abspath prefix from iloc + for row in mi.manifest.rows: + row['internal_location'] = os.path.basename(row['internal_location']) + + ## this should succeed! 
+ mm = StandaloneManifestIndex(mi.manifest, None, + prefix=utils.get_test_data('')) + + assert len(list(mm.signatures())) == 2 + + +def test_standalone_manifest_signatures_prefix_fail(runtmp): + # give StandaloneManifest the wrong prefix + + ## first, build a manifest in memory using MultiIndex + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx1 = LinearIndex.load(sig47) + lidx2 = LinearIndex.load(sig63) + print('XXX', lidx1.location) + + mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "") + + # remove prefix from manifest + for row in mi.manifest.rows: + row['internal_location'] = os.path.basename(row['internal_location']) + + ## got a manifest! ok, now test out StandaloneManifestIndex + mm = StandaloneManifestIndex(mi.manifest, None, prefix='foo') + + # should fail + with pytest.raises(ValueError) as exc: + list(mm.signatures()) + + assert "Error while reading signatures from 'foo/47.fa.sig'" in str(exc) + + +def test_standalone_manifest_load_from_dir(runtmp): + # test loading a mf with relative directory paths from test-data + mf = utils.get_test_data('scaled/mf.csv') + idx = sourmash.load_file_as_index(mf) + + siglist = list(idx.signatures()) + assert len(siglist) == 15 + + assert idx # should be 'True' + assert len(idx) == 15 + + with pytest.raises(NotImplementedError): + idx.insert() + + with pytest.raises(NotImplementedError): + idx.save('foo') + + assert idx.location == mf + + +def test_standalone_manifest_lazy_load(runtmp): + # check that it's actually doing lazy loading + orig_sig47 = utils.get_test_data('47.fa.sig') + sig47 = runtmp.output('47.fa.sig') + + # build an external manifest + shutil.copyfile(orig_sig47, sig47) + + # this is an abspath to sig47 + runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf.csv') + + # should work to get signatures: + idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + + 
siglist = list(idx.signatures()) + assert len(siglist) == 1 + + # now remove! + os.unlink(sig47) + + # can still access manifest... + assert len(idx) == 1 + + # ...but we should get an error when we call signatures. + with pytest.raises(ValueError): + list(idx.signatures()) + + # but put it back, and all is forgiven. yay! + shutil.copyfile(orig_sig47, sig47) + x = list(idx.signatures()) + assert len(x) == 1 + + +def test_standalone_manifest_lazy_load_2_prefix(runtmp): + # check that it's actually doing lazy loading; supply explicit prefix + orig_sig47 = utils.get_test_data('47.fa.sig') + sig47 = runtmp.output('47.fa.sig') + + # build an external manifest + # note, here use a relative path to 47.fa.sig; the manifest will contain + # just '47.fa.sig' as the location + shutil.copyfile(orig_sig47, sig47) + runtmp.sourmash('sig', 'manifest', '47.fa.sig', '-o', 'mf.csv') + + # should work to get signatures: + idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'), + prefix=runtmp.output('')) + + siglist = list(idx.signatures()) + assert len(siglist) == 1 + + # now remove! + os.unlink(sig47) + + # can still access manifest... + assert len(idx) == 1 + + # ...but we should get an error when we call signatures. + with pytest.raises(ValueError): + list(idx.signatures()) + + # but put it back, and all is forgiven. yay! + shutil.copyfile(orig_sig47, sig47) + x = list(idx.signatures()) + assert len(x) == 1 + + +def test_standalone_manifest_search(runtmp): + # test a straight up 'search' + query_sig = utils.get_test_data('scaled/genome-s12.fa.gz.sig') + mf = utils.get_test_data('scaled/mf.csv') + + runtmp.sourmash('search', query_sig, mf) + + out = runtmp.last_result.out + print(out) + assert '100.0% d84ef28f' in out + + +def test_standalone_manifest_prefetch_lazy(runtmp): + # check that prefetch is actually doing lazy loading on manifest index. 
+ orig_sig47 = utils.get_test_data('47.fa.sig') + sig47 = runtmp.output('47.fa.sig') + orig_sig2 = utils.get_test_data('2.fa.sig') + sig2 = runtmp.output('2.fa.sig') + orig_sig63 = utils.get_test_data('63.fa.sig') + sig63 = runtmp.output('63.fa.sig') + + shutil.copyfile(orig_sig47, sig47) + runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf1.csv') + shutil.copyfile(orig_sig2, sig2) + runtmp.sourmash('sig', 'manifest', sig2, '-o', 'mf2.csv') + shutil.copyfile(orig_sig63, sig63) + runtmp.sourmash('sig', 'manifest', sig63, '-o', 'mf3.csv') + + # combine the manifests, manually for now... + with open(runtmp.output('mf1.csv'), newline='') as fp: + mf1 = CollectionManifest.load_from_csv(fp) + assert len(mf1) == 1 + + with open(runtmp.output('mf2.csv'), newline='') as fp: + mf2 = CollectionManifest.load_from_csv(fp) + assert len(mf2) == 3 + + with open(runtmp.output('mf3.csv'), newline='') as fp: + mf3 = CollectionManifest.load_from_csv(fp) + assert len(mf3) == 1 + + all_rows = list(mf1.rows) + list(mf2.rows) + list(mf3.rows) + print(all_rows) + mf = CollectionManifest(all_rows) + assert len(mf) == 5 + with open(runtmp.output('mf.csv'), 'w', newline='') as fp: + mf.write_to_csv(fp, write_header=True) + + # ok! now, remove the last signature, 'sig63'. + os.unlink(sig63) + + # ...but loading the manifest should still work. + idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + + # double check - third load will fail. this relies on load order :shrug:. + sig_iter = iter(idx.signatures()) + ss = next(sig_iter) + print(ss) + assert '47.fa' in ss.filename + + for i in range(3): + ss = next(sig_iter) + print(i, ss) + assert '2.fa' in ss.filename + + with pytest.raises(ValueError) as exc: + ss = next(sig_iter) + assert 'Error while reading signatures from' in str(exc) + assert '63.fa.sig' in str(exc) + + # ok! now test prefetch... should get one match legit, to 47, + # and then no matches to 2, and then error. 
+ + ss47 = sourmash.load_one_signature(sig47) + idx = idx.select(ksize=31) + g = idx.prefetch(ss47, threshold_bp=0) + + # first value: + sr = next(g) + assert sr.signature == ss47 + + # second value should raise error. + with pytest.raises(ValueError) as exc: + sr = next(g) + + assert 'Error while reading signatures from' in str(exc) + assert '63.fa.sig' in str(exc) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9f483f9a1e..b775bdf193 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5196,3 +5196,58 @@ def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out assert "found 1 matches total;" in runtmp.last_result.out + + +def test_standalone_manifest_search(runtmp): + # test loading/searching a manifest file from the command line. + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + dirname = runtmp.output('somedir') + os.mkdir(dirname) + subdir = runtmp.output('somedir/subdir') + os.mkdir(subdir) + shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) + shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + + # for now, the output manifest must be within top level dir for + # CLI stuff to work properly. + mf = os.path.join(dirname, 'mf.csv') + + # build manifest... + runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + + # ...and now use for a search! + runtmp.sourmash('search', sig47, mf) + + out = runtmp.last_result.out + print(out) + print(runtmp.last_result.err) + + assert "100.0% NC_009665.1 Shewanella baltica OS185, complete genome" in out + + +def test_standalone_manifest_search_fail(runtmp): + # test loading/searching a manifest file from the command line; should + # fail if manifest is not located within tld. 
+ sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + dirname = runtmp.output('somedir') + os.mkdir(dirname) + subdir = runtmp.output('somedir/subdir') + os.mkdir(subdir) + shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) + shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + + # for now, the output manifest must be within top level dir for + # CLI stuff to work properly. here we intentionally break this, + # for testing purposes. + mf = runtmp.output('mf.csv') + + # build manifest... + runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + + # ...and now use for a search! + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('search', sig47, mf)