Skip to content

Commit

Permalink
[MRG] add sourmash sig check for comparing picklists and databases (#…
Browse files Browse the repository at this point in the history
…1907)

* copy sig check code over

* add --fail to sig check

* require manifests etc

* test no --picklist

* test nosave manifest

* test __iadd__ for manifests

* remove 'add_to_found'

* revert

* simplify per @bluegenes
  • Loading branch information
ctb authored Mar 30, 2022
1 parent a78361d commit 3b267fe
Show file tree
Hide file tree
Showing 12 changed files with 592 additions and 38 deletions.
22 changes: 21 additions & 1 deletion doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,25 @@ iterating over the signatures in the input file. This can be slow for
large collections. Use `--no-rebuild-manifest` to load an existing
manifest if it is available.

### `sourmash signature check` - compare picklists and manifests

Compare picklists and manifests across databases, and optionally output matches
and missing items.

For example,
```
sourmash sig check tests/test-data/gather/GCF*.sig \
--picklist tests/test-data/gather/salmonella-picklist.csv::manifest
```
will load all of the `GCF` signatures and compare them to the given picklist.
With `-o/--output-missing`, `sig check` will save unmatched elements of the
picklist CSV. With `--save-manifest-matching`, `sig check` will save all
of the _matched_ elements to a manifest file, which can then be used as a
sourmash database.

`sourmash sig check` is particularly useful when working with large
collections of signatures and identifiers.

## Advanced command-line usage

### Loading signatures and databases
Expand Down Expand Up @@ -1632,7 +1651,8 @@ internals to speed up signature selection through picklists and
pattern matching.

Manifests can _also_ be used externally (via the command-line), and
may be useful for organizing large collections of signatures.
may be useful for organizing large collections of signatures. They can
be generated with `sourmash sig manifest` as well as `sourmash sig check`.

Suppose you have a large collection of signature (`.sig` or `.sig.gz`
files) under a directory. You can create a manifest file for them like so:
Expand Down
1 change: 1 addition & 0 deletions src/sourmash/cli/sig/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from . import fileinfo as summarize
from . import grep
from . import kmers
from . import check
from . import intersect
from . import inflate
from . import manifest
Expand Down
62 changes: 62 additions & 0 deletions src/sourmash/cli/sig/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""check signature collections against a picklist"""

usage="""
sourmash sig check <filenames> --picklist ... -o miss.csv -m manifest.csv
This will check the signature contents of <filenames> against the given
picklist, optionally outputting the unmatched picklist rows to 'miss.csv'
and optionally outputting a manifest of the matched signatures to
'manifest.csv'.
By default, 'sig check' requires a pre-existing manifest for collections;
this prevents potentially slow manifest rebuilding. You
can turn this check off with '--no-require-manifest'.
"""

from sourmash.cli.utils import (add_moltype_args, add_ksize_arg,
add_picklist_args, add_pattern_args)


def subparser(subparsers):
subparser = subparsers.add_parser('check', usage=usage)
subparser.add_argument('signatures', nargs='*')
subparser.add_argument(
'-q', '--quiet', action='store_true',
help='suppress non-error output'
)
subparser.add_argument(
'-o', '--output-missing', metavar='FILE',
help='output picklist with remaining unmatched entries to this file',
)
subparser.add_argument(
'-f', '--force', action='store_true',
help='try to load all files as signatures'
)
subparser.add_argument(
'--from-file',
help='a text file containing a list of files to load signatures from'
)
subparser.add_argument(
'-m', '--save-manifest-matching',
help='save a manifest of the matching entries to this file.'
)
subparser.add_argument(
'--fail-if-missing', action='store_true',
help='exit with an error code (-1) if there are any missing picklist values.'
)
subparser.add_argument(
'--no-require-manifest',
help='do not require a manifest; generate dynamically if needed',
action='store_true'
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
add_pattern_args(subparser)
add_picklist_args(subparser)


def main(args):
import sourmash
return sourmash.sig.__main__.check(args)
29 changes: 25 additions & 4 deletions src/sourmash/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,25 @@ class CollectionManifest:

def __init__(self, rows):
"Initialize from an iterable of metadata dictionaries."
self.rows = tuple(rows)
self.rows = ()
self._md5_set = set()

# build a fast lookup table for md5sums in particular
md5set = set()
self._add_rows(rows)

def _add_rows(self, rows):
self.rows += tuple(rows)

# maintain a fast lookup table for md5sums
md5set = self._md5_set
for row in self.rows:
md5set.add(row['md5'])
self._md5_set = md5set

def __iadd__(self, other):
self._add_rows(other.rows)
return self

def __add__(self, other):
return CollectionManifest(self.rows + other.rows)

def __bool__(self):
return bool(self.rows)
Expand All @@ -44,6 +56,11 @@ def __len__(self):
def __eq__(self, other):
return self.rows == other.rows

@classmethod
def load_from_filename(cls, filename):
with open(filename, newline="") as fp:
return cls.load_from_csv(fp)

@classmethod
def load_from_csv(cls, fp):
"load a manifest from a CSV file."
Expand Down Expand Up @@ -80,6 +97,10 @@ def load_from_csv(cls, fp):

return cls(manifest_list)

def write_to_filename(self, filename):
with open(filename, "w", newline="") as fp:
return self.write_to_csv(fp, write_header=True)

@classmethod
def write_csv_header(cls, fp):
"write header for manifest CSV format"
Expand Down
15 changes: 15 additions & 0 deletions src/sourmash/picklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ def __init__(self, coltype, *, pickfile=None, column_name=None,
valid_coltypes.update(self.supported_coltypes)
if coltype not in valid_coltypes:
raise ValueError(f"invalid picklist column type '{coltype}'")
self.orig_coltype = coltype
self.orig_colname = column_name

# if we're using gather or prefetch or manifest, set column_name
# automatically (after checks).
Expand Down Expand Up @@ -226,6 +228,19 @@ def matches_manifest_row(self, row):
return True
return False

def matched_csv_row(self, row):
"""did the given CSV row object match this picklist?
This is used for examining matches/nomatches to original picklist file.
"""
q = row[self.column_name]
q = self.preprocess_fn(q)
self.n_queries += 1

if q in self.found:
return True
return False

def filter(self, it):
"yield all signatures in the given iterator that are in the picklist"
for ss in it:
Expand Down
141 changes: 120 additions & 21 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,34 @@ def kmers(args):
_SketchInfo = namedtuple('_SketchInfo', 'ksize, moltype, scaled, num, abund')


def _summarize_manifest(manifest):
info_d = {}

# use a namedtuple to track counts of distinct sketch types and n hashes
total_size = 0
counter = Counter()
hashcounts = Counter()
for row in manifest.rows:
ski = _SketchInfo(ksize=row['ksize'], moltype=row['moltype'],
scaled=row['scaled'], num=row['num'],
abund=row['with_abundance'])
counter[ski] += 1
hashcounts[ski] += row['n_hashes']
total_size += row['n_hashes']

# store in info_d
info_d['total_hashes'] = total_size
sketch_info = []
for ski, count in counter.items():
sketch_d = dict(ski._asdict())
sketch_d['count'] = count
sketch_d['n_hashes'] = hashcounts[ski]
sketch_info.append(sketch_d)
info_d['sketch_info'] = sketch_info

return info_d


# NOTE: also aliased as 'summarize'
def fileinfo(args):
"""
Expand Down Expand Up @@ -1244,27 +1272,7 @@ def fileinfo(args):
notify("** no manifest and cannot be generated; exiting.")
sys.exit(0)

# use a namedtuple to track counts of distinct sketch types and n hashes
total_size = 0
counter = Counter()
hashcounts = Counter()
for row in manifest.rows:
ski = _SketchInfo(ksize=row['ksize'], moltype=row['moltype'],
scaled=row['scaled'], num=row['num'],
abund=row['with_abundance'])
counter[ski] += 1
hashcounts[ski] += row['n_hashes']
total_size += row['n_hashes']

# store in info_d
info_d['total_hashes'] = total_size
sketch_info = []
for ski, count in counter.items():
sketch_d = dict(ski._asdict())
sketch_d['count'] = count
sketch_d['n_hashes'] = hashcounts[ski]
sketch_info.append(sketch_d)
info_d['sketch_info'] = sketch_info
info_d.update(_summarize_manifest(manifest))

if text_out:
print_results(f"total hashes: {info_d['total_hashes']}")
Expand All @@ -1283,6 +1291,97 @@ def fileinfo(args):
print(json.dumps(info_d))


def check(args):
"""
check signature db(s) against a picklist.
"""
from sourmash.picklist import PickStyle
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
picklist = sourmash_args.load_picklist(args)
pattern_search = sourmash_args.load_include_exclude_db_patterns(args)
_extend_signatures_with_from_file(args)

if not picklist:
error("** No picklist provided?! Exiting.")
sys.exit(-1)

if picklist.pickstyle == PickStyle.EXCLUDE and args.output_missing:
error("** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'")
sys.exit(-1)

# require manifests?
require_manifest = True
if args.no_require_manifest:
require_manifest = False
debug("sig check: manifest will not be required")
else:
debug("sig check: manifest required")

total_manifest_rows = []

# start loading!
total_rows_examined = 0
for filename in args.signatures:
idx = sourmash_args.load_file_as_index(filename,
yield_all_files=args.force)

idx = idx.select(ksize=args.ksize, moltype=moltype)

if idx.manifest is None and require_manifest:
error(f"ERROR on filename '{filename}'.")
error("sig check requires a manifest by default, but no manifest present.")
error("specify --no-require-manifest to dynamically generate one.")
sys.exit(-1)

# has manifest, or ok to build (require_manifest=False) - continue!
manifest = sourmash_args.get_manifest(idx, rebuild=True)
manifest_rows = manifest._select(picklist=picklist)
total_rows_examined += len(manifest)
total_manifest_rows += manifest_rows

notify(f"loaded {total_rows_examined} signatures.")

sourmash_args.report_picklist(args, picklist)

# output picklist of non-matching in same format as input picklist
n_missing = len(picklist.pickset - picklist.found)
if args.output_missing and n_missing:
pickfile = picklist.pickfile

# go through the input file and pick out missing rows.
n_input = 0
n_output = 0
with open(pickfile, newline='') as csvfp:
r = csv.DictReader(csvfp)

with open(args.output_missing, "w", newline='') as outfp:
w = csv.DictWriter(outfp, fieldnames=r.fieldnames)
w.writeheader()

for row in r:
n_input += 1
if not picklist.matched_csv_row(row):
n_output += 1
w.writerow(row)
notify(f"saved {n_output} non-matching rows of {n_input} picklist rows to '{args.output_missing}'")
elif args.output_missing:
notify(f"(no remaining picklist entries; not saving to '{args.output_missing}')")

# save manifest of matching!
if args.save_manifest_matching and total_manifest_rows:
mf = CollectionManifest(total_manifest_rows)
with open(args.save_manifest_matching, 'w', newline="") as fp:
mf.write_to_csv(fp, write_header=True)
notify(f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'")
elif args.save_manifest_matching:
notify(f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)")

if args.fail_if_missing:
error("** ERROR: missing values, and --fail-if-missing requested. Exiting.")
sys.exit(-1)


def main(arglist=None):
args = sourmash.cli.get_parser().parse_args(arglist)
submod = getattr(sourmash.cli.sig, args.subcmd)
Expand Down
3 changes: 2 additions & 1 deletion src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,8 @@ def open(self):
return self.fp

def close(self):
self.fp.close()
if self.fp is not None: # in case of stdout
self.fp.close()

def __enter__(self):
return self.open()
Expand Down
26 changes: 26 additions & 0 deletions tests/test-data/gather/salmonella-picklist-diffcolumn.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name2
"NOT THERE"
"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome"
"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome"
"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome"
"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome"
"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome"
"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome"
"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome"
"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome"
"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome"
NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome
NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome
NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome
NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome
NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome
NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome
"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome"
"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome"
"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome"
"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome"
"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome"
"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome"
"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome"
"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome"
"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome"
9 changes: 9 additions & 0 deletions tests/test-data/v6.sbt.zip.mf.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SOURMASH-MANIFEST-VERSION: 1.0
internal_location,md5,md5short,ksize,moltype,num,scaled,n_hashes,with_abundance,name,filename
6d6e87e1154e95b279e5e7db414bc37b,6d6e87e1154e95b279e5e7db414bc37b,6d6e87e1,31,DNA,500,0,500,0,,SRR2255622_1.fastq.gz
60f7e23c24a8d94791cc7a8680c493f9,60f7e23c24a8d94791cc7a8680c493f9,60f7e23c,31,DNA,500,0,500,0,,SRR2060939_1.fastq.gz
0107d767a345eff67ecdaed2ee5cd7ba,0107d767a345eff67ecdaed2ee5cd7ba,0107d767,31,DNA,500,0,500,0,,SRR453566_1.fastq.gz
f71e78178af9e45e6f1d87a0c53c465c,f71e78178af9e45e6f1d87a0c53c465c,f71e7817,31,DNA,500,0,500,0,,SRR2241509_1.fastq.gz
f0c834bc306651d2b9321fb21d3e8d8f,f0c834bc306651d2b9321fb21d3e8d8f,f0c834bc,31,DNA,500,0,500,0,,SRR453569_1.fastq.gz
4e94e60265e04f0763142e20b52c0da1,4e94e60265e04f0763142e20b52c0da1,4e94e602,31,DNA,500,0,500,0,,SRR2060939_2.fastq.gz
b59473c94ff2889eca5d7165936e64b3,b59473c94ff2889eca5d7165936e64b3,b59473c9,31,DNA,500,0,500,0,,SRR453570_1.fastq.gz
Loading

0 comments on commit 3b267fe

Please sign in to comment.