Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add sum_hashes to sourmash sig describe output. #1882

Merged
merged 8 commits into from
Mar 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -989,20 +989,25 @@ Display signature details.

For example,
```
sourmash sig describe tests/test-data/47.fa.sig
sourmash sig describe tests/test-data/track_abund/47.fa.sig
```
will display:

```
signature filename: tests/test-data/47.fa.sig
signature filename: tests/test-data/track_abund/47.fa.sig
signature: NC_009665.1 Shewanella baltica OS185, complete genome
source file: 47.fa
source file: podar-ref/47.fa
md5: 09a08691ce52952152f0e866a59f6261
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=1
size: 5177
sum hashes: 5292
signature license: CC0
```

Here, `size` is the number of distinct hashes in the sketch, and
`sum hashes` (the `sum_hashes` column in CSV output) is the total number of
hashes in the sketch, with each hash counted as many times as its abundance.
When `track_abundance` is 0, each hash is counted once, so `size` is always
the same as `sum hashes`.

### `sourmash signature fileinfo` - display a summary of the contents of a sourmash collection

Display signature file, database, or collection.
Expand Down
3 changes: 2 additions & 1 deletion src/sourmash/cli/sig/describe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""show details of signature"""

from sourmash.cli.utils import (add_moltype_args, add_ksize_arg,
add_picklist_args)
add_picklist_args, add_pattern_args)


def subparser(subparsers):
Expand All @@ -26,6 +26,7 @@ def subparser(subparsers):
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
add_picklist_args(subparser)
add_pattern_args(subparser)


def main(args):
Expand Down
11 changes: 8 additions & 3 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def describe(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
picklist = sourmash_args.load_picklist(args)
pattern_search = sourmash_args.load_include_exclude_db_patterns(args)
_extend_signatures_with_from_file(args)

# write CSV?
Expand All @@ -214,11 +215,11 @@ def describe(args):
csv_obj = sourmash_args.FileOutputCSV(args.csv)
csv_fp = csv_obj.open()

# CTB: might want to switch to sourmash_args.FileOutputCSV here?
w = csv.DictWriter(csv_fp,
['signature_file', 'md5', 'ksize', 'moltype',
'num', 'scaled', 'n_hashes', 'seed',
'with_abundance', 'name', 'filename', 'license'],
'with_abundance', 'name', 'filename', 'license',
'sum_hashes'],
extrasaction='ignore')
w.writeheader()

Expand All @@ -230,17 +231,20 @@ def describe(args):
picklist=picklist,
progress=progress,
yield_all_files=args.force,
force=args.force)
force=args.force,
pattern=pattern_search)

for sig, location in loader:
# extract info, write as appropriate.
signature_file = location
mh = sig.minhash
ksize = mh.ksize
moltype = mh.moltype
scaled = mh.scaled
num = mh.num
seed = mh.seed
n_hashes = len(mh)
sum_hashes = sum(mh.hashes.values())
with_abundance = 0
if mh.track_abundance:
with_abundance = 1
Expand All @@ -262,6 +266,7 @@ def describe(args):
md5: {md5}
k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
size: {n_hashes}
sum hashes: {sum_hashes}
signature license: {license}
''', **locals())

Expand Down
1 change: 0 additions & 1 deletion tests/test-data/47.abunds.fa.sig

This file was deleted.

2 changes: 1 addition & 1 deletion tests/test_bugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
@utils.in_tempdir
def test_bug_803(c):
# can we do a 'sourmash search' on an LCA database and a query with abundance?
query = utils.get_test_data('47.abunds.fa.sig')
query = utils.get_test_data('track_abund/47.fa.sig')
lca_db = utils.get_test_data('lca/47+63.lca.json')

c.run_sourmash('search', query, lca_db, '--ignore-abundance')
Expand Down
111 changes: 110 additions & 1 deletion tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -2800,7 +2800,7 @@ def test_sig_describe_1_hp(c):
c.run_sourmash('sig', 'describe', computed_sig)

out = c.last_result.out
print(c.last_result)
print(c.last_result.out)

# Add final trailing slash for this OS
testdata_dirname = os.path.dirname(testdata) + os.sep
Expand All @@ -2814,6 +2814,7 @@ def test_sig_describe_1_hp(c):
md5: e45a080101751e044d6df861d3d0f3fd
k=7 molecule=protein num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2823,6 +2824,7 @@ def test_sig_describe_1_hp(c):
md5: c027e96c3379d38942639219daa24fdc
k=7 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2841,6 +2843,7 @@ def test_sig_describe_1_hp(c):
md5: 1136a8a68420bd93683e45cdaf109b80
k=21 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2850,6 +2853,7 @@ def test_sig_describe_1_hp(c):
md5: 4244d1612598af044e799587132f007e
k=10 molecule=protein num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2859,6 +2863,7 @@ def test_sig_describe_1_hp(c):
md5: 396dcb7c1875f48ca31e0759bec72ee1
k=10 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2868,6 +2873,7 @@ def test_sig_describe_1_hp(c):
md5: 4c43878296459783dbd6a4a071ab7e9d
k=10 molecule=hp num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

---
Expand All @@ -2877,6 +2883,7 @@ def test_sig_describe_1_hp(c):
md5: 71f7c111c01785e5f38efad45b00a0e1
k=30 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0
size: 500
sum hashes: 500
signature license: CC0

""".splitlines()
Expand Down Expand Up @@ -2982,6 +2989,29 @@ def test_sig_describe_1_zipfile(c):
assert line.strip() in out


def test_sig_describe_1_sig_abund(runtmp):
    # 'sig describe' on a sketch with abundances: the text report should
    # show both the distinct hash count and the abundance-weighted sum.
    abund_sig = utils.get_test_data('track_abund/47.fa.sig')
    runtmp.run_sourmash('sig', 'describe', abund_sig)

    out = runtmp.last_result.out
    print(out)

    expected_output = """\
signature: NC_009665.1 Shewanella baltica OS185, complete genome
source file: podar-ref/47.fa
md5: 09a08691ce52952152f0e866a59f6261
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=1
size: 5177
sum hashes: 5292
signature license: CC0
"""
    # every expected line must show up somewhere in the command output
    for expected in expected_output.splitlines():
        assert expected.strip() in out


@utils.in_thisdir
def test_sig_describe_stdin(c):
sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
Expand Down Expand Up @@ -3045,6 +3075,37 @@ def test_sig_describe_2_csv(runtmp):
assert n == 2


def test_sig_describe_2_csv_abund(runtmp):
    # output info in CSV spreadsheet, for abund sig: every column written
    # by 'sig describe --csv' is checked, including the new 'sum_hashes'.
    c = runtmp

    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    c.run_sourmash('sig', 'describe', sig47, '--csv', 'out.csv')

    # newline='' is what the csv module asks for when handing it a file;
    # read all rows while the file is open, then assert after it is closed.
    with open(c.output('out.csv'), 'rt', newline='') as fp:
        rows = list(csv.DictReader(fp))

    # a single signature in, a single row out
    assert len(rows) == 1
    row = rows[0]

    # csv.DictReader yields strings, so numeric columns compare as text
    expected = {
        'signature_file': sig47,
        'md5': "09a08691ce52952152f0e866a59f6261",
        'ksize': "31",
        'moltype': "DNA",
        'num': "0",
        'scaled': "1000",
        'n_hashes': "5177",
        'seed': "42",
        'with_abundance': "1",
        'name': "NC_009665.1 Shewanella baltica OS185, complete genome",
        'filename': "podar-ref/47.fa",
        'license': "CC0",
        'sum_hashes': "5292",
    }
    for column, value in expected.items():
        # name the column in the failure message to ease debugging
        assert row[column] == value, column


def test_sig_describe_2_csv_as_picklist(runtmp):
# generate an output CSV from describe and then use it as a manifest
# pickfile
Expand Down Expand Up @@ -3074,6 +3135,54 @@ def test_sig_describe_2_csv_as_picklist(runtmp):
assert line.strip() in out


def test_sig_describe_2_include_db_pattern(runtmp):
    # test sig describe --include-db-pattern: only signatures matching
    # the pattern should be described.
    c = runtmp

    allzip = utils.get_test_data('prot/all.zip')

    c.run_sourmash('sig', 'describe', allzip,
                   '--include-db-pattern', 'os185')

    out = c.last_result.out
    print(c.last_result)

    # track_abundance=0 here, so 'sum hashes' must equal 'size'; assert it
    # so this test also covers the 'sum hashes' line of the report.
    expected_output = """\
signature: NC_009665.1 Shewanella baltica OS185, complete genome
source file: 47.fa
md5: 09a08691ce52952152f0e866a59f6261
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 5177
sum hashes: 5177
signature license: CC0
""".splitlines()
    for line in expected_output:
        assert line.strip() in out


def test_sig_describe_2_exclude_db_pattern(runtmp):
    # test sig describe --exclude-db-pattern: signatures matching the
    # pattern are dropped; the remaining DNA k=31 sketch is described.
    c = runtmp

    allzip = utils.get_test_data('prot/all.zip')

    c.run_sourmash('sig', 'describe', allzip, '--dna', '-k', '31',
                   '--exclude-db-pattern', 'os223')

    out = c.last_result.out
    print(c.last_result)

    # track_abundance=0 here, so 'sum hashes' must equal 'size'; assert it
    # so this test also covers the 'sum hashes' line of the report.
    expected_output = """\
signature: NC_009665.1 Shewanella baltica OS185, complete genome
source file: 47.fa
md5: 09a08691ce52952152f0e866a59f6261
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 5177
sum hashes: 5177
signature license: CC0
""".splitlines()
    for line in expected_output:
        assert line.strip() in out


@utils.in_tempdir
def test_sig_overlap(c):
# get overlap details
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cmd_signature_fileinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_fileinfo_1_sig_summarize(runtmp):

def test_fileinfo_1_sig_abund(runtmp):
# get basic info on a signature with abundance
sig47 = utils.get_test_data('47.abunds.fa.sig')
sig47 = utils.get_test_data('track_abund/47.fa.sig')

shutil.copyfile(sig47, runtmp.output('sig47.sig'))
runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig')
Expand Down