Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] fix issue with identifiers in the LCA code #1347

Merged
merged 9 commits into from
Feb 26, 2021
47 changes: 31 additions & 16 deletions src/sourmash/command_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def _compute_individual(args, signatures_factory):

if args.singleton:
siglist = []
n = None
for n, record in enumerate(screed.open(filename)):
# make a new signature for each sequence
sigs = signatures_factory()
Expand All @@ -169,15 +170,19 @@ def _compute_individual(args, signatures_factory):
set_sig_name(sigs, filename, name=record.name)
siglist.extend(sigs)

notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
if n is not None:
notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
else:
notify(f"no sequences found in '{filename}'?!")
else:
# make a single sig for the whole file
sigs = signatures_factory()

# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
n = None
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
Expand All @@ -188,12 +193,15 @@ def _compute_individual(args, signatures_factory):
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

notify('...{} {} sequences', filename, n, end='')
if n is not None: # don't write out signatures if no input
ctb marked this conversation as resolved.
Show resolved Hide resolved
notify('...{} {} sequences', filename, n, end='')

set_sig_name(sigs, filename, name)
siglist.extend(sigs)
set_sig_name(sigs, filename, name)
siglist.extend(sigs)

notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}')
notify(f'calculated {len(siglist)} signatures for {n+1} sequences in {filename}')
else:
notify(f"no sequences found in '{filename}'?!")

# if no --output specified, save to individual files w/in for loop
if not args.output:
Expand All @@ -202,7 +210,8 @@ def _compute_individual(args, signatures_factory):

# if --output specified, all collected signatures => args.output
if args.output:
save_siglist(siglist, args.output)
if siglist:
save_siglist(siglist, args.output)
siglist = []

assert not siglist # juuuust checking.
Expand All @@ -212,28 +221,31 @@ def _compute_merged(args, signatures_factory):
# make a signature for the whole file
sigs = signatures_factory()

n = 0
total_seq = 0
for filename in args.filenames:
# consume & calculate signatures
notify('... reading sequences from {}', filename)

n = None
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0 and n:
notify('\r... {} {}', filename, n, end='')

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
notify('... {} {} sequences', filename, n + 1)

total_seq += n + 1
if n is not None:
notify('... {} {} sequences', filename, n + 1)
total_seq += n + 1
else:
notify(f"no sequences found in '{filename}'?!")

set_sig_name(sigs, filename, name=args.merge)
notify('calculated 1 signature for {} sequences taken from {} files',
total_seq, len(args.filenames))
if total_seq:
set_sig_name(sigs, filename, name=args.merge)
notify('calculated 1 signature for {} sequences taken from {} files',
total_seq, len(args.filenames))

# at end, save!
save_siglist(sigs, args.output)
# at end, save!
save_siglist(sigs, args.output)


def add_seq(sigs, seq, input_is_protein, check_sequence):
Expand All @@ -245,9 +257,12 @@ def add_seq(sigs, seq, input_is_protein, check_sequence):


def set_sig_name(sigs, filename, name=None):
if filename == '-': # if stdin, set filename to empty.
filename = ''
for sig in sigs:
if name is not None:
sig._name = name

sig.filename = filename


Expand Down
10 changes: 7 additions & 3 deletions src/sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def index(args):
n = 0
total_n = len(inp_files)
record_duplicates = set()
record_no_lineage = set()
record_no_lineage = []
record_remnants = set(assignments)
record_used_lineages = set()
record_used_idents = set()
Expand All @@ -207,7 +207,11 @@ def index(args):
md5_to_name[sig.md5sum()] = str(sig)

# parse identifier, potentially with splitting
ident = sig.name
if sig.name:
ident = sig.name
else:
ident = sig.filename

if args.split_identifiers: # hack for NCBI-style names, etc.
# split on space...
ident = ident.split(' ')[0]
Expand Down Expand Up @@ -242,7 +246,7 @@ def index(args):
# track lineage info - either no lineage, or this lineage used.
else:
debug('WARNING: no lineage assignment for {}.', ident)
record_no_lineage.add(ident)
record_no_lineage.append(ident)

# end main add signatures loop

Expand Down
8 changes: 3 additions & 5 deletions src/sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,11 @@ def insert(self, sig, ident=None, lineage=None):
except ValueError:
raise ValueError("cannot downsample signature; is it a scaled signature?")

if ident is None:
ident = sig.name
if not ident:
ident = sig.filename
if not ident:
ident = str(sig)

if ident in self.ident_to_name:
raise ValueError("signature {} is already in this LCA db.".format(ident))
raise ValueError("signature '{}' is already in this LCA db.".format(ident))

# before adding, invalide any caching from @cached_property
self._invalidate_cache()
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def describe(args):
with_abundance = 1
md5 = sig.md5sum()
name = sig.name or "** no name **"
filename = sig.filename
filename = sig.filename or "** no name **"
license = sig.license

if w:
Expand Down
29 changes: 29 additions & 0 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,35 @@ def test_sig_describe_stdin(c):
assert 'signature: GCA_001593925' in c.last_result.out


@utils.in_tempdir
def test_sig_describe_empty(c):
sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')

ss = sourmash.load_file_as_signatures(sig)
ss = list(ss)
assert len(ss) == 1
ss = ss[0]

ss.name = ''
ss.filename = ''

outsig = c.output('xxx.sig')
with open(outsig, 'wt') as fp:
sourmash.save_signatures([ss], fp)

ss = sourmash.load_file_as_signatures(outsig)
ss = list(ss)
assert len(ss) == 1
ss = ss[0]
assert ss.name == ''
assert ss.filename == ''

c.run_sourmash('sig', 'describe', outsig)
print(c.last_result.out)
assert 'signature: ** no name **' in c.last_result.out
assert 'source file: ** no name **' in c.last_result.out


@utils.in_tempdir
def test_sig_describe_2(c):
# get info in CSV spreadsheet
Expand Down
41 changes: 41 additions & 0 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,24 @@ def test_api_create_insert_bad_ksize():
lca_db.insert(ss)


def test_api_create_insert_bad_ident():
# can we insert a signature with no/empty ident?
ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'),
ksize=31)
ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'),
ksize=31)
ss1.name = ''
ss1.filename = ''
ss2.name = ''
ss2.filename = ''

lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)
lca_db.insert(ss1)
lca_db.insert(ss2)
# SUCCESS!
# would fail, previously :)


def test_api_create_insert_bad_scaled():
# can we insert a scaled=1000 signature into a scaled=500 DB?
# hopefully not.
Expand Down Expand Up @@ -643,6 +661,29 @@ def test_basic_index_column_start():
assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err


@utils.in_tempdir
def test_index_empty_sketch_name(c):
# create two signatures with empty 'name' attributes
cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'),
utils.get_test_data('genome-s11.fa.gz')]
c.run_sourmash(*cmd)

sig1 = c.output('genome-s11.fa.gz.sig')
assert os.path.exists(sig1)
sig2 = c.output('genome-s12.fa.gz.sig')
assert os.path.exists(sig2)

# can we insert them both?
taxcsv = utils.get_test_data('lca/delmont-1.csv')
cmd = ['lca', 'index', taxcsv, 'zzz', sig1, sig2]
c.run_sourmash(*cmd)
assert os.path.exists(c.output('zzz.lca.json'))

print(c.last_result.out)
print(c.last_result.err)
assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err


def test_basic_index_and_classify_with_tsv_and_gz():
with utils.TempDirectory() as location:
taxcsv = utils.get_test_data('lca/delmont-1.tsv')
Expand Down
36 changes: 36 additions & 0 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,42 @@ def test_do_sourmash_sketchdna():
assert str(sig).endswith('short.fa')


@utils.in_tempdir
def test_do_sourmash_sketchdna_noinput(c):
data = ""

cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')]
c.run_sourmash(*cmd, stdin_data=data)

sigfile = c.output('xxx.sig')
assert not os.path.exists(sigfile)
assert 'no sequences found' in c.last_result.err


@utils.in_tempdir
def test_do_sourmash_sketchdna_noinput_singleton(c):
data = ""

cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--singleton']
c.run_sourmash(*cmd, stdin_data=data)

sigfile = c.output('xxx.sig')
assert not os.path.exists(sigfile)
assert 'no sequences found' in c.last_result.err


@utils.in_tempdir
def test_do_sourmash_sketchdna_noinput_merge(c):
data = ""

cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--merge', 'name']
c.run_sourmash(*cmd, stdin_data=data)

sigfile = c.output('xxx.sig')
assert not os.path.exists(sigfile)
assert 'no sequences found' in c.last_result.err


@utils.in_tempdir
def test_do_sourmash_sketchdna_outdir(c):
testdata1 = utils.get_test_data('short.fa')
Expand Down