Skip to content

Commit

Permalink
[MRG] add initial progress reporting wrapper for load_file_as_signatures (#1083)
Browse files Browse the repository at this point in the history

* add initial progress reporting wrapper for load_file_as_signatures

* improve behavior for slow-loading databases

* minor cleanup

* add progress reporting to rest of sig subcommand

* add progress reporting to compare and index

* clean up sbt index output a little

* fix tests

* shorten notification

* incremental output of sig describe

* set screen width to 79
  • Loading branch information
ctb authored Jul 13, 2020
1 parent ec24ea1 commit 0804556
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 72 deletions.
13 changes: 8 additions & 5 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def compare(args):
more_files = sourmash_args.load_file_list_of_signatures(args.from_file)
inp_files.extend(more_files)

progress = sourmash_args.SignatureLoadingProgress()

# load in the various signatures
siglist = []
ksizes = set()
Expand All @@ -46,7 +48,8 @@ def compare(args):
ksize=args.ksize,
select_moltype=moltype,
traverse=args.traverse_directory,
yield_all_files=args.force)
yield_all_files=args.force,
progress=progress)
loaded = list(loaded)
if not loaded:
notify('\nwarning: no signatures loaded at given ksize/molecule type from {}', filename)
Expand Down Expand Up @@ -352,20 +355,20 @@ def index(args):

notify('loading {} files into SBT', len(inp_files))

progress = sourmash_args.SignatureLoadingProgress()

n = 0
ksizes = set()
moltypes = set()
nums = set()
scaleds = set()
for f in inp_files:
if n % 100 == 0:
notify('\r...reading from {} ({} signatures so far)', f, n, end='')

siglist = sourmash_args.load_file_as_signatures(f,
ksize=args.ksize,
select_moltype=moltype,
traverse=args.traverse_directory,
yield_all_files=args.force)
yield_all_files=args.force,
progress=progress)

# load all matching signatures in this file
ss = None
Expand Down
4 changes: 2 additions & 2 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
if n % 100 == 0:
notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r')

notify("\nFinished saving nodes, now saving SBT json file.")
notify("Finished saving nodes, now saving SBT index file.")
info['nodes'] = nodes
info['signatures'] = leaves

Expand All @@ -599,7 +599,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
with open(index_filename, 'w') as fp:
json.dump(info, fp)

notify("\nFinished saving SBT, available at {0}\n".format(index_filename))
notify("Finished saving SBT index, available at {0}\n".format(index_filename))

return path

Expand Down
151 changes: 92 additions & 59 deletions sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,21 @@ def cat(args):
"""
set_quiet(args.quiet)

progress = sourmash_args.SignatureLoadingProgress()

siglist = []
for sigfile in args.signatures:
this_siglist = []
try:
this_siglist = sourmash_args.load_file_as_signatures(sigfile, traverse=True)
loader = sourmash_args.load_file_as_signatures(sigfile,
traverse=True,
progress=progress)
for sig in loader:
siglist.append(sig)
except Exception as exc:
error(str(exc))
error('(continuing)')

this_siglist = list(this_siglist)

notify('loaded {} signatures from {}...', len(this_siglist), sigfile,
end='\r')
siglist.extend(this_siglist)

notify('loaded {} signatures total.', len(siglist))

with FileOutput(args.output, 'wt') as fp:
Expand All @@ -105,10 +105,14 @@ def split(args):
notify('Creating --outdir {}', args.outdir)
os.mkdir(args.outdir)

progress = sourmash_args.SignatureLoadingProgress()

total = 0
for sigfile in args.signatures:
# load signatures from input file:
this_siglist = sourmash_args.load_file_as_signatures(sigfile, traverse=True)
this_siglist = sourmash_args.load_file_as_signatures(sigfile,
traverse=True,
progress=progress)

# save each file individually --
n_signatures = 0
Expand Down Expand Up @@ -168,23 +172,6 @@ def describe(args):
"""
set_quiet(args.quiet)

siglist = []
for sigfile in args.signatures:
this_siglist = []
try:
this_siglist = sourmash_args.load_file_as_signatures(sigfile, traverse=True)
for k in this_siglist:
siglist.append((k, sigfile))
except Exception as exc:
error(str(exc))
error('(continuing)')

this_siglist = list(this_siglist)
notify('loaded {} signatures from {}...', len(this_siglist), sigfile,
end='\r')

notify('loaded {} signatures total.', len(siglist))

# write CSV?
w = None
csv_fp = None
Expand All @@ -197,27 +184,38 @@ def describe(args):
extrasaction='ignore')
w.writeheader()

# extract info, write as appropriate.
for (sig, signature_file) in siglist:
mh = sig.minhash
ksize = mh.ksize
moltype = mh.moltype
scaled = mh.scaled
num = mh.num
seed = mh.seed
n_hashes = len(mh)
with_abundance = 0
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
name = sig.name()
filename = sig.filename
license = sig.license

if w:
w.writerow(locals())

print_results('''\
# load signatures and display info.
progress = sourmash_args.SignatureLoadingProgress()

n_loaded = 0
for signature_file in args.signatures:
try:
loader = sourmash_args.load_file_as_signatures(signature_file,
traverse=True,
progress=progress)
for sig in loader:
n_loaded += 1

# extract info, write as appropriate.
mh = sig.minhash
ksize = mh.ksize
moltype = mh.moltype
scaled = mh.scaled
num = mh.num
seed = mh.seed
n_hashes = len(mh)
with_abundance = 0
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
name = sig.name()
filename = sig.filename
license = sig.license

if w:
w.writerow(locals())

print_results('''\
---
signature filename: {signature_file}
signature: {name}
Expand All @@ -228,6 +226,14 @@ def describe(args):
signature license: {license}
''', **locals())

except Exception as exc:
error('\nError while reading signatures from {}:'.format(signature_file))
error(str(exc))
error('(continuing)')
raise

notify('loaded {} signatures total.', n_loaded)

if csv_fp:
csv_fp.close()

Expand Down Expand Up @@ -321,12 +327,15 @@ def merge(args):
total_loaded = 0

# iterate over all the sigs from all the files.
progress = sourmash_args.SignatureLoadingProgress()

for sigfile in args.signatures:
notify('loading signatures from {}...', sigfile, end='\r')
this_n = 0
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
select_moltype=moltype,
traverse=True):
select_moltype=moltype,
traverse=True,
progress=progress):

# first signature? initialize a bunch of stuff
if first_sig is None:
Expand Down Expand Up @@ -380,10 +389,13 @@ def intersect(args):
mins = None
total_loaded = 0

progress = sourmash_args.SignatureLoadingProgress()

for sigfile in args.signatures:
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
select_moltype=moltype,
traverse=True):
traverse=True,
progress=progress):
if first_sig is None:
first_sig = sigobj
mins = set(sigobj.minhash.get_mins())
Expand Down Expand Up @@ -446,11 +458,14 @@ def subtract(args):

notify('loaded signature from {}...', from_sigfile, end='\r')

progress = sourmash_args.SignatureLoadingProgress()

total_loaded = 0
for sigfile in args.subtraction_sigs:
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
select_moltype=moltype,
traverse=True):
select_moltype=moltype,
traverse=True,
progress=progress):

if sigobj.minhash.track_abundance and not args.flatten:
error('Cannot use subtract on signatures with abundance tracking, sorry!')
Expand Down Expand Up @@ -484,12 +499,15 @@ def rename(args):
set_quiet(args.quiet, args.quiet)
moltype = sourmash_args.calculate_moltype(args)

progress = sourmash_args.SignatureLoadingProgress()

outlist = []
for filename in args.sigfiles:
debug('loading {}', filename)
siglist = sourmash_args.load_file_as_signatures(filename,
select_moltype=moltype,
traverse=True)
traverse=True,
progress=progress)

for sigobj in siglist:
sigobj._name = args.name
Expand All @@ -508,12 +526,15 @@ def extract(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

progress = sourmash_args.SignatureLoadingProgress()

outlist = []
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
select_moltype=moltype,
traverse=True)
select_moltype=moltype,
traverse=True,
progress=progress)
siglist = list(siglist)

total_loaded += len(siglist)
Expand Down Expand Up @@ -546,12 +567,15 @@ def filter(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

progress = sourmash_args.SignatureLoadingProgress()

outlist = []
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
select_moltype=moltype,
traverse=True)
select_moltype=moltype,
traverse=True,
progress=progress)
siglist = list(siglist)

total_loaded += len(siglist)
Expand Down Expand Up @@ -600,12 +624,15 @@ def flatten(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

progress = sourmash_args.SignatureLoadingProgress()

outlist = []
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
select_moltype=moltype,
traverse=True)
select_moltype=moltype,
traverse=True,
progress=progress)
siglist = list(siglist)

total_loaded += len(siglist)
Expand Down Expand Up @@ -649,10 +676,16 @@ def downsample(args):
error('cannot specify both --num and --scaled')
sys.exit(-1)

progress = sourmash_args.SignatureLoadingProgress()

output_list = []
total_loaded = 0
for sigfile in args.signatures:
siglist = sourmash_args.load_file_as_signatures(sigfile, ksize=args.ksize, select_moltype=moltype, traverse=True)
siglist = sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress)

for sigobj in siglist:
mh = sigobj.minhash
Expand Down
Loading

0 comments on commit 0804556

Please sign in to comment.