Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] migrate command (update old indexes) #494

Merged
merged 9 commits into from
Jun 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sourmash/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.0a7
2.0.0a8
5 changes: 3 additions & 2 deletions sourmash/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
plot, watch, info, storage)
plot, watch, info, storage, migrate)
from .lca import main as lca_main

usage='''
Expand Down Expand Up @@ -57,7 +57,8 @@ def main():
'watch': watch,
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main}
'lca': lca_main,
'migrate': migrate}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
Expand Down
12 changes: 12 additions & 0 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,3 +1160,15 @@ def storage(args):
set_quiet(args.quiet)
if args.command == 'convert':
convert_cmd(args.sbt, args.backend)


def migrate(args):
    """Load an SBT index and save its description again under the same name.

    ``args`` is handed to an ``argparse`` parser that expects exactly one
    positional value, ``sbt_name``. That name is used both to load the index
    and as the destination passed to ``save``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    sbt_name = cli.parse_args(args).sbt_name

    # Load with the version warning switched off, then write the index back
    # with structure_only=True.
    index = load_sbt_index(sbt_name, print_version_warning=False)

    notify('saving SBT under "{}".', sbt_name)
    index.save(sbt_name, structure_only=True)
12 changes: 12 additions & 0 deletions sourmash/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ def notify(s, *args, **kwargs):
sys.stderr.flush()


def debug(s, *args, **kwargs):
    """Write a formatted debug message to stderr unless quiet mode is on.

    ``s`` is formatted with ``str.format`` using every positional and keyword
    argument received. Two keyword arguments are also read directly:
    ``end`` (line terminator, a newline by default) and ``flush`` (when true,
    stderr is flushed after writing).
    """
    if not _quiet:
        # Carriage return plus the "\033[K" escape sequence goes out first,
        # without a trailing newline, so the message starts at column 0.
        print(u'\r\033[K', end=u'', file=sys.stderr)

        terminator = kwargs.get('end', u'\n')
        print(s.format(*args, **kwargs), file=sys.stderr, end=terminator)

        if kwargs.get('flush'):
            sys.stderr.flush()


def error(s, *args, **kwargs):
"A simple error logging function => stderr."
print(u'\r\033[K', end=u'', file=sys.stderr)
Expand Down
122 changes: 91 additions & 31 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def search_transcript(node, seq, threshold):
import khmer

from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage
from .logging import error, notify
from .logging import error, notify, debug


STORAGES = {
Expand Down Expand Up @@ -222,7 +222,9 @@ def _rebuild_node(self, pos=0):
if c.pos in self.missing_nodes or isinstance(c.node, Leaf):
if c.node is None:
self._rebuild_node(c.pos)
self.nodes[c.pos].update(node)
c_node = self.nodes[c.pos]
if c_node is not None:
c_node.update(node)
self.missing_nodes.remove(pos)


Expand Down Expand Up @@ -286,7 +288,7 @@ def child(self, parent, pos):
node = self.nodes.get(cd, None)
return NodePos(cd, node)

def save(self, path, storage=None, sparseness=0.0):
def save(self, path, storage=None, sparseness=0.0, structure_only=False):
"""Saves an SBT description locally and node data to a storage.

Parameters
Expand All @@ -300,6 +302,9 @@ def save(self, path, storage=None, sparseness=0.0):
How much of the internal nodes should be saved.
Defaults to 0.0 (save all internal nodes data),
can go up to 1.0 (don't save any internal nodes data)
structure_only: boolean
Write only the index schema and metadata, but not the data.
Defaults to False (save data too)

Returns
-------
Expand Down Expand Up @@ -349,14 +354,22 @@ def save(self, path, storage=None, sparseness=0.0):
'filename': os.path.basename(node.name),
'name': node.name
}

try:
node.metadata.pop('max_n_below')
except (AttributeError, KeyError):
pass

data['metadata'] = node.metadata

# trigger data loading before saving to the new place
node.data
if structure_only is False:
# trigger data loading before saving to the new place
node.data

node.storage = storage
node.storage = storage

data['filename'] = node.save(data['filename'])

data['filename'] = node.save(data['filename'])
structure[i] = data

notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r')
Expand All @@ -369,7 +382,7 @@ def save(self, path, storage=None, sparseness=0.0):
return fn

@classmethod
def load(cls, location, leaf_loader=None, storage=None):
def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
"""Load an SBT description from a file.

Parameters
Expand Down Expand Up @@ -423,10 +436,11 @@ def load(cls, location, leaf_loader=None, storage=None):
if version < 3 and storage is None:
storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name))

return loaders[version](jnodes, leaf_loader, dirname, storage)
return loaders[version](jnodes, leaf_loader, dirname, storage,
print_version_warning)

@staticmethod
def _load_v1(jnodes, leaf_loader, dirname, storage):
def _load_v1(jnodes, leaf_loader, dirname, storage, print_version_warning=True):

if jnodes[0] is None:
raise ValueError("Empty tree!")
Expand Down Expand Up @@ -457,7 +471,7 @@ def _load_v1(jnodes, leaf_loader, dirname, storage):
return tree

@classmethod
def _load_v2(cls, info, leaf_loader, dirname, storage):
def _load_v2(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if nodes[0] is None:
Expand Down Expand Up @@ -489,7 +503,7 @@ def _load_v2(cls, info, leaf_loader, dirname, storage):
return tree

@classmethod
def _load_v3(cls, info, leaf_loader, dirname, storage):
def _load_v3(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if not nodes:
Expand Down Expand Up @@ -526,12 +540,15 @@ def _load_v3(cls, info, leaf_loader, dirname, storage):
# TODO: this might not be true with combine...
tree.next_node = max_node

if print_version_warning:
error("WARNING: this is an old index version, please run `sourmash migrate` to update it.")
error("WARNING: proceeding with execution, but it will take longer to finish!")
tree._fill_min_n_below()

return tree

@classmethod
def _load_v4(cls, info, leaf_loader, dirname, storage):
def _load_v4(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if not nodes:
Expand Down Expand Up @@ -575,25 +592,66 @@ def _fill_min_n_below(self):
Propagate the smallest hash size below each node up the tree from
the leaves.
"""
for i, n in self.nodes.items():
if isinstance(n, Leaf):
parent = self.parent(i)
if parent.pos not in self.missing_nodes:
min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(len(n.data.minhash.get_mins()),
min_n_below)
parent.node.metadata['min_n_below'] = min_n_below

current = parent
parent = self.parent(parent.pos)
while parent and parent.pos not in self.missing_nodes:
min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(current.node.metadata['min_n_below'],
min_n_below)
parent.node.metadata['min_n_below'] = min_n_below
current = parent
parent = self.parent(parent.pos)
def fill_min_n_below(node, *args, **kwargs):
    """Recompute ``node.metadata['min_n_below']`` from the node's children.

    The children are read from ``kwargs['children']``; each one exposes a
    ``.node`` attribute that may be ``None``. A ``Leaf`` child contributes
    ``len(child.node.data.minhash)``; any other child contributes its own
    stored ``min_n_below`` (``sys.maxsize`` when absent).

    Returns ``True`` when the stored value changed, ``False`` otherwise.
    """
    previous = node.metadata.get('min_n_below', sys.maxsize)
    smallest = previous

    for child in kwargs['children']:
        below = child.node
        if below is None:
            # nothing stored at this position, so it cannot lower the minimum
            continue

        if isinstance(below, Leaf):
            candidate = len(below.data.minhash)
        else:
            candidate = below.metadata.get('min_n_below', sys.maxsize)
        smallest = min(candidate, smallest)

    # a value of zero is never stored; it is raised to one
    if smallest == 0:
        smallest = 1

    node.metadata['min_n_below'] = smallest
    return previous != smallest

self._fill_up(fill_min_n_below)

def _fill_up(self, search_fn, *args, **kwargs):
    """Apply ``search_fn`` to parents, working from the leaves upwards.

    The work queue starts with the first element of every item returned by
    ``self._leaves()`` (presumably the node position), in descending sort
    order. For each position taken from the queue its parent is looked up;
    a parent whose node is ``None`` is rebuilt when it is listed in
    ``self.missing_nodes`` and skipped otherwise. The first time a position
    is seen, it and all of its siblings are marked visited, the siblings
    are dropped from the queue, and the callback is invoked once for the
    shared parent.

    Parameters
    ----------
    search_fn: callable
        Called as ``search_fn(parent_node, *args, children=siblings,
        **kwargs)``. A true return value re-queues the parent so that its
        own parent is processed later.
    *args, **kwargs:
        Forwarded to ``search_fn``. ``kwargs`` must not contain
        ``children``, which is always supplied by this method.

    Returns
    -------
    None
    """
    visited, queue = set(), [i[0] for i in reversed(sorted(self._leaves()))]
    debug("started filling up")
    processed = 0
    while queue:
        node_p = queue.pop(0)

        parent = self.parent(node_p)
        if parent is None:
            # we are in the root, no more nodes available to search
            assert len(queue) == 0
            return

        was_missing = False
        if parent.node is None:
            if parent.pos in self.missing_nodes:
                # rebuild the parent, then fetch it again so that
                # parent.node refers to the rebuilt node
                self._rebuild_node(parent.pos)
                parent = self.parent(node_p)
                was_missing = True
            else:
                continue

        siblings = self.children(parent.pos)

        if node_p not in visited:
            visited.add(node_p)
            for sibling in siblings:
                visited.add(sibling.pos)
                try:
                    queue.remove(sibling.pos)
                except ValueError:
                    # sibling was not waiting in the queue
                    pass

            # NOTE(review): this call is nested under the "not visited"
            # branch; the nesting was reconstructed from flattened text,
            # so confirm it against the original file.
            # The caller's keyword arguments are forwarded; previously
            # they were accepted by this method but silently dropped.
            if search_fn(parent.node, *args, children=siblings,
                         **kwargs) or was_missing:
                queue.append(parent.pos)

        processed += 1
        if processed % 100 == 0:
            # debug() reads only 'end' and 'flush' from its keyword
            # arguments, so the line terminator has to be passed as 'end'
            # for the progress message to be overwritten in place.
            debug("processed {}, in queue {}", processed, len(queue),
                  end='\r')

def print_dot(self):
print("""
Expand Down Expand Up @@ -747,6 +805,8 @@ def update(self, parent):
parent.data.update(self.data)
min_n_below = min(parent.metadata.get('min_n_below', sys.maxsize),
self.metadata.get('min_n_below'))
if min_n_below == 0:
min_n_below = 1
parent.metadata['min_n_below'] = min_n_below


Expand Down
8 changes: 6 additions & 2 deletions sourmash/sbtmh.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from . import signature


def load_sbt_index(filename):
def load_sbt_index(filename, print_version_warning=True):
"Load and return an SBT index."
return SBT.load(filename, leaf_loader=SigLeaf.load)
return SBT.load(filename, leaf_loader=SigLeaf.load,
print_version_warning=print_version_warning)


def create_sbt_index(bloom_filter_size=1e5, n_children=2):
Expand Down Expand Up @@ -59,6 +60,9 @@ def update(self, parent):
min_n_below = min(len(self.data.minhash.get_mins()),
min_n_below)

if min_n_below == 0:
min_n_below = 1

parent.metadata['min_n_below'] = min_n_below

@property
Expand Down
1 change: 1 addition & 0 deletions tests/test-data/sbt-search-bug/empty.sig
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"class":"sourmash_signature","email":"","filename":"empty.fa","hash_function":"0.murmur64","license":"CC0","name":"empty sig","signatures":[{"ksize":31,"max_hash":18446744073709552,"md5sum":"c16a5320fa475530d9583c34fd356ef5","mins":[],"molecule":"DNA","num":0,"seed":42}],"version":0.4}]
52 changes: 51 additions & 1 deletion tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from . import sourmash_tst_utils as utils
import sourmash_lib
from sourmash_lib import MinHash
from sourmash_lib.sbt import SBT
from sourmash_lib.sbt import SBT, Node
from sourmash_lib.sbtmh import SigLeaf, load_sbt_index
try:
import matplotlib
Expand Down Expand Up @@ -1274,6 +1274,30 @@ def test_do_sourmash_sbt_search_check_bug():
assert tree.nodes[0].metadata['min_n_below'] == 431


def test_do_sourmash_sbt_search_empty_sig():
    """Index a regular and an empty signature, search, and check the root.

    After indexing both signatures, searching with the regular one must
    report a single match, and the root node's ``min_n_below`` must be 1.
    """
    with utils.TempDirectory() as location:
        # nano.sig carries 431 mins; empty.sig carries none
        nano_sig = utils.get_test_data('sbt-search-bug/nano.sig')
        empty_sig = utils.get_test_data('sbt-search-bug/empty.sig')

        index_cmd = ['index', 'zzz', '-k', '31', nano_sig, empty_sig]
        _status, _out, _err = utils.runscript('sourmash', index_cmd,
                                              in_directory=location)

        index_path = os.path.join(location, 'zzz.sbt.json')
        assert os.path.exists(index_path)

        search_cmd = ['search', nano_sig, 'zzz']
        _status, out, _err = utils.runscript('sourmash', search_cmd,
                                             in_directory=location)
        assert '1 matches:' in out

        tree = load_sbt_index(index_path)
        root = tree.nodes[0]
        assert root.metadata['min_n_below'] == 1


def test_do_sourmash_sbt_move_and_search_output():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -3215,6 +3239,32 @@ def test_storage_convert_fsstorage_newpath():
sorted(identity.nodes.items())))


def test_migrate():
    """Run ``sourmash migrate`` on a copied v3 index and reload it.

    Checks that the node count and node names are unchanged, that the
    old-index warning is absent from stderr, and that every ``Node``
    in the reloaded index carries ``min_n_below`` metadata.
    """
    with utils.TempDirectory() as location:
        # copy the index description and its hidden data directory
        testdata = utils.get_test_data('v3.sbt.json')
        testsbt = os.path.join(location, 'v3.sbt.json')
        shutil.copyfile(testdata, testsbt)

        data_src = os.path.join(os.path.dirname(testdata), '.sbt.v3')
        data_dst = os.path.join(location, '.sbt.v3')
        shutil.copytree(data_src, data_dst)

        original = SBT.load(testsbt, leaf_loader=SigLeaf.load)

        _status, _out, err = utils.runscript('sourmash',
                                             ['migrate', testsbt],
                                             in_directory=location)

        identity = SBT.load(testsbt, leaf_loader=SigLeaf.load)

        assert len(original.nodes) == len(identity.nodes)

        before = sorted(original.nodes.items())
        after = sorted(identity.nodes.items())
        assert all(old[1].name == new[1].name
                   for (old, new) in zip(before, after))

        assert "this is an old index version" not in err

        internal_nodes = [node for node in identity.nodes.values()
                          if isinstance(node, Node)]
        assert all('min_n_below' in node.metadata
                   for node in internal_nodes)


def test_license_cc0():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down