Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] provide "protocol" tests for Index, CollectionManifest, and LCA_Database classes #1936

Merged
merged 41 commits into latest from add/index_tests
Apr 15, 2022
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
a3389bf
add LCA database test for tricky ordering
ctb Apr 3, 2022
628d722
add test for jaccard ordering to SBTs
ctb Apr 3, 2022
2607c82
add test_index_protocol
ctb Apr 8, 2022
74b7022
add tests of indices after save/load
ctb Apr 8, 2022
baf88b0
match Index definition of __len__ in sbt
ctb Apr 8, 2022
f4f8bb9
Merge branch 'add/test_jaccard_ordering' into add/index_tests
ctb Apr 8, 2022
65fab4e
more index tests
ctb Apr 8, 2022
d243992
add some generic manifest tests
ctb Apr 8, 2022
7739afc
define abstract base class for CollectionManifest
ctb Apr 8, 2022
741f260
fix GTDB example, sigh
ctb Apr 9, 2022
f605cba
test hashval_to_idx
ctb Apr 9, 2022
106de97
add actual test for min num in rankinfo
ctb Apr 9, 2022
2378aa0
update 'get_lineage_assignments' in lca_db
ctb Apr 9, 2022
af565f7
update comment
ctb Apr 9, 2022
8dc859b
make lid_to_idx and idx_to_ident private
ctb Apr 9, 2022
6789150
moar comment
ctb Apr 9, 2022
16caa54
add, then hide, RevIndex test
ctb Apr 9, 2022
0338657
update the LCA_Database protocol
ctb Apr 9, 2022
fb1209e
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 9, 2022
110c5ea
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 10, 2022
7e2e033
finish testing the rest of the Index classes
ctb Apr 10, 2022
de8b5fb
cleanup
ctb Apr 10, 2022
d1b259e
upd
ctb Apr 10, 2022
7735cee
backport 08ac110dfad4afb76
ctb Apr 10, 2022
7af8555
remove test for now-implemented func
ctb Apr 10, 2022
b8da770
switch away from a row tuple in CollectionManifest
ctb Apr 10, 2022
11ef719
more clearly separate internals of LCA_Database from public API
ctb Apr 10, 2022
c297edd
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 10, 2022
8ab82ee
add saved/loaded manifest
ctb Apr 10, 2022
c422f39
add test coverage for exceptions in LazyLoadedIndex
ctb Apr 11, 2022
daf93d4
add docstrings to manifest code
ctb Apr 11, 2022
2e5bc5d
add docstrings / comments
ctb Apr 11, 2022
7b39253
use names in namedtuple; add containment test
ctb Apr 13, 2022
043e4cb
add numerical values to jaccard order tests
ctb Apr 13, 2022
0be189c
add required_keys check
ctb Apr 13, 2022
2fc0ca3
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 13, 2022
6432315
Merge branch 'latest' into add/index_tests
ctb Apr 14, 2022
de1417a
fix diagnostic output during sourmash index #1949
ctb Apr 15, 2022
bfc25e0
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 15, 2022
d69596d
fix test name
ctb Apr 15, 2022
cb54d7d
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Apr 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,8 @@ def __bool__(self):
return False

def __len__(self):
raise NotImplementedError
db = self.db.select(**self.selection_dict)
return len(db)

def insert(self, node):
raise NotImplementedError
Expand Down Expand Up @@ -1064,6 +1065,12 @@ class LazyLoadedIndex(Index):
"""
def __init__(self, filename, manifest):
"Create an Index with given filename and manifest."
if not os.path.exists(filename):
raise ValueError(f"'{filename}' must exist when creating LazyLoadedIndex")

if manifest is None:
raise ValueError("manifest cannot be None")

self.filename = filename
self.manifest = manifest

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/revindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def save(self, path):
def load(cls, location):
pass

def select(self, ksize=None, moltype=None):
def select(self, ksize=None, moltype=None, **kwargs):
if self.template:
if ksize:
self.template.ksize = ksize
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,10 @@ def index(args):
sys.exit(1)

# check -- did the signatures we found have any hashes?
if not db.hashval_to_idx:
if not db.hashvals:
error('ERROR: no hash values found - are there any signatures?')
sys.exit(1)
notify(f'loaded {len(db.hashval_to_idx)} hashes at ksize={args.ksize} scaled={args.scaled}')
notify(f'loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}')

if picklist:
sourmash_args.report_picklist(args, picklist)
Expand Down
13 changes: 4 additions & 9 deletions src/sourmash/lca/command_rankinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,10 @@ def make_lca_counts(dblist, min_num=0):
# gather all hashvalue assignments from across all the databases
assignments = defaultdict(set)
for lca_db in dblist:
for hashval, idx_list in lca_db.hashval_to_idx.items():
if min_num and len(idx_list) < min_num:
continue

for idx in idx_list:
lid = lca_db.idx_to_lid.get(idx)
if lid is not None:
lineage = lca_db.lid_to_lineage[lid]
assignments[hashval].add(lineage)
for hashval in lca_db.hashvals:
lineages = lca_db.get_lineage_assignments(hashval, min_num)
if lineages:
assignments[hashval].update(lineages)

# now convert to trees -> do LCA & counts
counts = defaultdict(int)
Expand Down
126 changes: 72 additions & 54 deletions src/sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,21 @@ class LCA_Database(Index):
the `ident` keyword argument in `insert`.

Integer `idx` indices can be used as keys in dictionary attributes:
* `idx_to_lid`, to get an (optional) lineage index.
* `idx_to_ident`, to retrieve the unique string identifier for that `idx`.
* `_idx_to_lid`, to get an (optional) lineage index.
* `_idx_to_ident`, to retrieve the unique string identifier for that `idx`.

Integer `lid` indices can be used as keys in dictionary attributes:
* `lid_to_idx`, to get a set of `idx` with that lineage.
* `lid_to_lineage`, to get a lineage for that `lid`.
* `_lid_to_idx`, to get a set of `idx` with that lineage.
* `_lid_to_lineage`, to get a lineage for that `lid`.

`lineage_to_lid` is a dictionary with tuples of LineagePair as keys,
`_lineage_to_lid` is a dictionary with tuples of LineagePair as keys,
`lid` as values.

`ident_to_name` is a dictionary from unique str identifier to a name.
`_ident_to_name` is a dictionary from unique str identifier to a name.

`ident_to_idx` is a dictionary from unique str identifier to integer `idx`.
`_ident_to_idx` is a dictionary from unique str identifier to integer `idx`.

`hashval_to_idx` is a dictionary from individual hash values to sets of
`_hashval_to_idx` is a dictionary from individual hash values to sets of
`idx`.
"""
is_database = True
Expand All @@ -70,12 +70,12 @@ def __init__(self, ksize, scaled, moltype='DNA'):

self._next_index = 0
self._next_lid = 0
self.ident_to_name = {}
self.ident_to_idx = {}
self.idx_to_lid = {}
self.lineage_to_lid = {}
self.lid_to_lineage = {}
self.hashval_to_idx = defaultdict(set)
self._ident_to_name = {}
self._ident_to_idx = {}
self._idx_to_lid = {}
self._lineage_to_lid = {}
self._lid_to_lineage = {}
self._hashval_to_idx = defaultdict(set)
self.picklists = []

@property
Expand All @@ -91,31 +91,31 @@ def _invalidate_cache(self):

def _get_ident_index(self, ident, fail_on_duplicate=False):
"Get (create if nec) a unique int id, idx, for each identifier."
idx = self.ident_to_idx.get(ident)
idx = self._ident_to_idx.get(ident)
if fail_on_duplicate:
assert idx is None # should be no duplicate identities

if idx is None:
idx = self._next_index
self._next_index += 1

self.ident_to_idx[ident] = idx
self._ident_to_idx[ident] = idx

return idx

def _get_lineage_id(self, lineage):
"Get (create if nec) a unique lineage ID for each LineagePair tuples."
# does one exist already?
lid = self.lineage_to_lid.get(lineage)
lid = self._lineage_to_lid.get(lineage)

# nope - create one. Increment next_lid.
if lid is None:
lid = self._next_lid
self._next_lid += 1

# build mappings
self.lineage_to_lid[lineage] = lid
self.lid_to_lineage[lid] = lineage
self._lineage_to_lid[lineage] = lid
self._lid_to_lineage[lid] = lineage

return lid

Expand Down Expand Up @@ -147,14 +147,14 @@ def insert(self, sig, ident=None, lineage=None):
if not ident:
ident = str(sig)

if ident in self.ident_to_name:
if ident in self._ident_to_name:
raise ValueError("signature '{}' is already in this LCA db.".format(ident))

# before adding, invalidate any caching from @cached_property
self._invalidate_cache()

# store full name
self.ident_to_name[ident] = sig.name
self._ident_to_name[ident] = sig.name

# identifier -> integer index (idx)
idx = self._get_ident_index(ident, fail_on_duplicate=True)
Expand All @@ -166,12 +166,12 @@ def insert(self, sig, ident=None, lineage=None):
lid = self._get_lineage_id(lineage)

# map idx to lid as well.
self.idx_to_lid[idx] = lid
self._idx_to_lid[idx] = lid
except TypeError:
raise ValueError('lineage cannot be used as a key?!')

for hashval in minhash.hashes:
self.hashval_to_idx[hashval].add(idx)
self._hashval_to_idx[hashval].add(idx)

return len(minhash)

Expand Down Expand Up @@ -290,8 +290,8 @@ def load(cls, db_name):
vv = tuple(vv)
lid_to_lineage[int(k)] = vv
lineage_to_lid[vv] = int(k)
db.lid_to_lineage = lid_to_lineage
db.lineage_to_lid = lineage_to_lid
db._lid_to_lineage = lid_to_lineage
db._lineage_to_lid = lineage_to_lid

# convert hashval -> lineage index keys to integers (looks like
# JSON doesn't have a 64 bit type so stores them as strings)
Expand All @@ -300,21 +300,21 @@ def load(cls, db_name):

for k, v in hashval_to_idx_2.items():
hashval_to_idx[int(k)] = v
db.hashval_to_idx = hashval_to_idx
db._hashval_to_idx = hashval_to_idx

db.ident_to_name = load_d['ident_to_name']
db.ident_to_idx = load_d['ident_to_idx']
db._ident_to_name = load_d['ident_to_name']
db._ident_to_idx = load_d['ident_to_idx']

db.idx_to_lid = {}
db._idx_to_lid = {}
for k, v in load_d['idx_to_lid'].items():
db.idx_to_lid[int(k)] = v
db._idx_to_lid[int(k)] = v

if db.ident_to_idx:
db._next_index = max(db.ident_to_idx.values()) + 1
if db._ident_to_idx:
db._next_index = max(db._ident_to_idx.values()) + 1
else:
db._next_index = 0
if db.idx_to_lid:
db._next_lid = max(db.idx_to_lid.values()) + 1
if db._idx_to_lid:
db._next_lid = max(db._idx_to_lid.values()) + 1
else:
db._next_lid = 0

Expand Down Expand Up @@ -345,18 +345,18 @@ def save(self, db_name):

# convert lineage internals from tuples to dictionaries
d = OrderedDict()
for k, v in self.lid_to_lineage.items():
for k, v in self._lid_to_lineage.items():
d[k] = dict([ (vv.rank, vv.name) for vv in v ])
save_d['lid_to_lineage'] = d

# convert values from sets to lists, so that JSON knows how to save
save_d['hashval_to_idx'] = \
dict((k, list(v)) for (k, v) in self.hashval_to_idx.items())
dict((k, list(v)) for (k, v) in self._hashval_to_idx.items())

save_d['ident_to_name'] = self.ident_to_name
save_d['ident_to_idx'] = self.ident_to_idx
save_d['idx_to_lid'] = self.idx_to_lid
save_d['lid_to_lineage'] = self.lid_to_lineage
save_d['ident_to_name'] = self._ident_to_name
save_d['ident_to_idx'] = self._ident_to_idx
save_d['idx_to_lid'] = self._idx_to_lid
save_d['lid_to_lineage'] = self._lid_to_lineage

json.dump(save_d, fp)

Expand All @@ -378,27 +378,45 @@ def downsample_scaled(self, scaled):

# filter out all hashes over max_hash in value.
new_hashvals = {}
for k, v in self.hashval_to_idx.items():
for k, v in self._hashval_to_idx.items():
if k < max_hash:
new_hashvals[k] = v
self.hashval_to_idx = new_hashvals
self._hashval_to_idx = new_hashvals
self.scaled = scaled

def get_lineage_assignments(self, hashval):
@property
def hashvals(self):
"Return all hashvals stored in this database."
return self._hashval_to_idx.keys()

def get_lineage_assignments(self, hashval, min_num=None):
"""
Get a list of lineages for this hashval.
"""
x = []

idx_list = self.hashval_to_idx.get(hashval, [])
idx_list = self._hashval_to_idx.get(hashval, [])

if min_num and len(idx_list) < min_num:
return []

for idx in idx_list:
lid = self.idx_to_lid.get(idx, None)
lid = self._idx_to_lid.get(idx, None)
if lid is not None:
lineage = self.lid_to_lineage[lid]
lineage = self._lid_to_lineage[lid]
x.append(lineage)

return x

def get_identifiers_for_hashval(self, hashval):
"""
Yield the identifiers of the signatures containing this hashval.
"""
idx_list = self._hashval_to_idx.get(hashval, [])

for idx in idx_list:
yield self._idx_to_ident[idx]

@cached_property
def _signatures(self):
"Create a _signatures member dictionary that contains {idx: sigobj}."
Expand All @@ -422,7 +440,7 @@ def _signatures(self):
temp_vals = defaultdict(list)

# invert the hashval_to_idx dictionary
for (hashval, idlist) in self.hashval_to_idx.items():
for (hashval, idlist) in self._hashval_to_idx.items():
for idx in idlist:
temp_hashes = temp_vals[idx]
temp_hashes.append(hashval)
Expand All @@ -445,8 +463,8 @@ def _signatures(self):

sigd = {}
for idx, mh in mhd.items():
ident = self.idx_to_ident[idx]
name = self.ident_to_name[ident]
ident = self._idx_to_ident[idx]
name = self._ident_to_name[ident]
ss = SourmashSignature(mh, name=name)

if passes_all_picklists(ss, self.picklists):
Expand Down Expand Up @@ -481,7 +499,7 @@ def find(self, search_fn, query, **kwargs):
c = Counter()
query_hashes = set(query_mh.hashes)
for hashval in query_hashes:
idx_list = self.hashval_to_idx.get(hashval, [])
idx_list = self._hashval_to_idx.get(hashval, [])
for idx in idx_list:
c[idx] += 1

Expand Down Expand Up @@ -523,16 +541,16 @@ def find(self, search_fn, query, **kwargs):
yield IndexSearchResult(score, subj, self.location)

@cached_property
def lid_to_idx(self):
def _lid_to_idx(self):
d = defaultdict(set)
for idx, lid in self.idx_to_lid.items():
for idx, lid in self._idx_to_lid.items():
d[lid].add(idx)
return d

@cached_property
def idx_to_ident(self):
def _idx_to_ident(self):
d = defaultdict(set)
for ident, idx in self.ident_to_idx.items():
for ident, idx in self._ident_to_idx.items():
assert idx not in d
d[idx] = ident
return d
Expand Down
Loading