Skip to content

Commit

Permalink
Merge branch 'latest' into add-cols
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Apr 15, 2022
2 parents a983d48 + 4080b5d commit 29fd59d
Show file tree
Hide file tree
Showing 15 changed files with 969 additions and 284 deletions.
9 changes: 8 additions & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,8 @@ def __bool__(self):
return False

def __len__(self):
raise NotImplementedError
db = self.db.select(**self.selection_dict)
return len(db)

def insert(self, node):
raise NotImplementedError
Expand Down Expand Up @@ -1064,6 +1065,12 @@ class LazyLoadedIndex(Index):
"""
def __init__(self, filename, manifest):
"Create an Index with given filename and manifest."
if not os.path.exists(filename):
raise ValueError(f"'{filename}' must exist when creating LazyLoadedIndex")

if manifest is None:
raise ValueError("manifest cannot be None")

self.filename = filename
self.manifest = manifest

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/revindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def save(self, path):
def load(cls, location):
pass

def select(self, ksize=None, moltype=None):
def select(self, ksize=None, moltype=None, **kwargs):
if self.template:
if ksize:
self.template.ksize = ksize
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,10 @@ def index(args):
sys.exit(1)

# check -- did the signatures we found have any hashes?
if not db.hashval_to_idx:
if not db.hashvals:
error('ERROR: no hash values found - are there any signatures?')
sys.exit(1)
notify(f'loaded {len(db.hashval_to_idx)} hashes at ksize={args.ksize} scaled={args.scaled}')
notify(f'loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}')

if picklist:
sourmash_args.report_picklist(args, picklist)
Expand Down
13 changes: 4 additions & 9 deletions src/sourmash/lca/command_rankinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,10 @@ def make_lca_counts(dblist, min_num=0):
# gather all hashvalue assignments from across all the databases
assignments = defaultdict(set)
for lca_db in dblist:
for hashval, idx_list in lca_db.hashval_to_idx.items():
if min_num and len(idx_list) < min_num:
continue

for idx in idx_list:
lid = lca_db.idx_to_lid.get(idx)
if lid is not None:
lineage = lca_db.lid_to_lineage[lid]
assignments[hashval].add(lineage)
for hashval in lca_db.hashvals:
lineages = lca_db.get_lineage_assignments(hashval, min_num)
if lineages:
assignments[hashval].update(lineages)

# now convert to trees -> do LCA & counts
counts = defaultdict(int)
Expand Down
126 changes: 72 additions & 54 deletions src/sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,21 @@ class LCA_Database(Index):
the `ident` keyword argument in `insert`.
Integer `idx` indices can be used as keys in dictionary attributes:
* `idx_to_lid`, to get an (optional) lineage index.
* `idx_to_ident`, to retrieve the unique string identifier for that `idx`.
* `_idx_to_lid`, to get an (optional) lineage index.
* `_idx_to_ident`, to retrieve the unique string identifier for that `idx`.
Integer `lid` indices can be used as keys in dictionary attributes:
* `lid_to_idx`, to get a set of `idx` with that lineage.
* `lid_to_lineage`, to get a lineage for that `lid`.
* `_lid_to_idx`, to get a set of `idx` with that lineage.
* `_lid_to_lineage`, to get a lineage for that `lid`.
`lineage_to_lid` is a dictionary with tuples of LineagePair as keys,
`_lineage_to_lid` is a dictionary with tuples of LineagePair as keys,
`lid` as values.
`ident_to_name` is a dictionary from unique str identifier to a name.
`_ident_to_name` is a dictionary from unique str identifier to a name.
`ident_to_idx` is a dictionary from unique str identifier to integer `idx`.
`_ident_to_idx` is a dictionary from unique str identifier to integer `idx`.
`hashval_to_idx` is a dictionary from individual hash values to sets of
`_hashval_to_idx` is a dictionary from individual hash values to sets of
`idx`.
"""
is_database = True
Expand All @@ -70,12 +70,12 @@ def __init__(self, ksize, scaled, moltype='DNA'):

self._next_index = 0
self._next_lid = 0
self.ident_to_name = {}
self.ident_to_idx = {}
self.idx_to_lid = {}
self.lineage_to_lid = {}
self.lid_to_lineage = {}
self.hashval_to_idx = defaultdict(set)
self._ident_to_name = {}
self._ident_to_idx = {}
self._idx_to_lid = {}
self._lineage_to_lid = {}
self._lid_to_lineage = {}
self._hashval_to_idx = defaultdict(set)
self.picklists = []

@property
Expand All @@ -91,31 +91,31 @@ def _invalidate_cache(self):

def _get_ident_index(self, ident, fail_on_duplicate=False):
"Get (create if nec) a unique int id, idx, for each identifier."
idx = self.ident_to_idx.get(ident)
idx = self._ident_to_idx.get(ident)
if fail_on_duplicate:
assert idx is None # should be no duplicate identities

if idx is None:
idx = self._next_index
self._next_index += 1

self.ident_to_idx[ident] = idx
self._ident_to_idx[ident] = idx

return idx

def _get_lineage_id(self, lineage):
"Get (create if nec) a unique lineage ID for each tuple of LineagePair objects."
# does one exist already?
lid = self.lineage_to_lid.get(lineage)
lid = self._lineage_to_lid.get(lineage)

# nope - create one. Increment next_lid.
if lid is None:
lid = self._next_lid
self._next_lid += 1

# build mappings
self.lineage_to_lid[lineage] = lid
self.lid_to_lineage[lid] = lineage
self._lineage_to_lid[lineage] = lid
self._lid_to_lineage[lid] = lineage

return lid

Expand Down Expand Up @@ -147,14 +147,14 @@ def insert(self, sig, ident=None, lineage=None):
if not ident:
ident = str(sig)

if ident in self.ident_to_name:
if ident in self._ident_to_name:
raise ValueError("signature '{}' is already in this LCA db.".format(ident))

# before adding, invalidate any caching from @cached_property
self._invalidate_cache()

# store full name
self.ident_to_name[ident] = sig.name
self._ident_to_name[ident] = sig.name

# identifier -> integer index (idx)
idx = self._get_ident_index(ident, fail_on_duplicate=True)
Expand All @@ -166,12 +166,12 @@ def insert(self, sig, ident=None, lineage=None):
lid = self._get_lineage_id(lineage)

# map idx to lid as well.
self.idx_to_lid[idx] = lid
self._idx_to_lid[idx] = lid
except TypeError:
raise ValueError('lineage cannot be used as a key?!')

for hashval in minhash.hashes:
self.hashval_to_idx[hashval].add(idx)
self._hashval_to_idx[hashval].add(idx)

return len(minhash)

Expand Down Expand Up @@ -290,8 +290,8 @@ def load(cls, db_name):
vv = tuple(vv)
lid_to_lineage[int(k)] = vv
lineage_to_lid[vv] = int(k)
db.lid_to_lineage = lid_to_lineage
db.lineage_to_lid = lineage_to_lid
db._lid_to_lineage = lid_to_lineage
db._lineage_to_lid = lineage_to_lid

# convert hashval -> lineage index keys to integers (looks like
# JSON doesn't have a 64 bit type so stores them as strings)
Expand All @@ -300,21 +300,21 @@ def load(cls, db_name):

for k, v in hashval_to_idx_2.items():
hashval_to_idx[int(k)] = v
db.hashval_to_idx = hashval_to_idx
db._hashval_to_idx = hashval_to_idx

db.ident_to_name = load_d['ident_to_name']
db.ident_to_idx = load_d['ident_to_idx']
db._ident_to_name = load_d['ident_to_name']
db._ident_to_idx = load_d['ident_to_idx']

db.idx_to_lid = {}
db._idx_to_lid = {}
for k, v in load_d['idx_to_lid'].items():
db.idx_to_lid[int(k)] = v
db._idx_to_lid[int(k)] = v

if db.ident_to_idx:
db._next_index = max(db.ident_to_idx.values()) + 1
if db._ident_to_idx:
db._next_index = max(db._ident_to_idx.values()) + 1
else:
db._next_index = 0
if db.idx_to_lid:
db._next_lid = max(db.idx_to_lid.values()) + 1
if db._idx_to_lid:
db._next_lid = max(db._idx_to_lid.values()) + 1
else:
db._next_lid = 0

Expand Down Expand Up @@ -345,18 +345,18 @@ def save(self, db_name):

# convert lineage internals from tuples to dictionaries
d = OrderedDict()
for k, v in self.lid_to_lineage.items():
for k, v in self._lid_to_lineage.items():
d[k] = dict([ (vv.rank, vv.name) for vv in v ])
save_d['lid_to_lineage'] = d

# convert values from sets to lists, so that JSON knows how to save
save_d['hashval_to_idx'] = \
dict((k, list(v)) for (k, v) in self.hashval_to_idx.items())
dict((k, list(v)) for (k, v) in self._hashval_to_idx.items())

save_d['ident_to_name'] = self.ident_to_name
save_d['ident_to_idx'] = self.ident_to_idx
save_d['idx_to_lid'] = self.idx_to_lid
save_d['lid_to_lineage'] = self.lid_to_lineage
save_d['ident_to_name'] = self._ident_to_name
save_d['ident_to_idx'] = self._ident_to_idx
save_d['idx_to_lid'] = self._idx_to_lid
save_d['lid_to_lineage'] = self._lid_to_lineage

json.dump(save_d, fp)

Expand All @@ -378,27 +378,45 @@ def downsample_scaled(self, scaled):

# filter out all hashes over max_hash in value.
new_hashvals = {}
for k, v in self.hashval_to_idx.items():
for k, v in self._hashval_to_idx.items():
if k < max_hash:
new_hashvals[k] = v
self.hashval_to_idx = new_hashvals
self._hashval_to_idx = new_hashvals
self.scaled = scaled

def get_lineage_assignments(self, hashval):
@property
def hashvals(self):
    "Return the keys of `_hashval_to_idx`, i.e. every hashval stored here."
    hashval_map = self._hashval_to_idx
    return hashval_map.keys()

def get_lineage_assignments(self, hashval, min_num=None):
"""
Get a list of lineages for this hashval.
"""
x = []

idx_list = self.hashval_to_idx.get(hashval, [])
idx_list = self._hashval_to_idx.get(hashval, [])

if min_num and len(idx_list) < min_num:
return []

for idx in idx_list:
lid = self.idx_to_lid.get(idx, None)
lid = self._idx_to_lid.get(idx, None)
if lid is not None:
lineage = self.lid_to_lineage[lid]
lineage = self._lid_to_lineage[lid]
x.append(lineage)

return x

def get_identifiers_for_hashval(self, hashval):
    """
    Yield the identifier of each signature containing this hashval.

    This is a generator: the `idx` values recorded for `hashval` in
    `_hashval_to_idx` are looked up (an unknown hashval yields nothing)
    and each one is translated to its identifier via `_idx_to_ident`.
    """
    matching = self._hashval_to_idx.get(hashval, [])

    # resolve each idx lazily, one per item requested by the caller
    yield from (self._idx_to_ident[sig_idx] for sig_idx in matching)

@cached_property
def _signatures(self):
"Create a _signatures member dictionary that contains {idx: sigobj}."
Expand All @@ -422,7 +440,7 @@ def _signatures(self):
temp_vals = defaultdict(list)

# invert the hashval_to_idx dictionary
for (hashval, idlist) in self.hashval_to_idx.items():
for (hashval, idlist) in self._hashval_to_idx.items():
for idx in idlist:
temp_hashes = temp_vals[idx]
temp_hashes.append(hashval)
Expand All @@ -445,8 +463,8 @@ def _signatures(self):

sigd = {}
for idx, mh in mhd.items():
ident = self.idx_to_ident[idx]
name = self.ident_to_name[ident]
ident = self._idx_to_ident[idx]
name = self._ident_to_name[ident]
ss = SourmashSignature(mh, name=name)

if passes_all_picklists(ss, self.picklists):
Expand Down Expand Up @@ -481,7 +499,7 @@ def find(self, search_fn, query, **kwargs):
c = Counter()
query_hashes = set(query_mh.hashes)
for hashval in query_hashes:
idx_list = self.hashval_to_idx.get(hashval, [])
idx_list = self._hashval_to_idx.get(hashval, [])
for idx in idx_list:
c[idx] += 1

Expand Down Expand Up @@ -523,16 +541,16 @@ def find(self, search_fn, query, **kwargs):
yield IndexSearchResult(score, subj, self.location)

@cached_property
def lid_to_idx(self):
def _lid_to_idx(self):
d = defaultdict(set)
for idx, lid in self.idx_to_lid.items():
for idx, lid in self._idx_to_lid.items():
d[lid].add(idx)
return d

@cached_property
def idx_to_ident(self):
def _idx_to_ident(self):
d = defaultdict(set)
for ident, idx in self.ident_to_idx.items():
for ident, idx in self._ident_to_idx.items():
assert idx not in d
d[idx] = ident
return d
Expand Down
Loading

0 comments on commit 29fd59d

Please sign in to comment.