Skip to content

Commit

Permalink
[WIP] add support for sourmash sketch fromfile functionality. (#1884)
Browse files Browse the repository at this point in the history
* upgrade ComputeParameters with __repr__ and __eq__

* cleanup and refactor

* add tests for new behavior

* finish tests

* add ComputeParameters.to_param_str

* fix spelling

* a fix, and some tests

* add tests for to_param_str
  • Loading branch information
ctb committed Mar 14, 2022
1 parent dbda4ef commit 8f4c94c
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 54 deletions.
54 changes: 54 additions & 0 deletions src/sourmash/command_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,60 @@ def __init__(self, ksizes, seed, protein, dayhoff, hp, dna, num_hashes, track_ab
self.track_abundance = track_abundance
self.scaled = scaled

def to_param_str(self):
    """Convert this object to an equivalent comma-separated params string.

    The pieces are emitted in a fixed order: the molecule type, one
    ``k=`` entry per ksize, either ``num=`` or ``scaled=``, then
    ``abund`` if abundance tracking is on, then ``seed=`` if the seed is
    not 42.

    Only one molecule type is emitted. If several flags are set, the
    first of dna, protein, hp, dayhoff wins.

    Returns:
        str: for example ``"dna,k=21,k=31,scaled=1000"``.

    Raises:
        AssertionError: if no molecule type flag is set, if there are no
            ksizes, or if ``num_hashes`` and ``scaled`` are both zero.
    """
    # These checks raise explicitly instead of using ``assert`` statements
    # so that they still run under ``python -O``. The exception type is
    # the same one an ``assert`` would have produced.
    if self.dna:
        moltype = "dna"
    elif self.protein:
        moltype = "protein"
    elif self.hp:
        moltype = "hp"
    elif self.dayhoff:
        moltype = "dayhoff"
    else:
        raise AssertionError("no molecule type is set on these parameters")
    items = [moltype]

    if self.dna:
        ksize_items = [f"k={k}" for k in self.ksizes]
    else:
        # For non-DNA molecule types, divide the ksize by three.
        ksize_items = [f"k={k // 3}" for k in self.ksizes]
    if not ksize_items:
        raise AssertionError("no ksizes are set on these parameters")
    items.extend(ksize_items)

    # num takes precedence over scaled when both are non-zero.
    if self.num_hashes != 0:
        items.append(f"num={self.num_hashes}")
    elif self.scaled != 0:
        items.append(f"scaled={self.scaled}")
    else:
        raise AssertionError("neither num_hashes nor scaled is set")

    # 'noabund' is the default, so only the positive case is written out.
    if self.track_abundance:
        items.append("abund")

    # A seed of 42 is left implicit (presumably the default seed).
    if self.seed != 42:
        items.append(f"seed={self.seed}")

    return ",".join(items)

def __repr__(self):
    """Return a constructor-style string listing all nine parameters.

    The values appear in the order ksizes, seed, protein, dayhoff, hp,
    dna, num_hashes, track_abundance, scaled. The class name is taken
    from the instance so that subclasses are labelled correctly.
    """
    values = (self.ksizes, self.seed, self.protein, self.dayhoff, self.hp,
              self.dna, self.num_hashes, self.track_abundance, self.scaled)
    joined = ", ".join(f"{value}" for value in values)
    return f"{type(self).__name__}({joined})"

def __eq__(self, other):
    """Compare two parameter objects field by field.

    Returns:
        True when all nine parameters (ksizes, seed, protein, dayhoff,
        hp, dna, num_hashes, track_abundance, scaled) are equal, False
        when any differ, and ``NotImplemented`` when ``other`` is not an
        instance of this object's class.
    """
    # Returning NotImplemented lets ``==`` / ``!=`` against unrelated
    # objects fall back to Python's default handling instead of raising
    # AttributeError on the first missing attribute.
    if not isinstance(other, type(self)):
        return NotImplemented

    compared = ("ksizes", "seed", "protein", "dayhoff", "hp", "dna",
                "num_hashes", "track_abundance", "scaled")
    return all(getattr(self, name) == getattr(other, name)
               for name in compared)

@staticmethod
def from_args(args):
ptr = lib.computeparams_new()
Expand Down
61 changes: 34 additions & 27 deletions src/sourmash/command_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ def _parse_params_str(params_str):

class _signatures_for_sketch_factory(object):
"Build sigs on demand, based on args input to 'sketch'."
def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):

def __init__(self, params_str_list, default_moltype):
# first, set up defaults per-moltype
defaults = {}
for moltype, pstr in DEFAULTS.items():
Expand All @@ -94,7 +93,7 @@ def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):

# next, fill out params_list
self.params_list = []
self.mult_ksize_by_3 = mult_ksize_by_3
self.mult_ksize_by_3 = True

if params_str_list:
# parse each params_str passed in, using default_moltype if none
Expand All @@ -103,17 +102,21 @@ def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):
moltype, params = _parse_params_str(params_str)
if moltype and moltype != 'dna' and default_moltype == 'dna':
raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?")
elif moltype == 'dna' and default_moltype != 'dna':
elif moltype == 'dna' and default_moltype and default_moltype != 'dna':
raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'")
elif moltype is None:
if default_moltype is None:
raise ValueError(f"No default moltype and none specified in param string")
moltype = default_moltype

self.params_list.append((moltype, params))
else:
if default_moltype is None:
raise ValueError(f"No default moltype and none specified in param string")
# no params str? default to a single sig, using default_moltype.
self.params_list.append((default_moltype, {}))

def get_compute_params(self):
def get_compute_params(self, *, split_ksizes=False):
for moltype, params_d in self.params_list:
# get defaults for this moltype from self.defaults:
default_params = self.defaults[moltype]
Expand All @@ -134,26 +137,33 @@ def get_compute_params(self):
if not ksizes:
ksizes = def_ksizes

if self.mult_ksize_by_3:
# 'command sketch' adjusts k-mer sizes by 3 if non-DNA sketch.
if self.mult_ksize_by_3 and not def_dna:
ksizes = [ k*3 for k in ksizes ]

params_obj = ComputeParameters(ksizes,
params_d.get('seed', def_seed),
def_protein,
def_dayhoff,
def_hp,
def_dna,
params_d.get('num', def_num),
params_d.get('track_abundance',
def_abund),
params_d.get('scaled', def_scaled))

yield params_obj

def __call__(self):
make_param = lambda ksizes: ComputeParameters(ksizes,
params_d.get('seed', def_seed),
def_protein,
def_dayhoff,
def_hp,
def_dna,
params_d.get('num', def_num),
params_d.get('track_abundance',
def_abund),
params_d.get('scaled', def_scaled))

if split_ksizes:
for ksize in ksizes:
params_obj = make_param([ksize])
yield params_obj
else:
params_obj = make_param(ksizes)
yield params_obj

def __call__(self, *, split_ksizes=False):
"Produce a new set of signatures built to match the param strings."
sigs = []
for params in self.get_compute_params():
for params in self.get_compute_params(split_ksizes=split_ksizes):
sig = SourmashSignature.from_params(params)
sigs.append(sig)

Expand Down Expand Up @@ -214,8 +224,7 @@ def dna(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
'dna',
mult_ksize_by_3=False)
'dna')
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down Expand Up @@ -244,8 +253,7 @@ def protein(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
moltype,
mult_ksize_by_3=True)
moltype)
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down Expand Up @@ -274,8 +282,7 @@ def translate(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
moltype,
mult_ksize_by_3=True)
moltype)
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None,

def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0,
containment=False, abund=None, picklist=None):
"Check that the given signature matches the specificed requirements."
"Check that the given signature matches the specified requirements."
# ksize match?
if ksize and ksize != ss.minhash.ksize:
return False
Expand Down
Loading

0 comments on commit 8f4c94c

Please sign in to comment.