Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] add support for sourmash sketch fromfile functionality. #1884

Merged
merged 8 commits into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions src/sourmash/command_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,60 @@ def __init__(self, ksizes, seed, protein, dayhoff, hp, dna, num_hashes, track_ab
self.track_abundance = track_abundance
self.scaled = scaled

def to_param_str(self):
    """Convert object to equivalent params str.

    The result is a comma-separated string made of, in order: the
    molecule type, one ``k=<size>`` entry per k-mer size, either
    ``num=<n>`` or ``scaled=<n>``, ``abund`` if abundance tracking is on,
    and ``seed=<n>`` if the seed is not 42. For example:
    ``"dna,k=21,k=31,scaled=1000,abund"``.

    Raises:
        ValueError: if no molecule type flag is set, if there are no
            k-mer sizes, or if both ``num_hashes`` and ``scaled`` are 0.
            (Explicit raises are used instead of ``assert`` so the checks
            still run under ``python -O``.)
    """
    pi = []

    # Exactly one molecule type is reported; the first flag set wins.
    if self.dna:
        pi.append("dna")
    elif self.protein:
        pi.append("protein")
    elif self.hp:
        pi.append("hp")
    elif self.dayhoff:
        pi.append("dayhoff")
    else:
        raise ValueError(
            "no molecule type set; must be one of dna, protein, hp, dayhoff"
        )

    if self.dna:
        kstr = [f"k={k}" for k in self.ksizes]
    else:
        # for protein, divide ksize by three.
        kstr = [f"k={k//3}" for k in self.ksizes]
    if not kstr:
        raise ValueError("no k-mer sizes set; at least one is required")
    pi.extend(kstr)

    # num takes precedence over scaled when both are non-zero.
    if self.num_hashes != 0:
        pi.append(f"num={self.num_hashes}")
    elif self.scaled != 0:
        pi.append(f"scaled={self.scaled}")
    else:
        raise ValueError("neither num nor scaled is set; one must be non-zero")

    if self.track_abundance:
        pi.append("abund")
    # noabund is default, so it is never written out.

    # The seed is only written out when it is not 42.
    if self.seed != 42:
        pi.append(f"seed={self.seed}")

    return ",".join(pi)

def __repr__(self):
    "Return a constructor-style string listing every parameter in positional order."
    # Same order as the positional arguments shown in the output string.
    values = (
        self.ksizes,
        self.seed,
        self.protein,
        self.dayhoff,
        self.hp,
        self.dna,
        self.num_hashes,
        self.track_abundance,
        self.scaled,
    )
    # format(v) is what an f-string placeholder applies to each value.
    return "ComputeParameters(" + ", ".join(map(format, values)) + ")"

def __eq__(self, other):
    """Return True when every sketch parameter of ``other`` matches ours.

    The parameters compared are ksizes, seed, the four molecule type
    flags, num_hashes, track_abundance and scaled. If ``other`` lacks any
    of these attributes (e.g. ``None`` or an unrelated type),
    ``NotImplemented`` is returned so Python falls back to its default
    comparison instead of raising ``AttributeError``.

    NOTE(review): a class that defines ``__eq__`` without ``__hash__`` is
    unhashable; confirm whether the enclosing class needs ``__hash__``.
    """
    names = (
        "ksizes",
        "seed",
        "protein",
        "dayhoff",
        "hp",
        "dna",
        "num_hashes",
        "track_abundance",
        "scaled",
    )
    # Only lookups on `other` are guarded: a missing attribute there means
    # "not comparable", whereas one missing on self is a real bug.
    try:
        theirs = [getattr(other, name) for name in names]
    except AttributeError:
        return NotImplemented

    return all(
        getattr(self, name) == value for name, value in zip(names, theirs)
    )

@staticmethod
def from_args(args):
ptr = lib.computeparams_new()
Expand Down
61 changes: 34 additions & 27 deletions src/sourmash/command_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ def _parse_params_str(params_str):

class _signatures_for_sketch_factory(object):
"Build sigs on demand, based on args input to 'sketch'."
def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):

def __init__(self, params_str_list, default_moltype):
# first, set up defaults per-moltype
defaults = {}
for moltype, pstr in DEFAULTS.items():
Expand All @@ -94,7 +93,7 @@ def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):

# next, fill out params_list
self.params_list = []
self.mult_ksize_by_3 = mult_ksize_by_3
self.mult_ksize_by_3 = True

if params_str_list:
# parse each params_str passed in, using default_moltype if none
Expand All @@ -103,17 +102,21 @@ def __init__(self, params_str_list, default_moltype, mult_ksize_by_3):
moltype, params = _parse_params_str(params_str)
if moltype and moltype != 'dna' and default_moltype == 'dna':
raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?")
elif moltype == 'dna' and default_moltype != 'dna':
elif moltype == 'dna' and default_moltype and default_moltype != 'dna':
raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'")
elif moltype is None:
if default_moltype is None:
raise ValueError(f"No default moltype and none specified in param string")
moltype = default_moltype

self.params_list.append((moltype, params))
else:
if default_moltype is None:
raise ValueError(f"No default moltype and none specified in param string")
# no params str? default to a single sig, using default_moltype.
self.params_list.append((default_moltype, {}))

def get_compute_params(self):
def get_compute_params(self, *, split_ksizes=False):
for moltype, params_d in self.params_list:
# get defaults for this moltype from self.defaults:
default_params = self.defaults[moltype]
Expand All @@ -134,26 +137,33 @@ def get_compute_params(self):
if not ksizes:
ksizes = def_ksizes

if self.mult_ksize_by_3:
# 'command sketch' adjusts k-mer sizes by 3 if non-DNA sketch.
if self.mult_ksize_by_3 and not def_dna:
ksizes = [ k*3 for k in ksizes ]

params_obj = ComputeParameters(ksizes,
params_d.get('seed', def_seed),
def_protein,
def_dayhoff,
def_hp,
def_dna,
params_d.get('num', def_num),
params_d.get('track_abundance',
def_abund),
params_d.get('scaled', def_scaled))

yield params_obj

def __call__(self):
make_param = lambda ksizes: ComputeParameters(ksizes,
params_d.get('seed', def_seed),
def_protein,
def_dayhoff,
def_hp,
def_dna,
params_d.get('num', def_num),
params_d.get('track_abundance',
def_abund),
params_d.get('scaled', def_scaled))

if split_ksizes:
for ksize in ksizes:
params_obj = make_param([ksize])
yield params_obj
else:
params_obj = make_param(ksizes)
yield params_obj

def __call__(self, *, split_ksizes=False):
"Produce a new set of signatures built to match the param strings."
sigs = []
for params in self.get_compute_params():
for params in self.get_compute_params(split_ksizes=split_ksizes):
sig = SourmashSignature.from_params(params)
sigs.append(sig)

Expand Down Expand Up @@ -214,8 +224,7 @@ def dna(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
'dna',
mult_ksize_by_3=False)
'dna')
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down Expand Up @@ -244,8 +253,7 @@ def protein(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
moltype,
mult_ksize_by_3=True)
moltype)
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down Expand Up @@ -274,8 +282,7 @@ def translate(args):

try:
signatures_factory = _signatures_for_sketch_factory(args.param_string,
moltype,
mult_ksize_by_3=True)
moltype)
except ValueError as e:
error(f"Error creating signatures: {str(e)}")
sys.exit(-1)
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None,

def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0,
containment=False, abund=None, picklist=None):
"Check that the given signature matches the specificed requirements."
"Check that the given signature matches the specified requirements."
# ksize match?
if ksize and ksize != ss.minhash.ksize:
return False
Expand Down
Loading