From f7987a51d122fa4ec4f940cb054af6ac623d3149 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 11 Dec 2019 18:42:20 +0000 Subject: [PATCH 01/10] [MRG] Better molecule type checks (#782) * trigger bug 781 * strict is_protein check * change test value --- sourmash/_minhash.pyx | 20 +++++++------------- sourmash/signature.py | 5 +++++ sourmash/sourmash_args.py | 10 +++++----- tests/test-data/protein_781.sig | 1 + tests/test_bugs.py | 16 ++++++++++++++++ tests/test_sourmash_compute.py | 2 +- 6 files changed, 35 insertions(+), 19 deletions(-) create mode 100644 tests/test-data/protein_781.sig create mode 100644 tests/test_bugs.py diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx index 153833a17b..66d6a357ee 100644 --- a/sourmash/_minhash.pyx +++ b/sourmash/_minhash.pyx @@ -502,17 +502,11 @@ cdef class MinHash(object): def is_molecule_type(self, molecule): if molecule.upper() == 'DNA' and not self.is_protein: return True - if self.is_protein: - if self.dayhoff: - if molecule == 'dayhoff': - return True - else: - if molecule == 'protein': - return True - if self.hp: - if molecule == 'hp': - return True - else: - if molecule == 'protein': - return True + elif self.is_protein and molecule == 'protein' and not any((self.dayhoff, self.hp)): + return True + elif self.dayhoff and molecule == 'dayhoff': + return True + elif self.hp and molecule == 'hp': + return True + return False diff --git a/sourmash/signature.py b/sourmash/signature.py index e724848fe0..fafcfd2ece 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -228,6 +228,11 @@ def load_signatures(data, ksize=None, select_moltype=None, if not ksize or ksize == sig.minhash.ksize: if not select_moltype or \ sig.minhash.is_molecule_type(select_moltype): + if select_moltype == 'protein': + if any(sig.minhash.is_molecule_type(t) for t in ('dayhoff', 'hp')): + # dayhoff and hp are also protein MHs. 
only yield + # sig if it is exactly one of (protein, hp, dayhoff) + continue yield sig except Exception as e: if not quiet: diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 51c50ad39a..5fdb92ea93 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -116,12 +116,12 @@ def add_ksize_arg(parser, default): def get_moltype(sig, require=False): if sig.minhash.is_molecule_type('DNA'): moltype = 'DNA' - elif sig.minhash.is_molecule_type('protein'): - moltype = 'protein' elif sig.minhash.is_molecule_type('dayhoff'): moltype = 'dayhoff' elif sig.minhash.is_molecule_type('hp'): moltype = 'hp' + elif sig.minhash.is_molecule_type('protein'): + moltype = 'protein' else: raise ValueError('unknown molecule type for sig {}'.format(sig.name())) @@ -136,14 +136,14 @@ def calculate_moltype(args, default=None): args.dna = False moltype = default - if args.protein: - moltype = 'protein' - elif args.dna: + if args.dna: moltype = 'DNA' elif args.dayhoff: moltype = 'dayhoff' elif args.hp: moltype = 'hp' + elif args.protein: + moltype = 'protein' return moltype diff --git a/tests/test-data/protein_781.sig b/tests/test-data/protein_781.sig new file mode 100644 index 0000000000..199b618cc2 --- /dev/null +++ b/tests/test-data/protein_781.sig @@ -0,0 +1 @@ 
+[{"class":"sourmash_signature","email":"","filename":"smash-testing/gingivalis/protein/GCA_000007585.1_ASM758v1_protein.faa.gz","hash_function":"0.murmur64","license":"CC0","signatures":[{"ksize":7,"max_hash":9223372036854776,"md5sum":"8f14e45fceea167a5a36dedd4bea2543","mins":[],"molecule":"protein","num":0,"seed":42},{"ksize":7,"max_hash":9223372036854776,"md5sum":"8f14e45fceea167a5a36dedd4bea2543","mins":[],"molecule":"dayhoff","num":0,"seed":42},{"ksize":7,"max_hash":9223372036854776,"md5sum":"8f14e45fceea167a5a36dedd4bea2543","mins":[],"molecule":"hp","num":0,"seed":42},{"ksize":11,"max_hash":9223372036854776,"md5sum":"eddce6d9ac96be04ed806aadf53665d1","mins":[935333424279453,2538868037911012,7421288836408751,7960286979232677,8654912015719540],"molecule":"protein","num":0,"seed":42},{"ksize":11,"max_hash":9223372036854776,"md5sum":"6512bd43d9caa6e02c990b0a82652dca","mins":[],"molecule":"dayhoff","num":0,"seed":42},{"ksize":11,"max_hash":9223372036854776,"md5sum":"6512bd43d9caa6e02c990b0a82652dca","mins":[],"molecule":"hp","num":0,"seed":42},{"ksize":17,"max_hash":9223372036854776,"md5sum":"4e7f2da275e0d3cd3488a7429f23c0e7","mins":[80545341491115,95893197637407,115032912017982,167876181889216,186411802953277,222466650340651,224967183012543,258505356847424,309875701424311,355213055946019,386676749766432,412910340994277,462537020181146,494965971291721,559141879168033,561484232050194,644615490355418,658172460965864,720535257532265,738352407205548,745810431880043,759181896175271,850588167607652,887659205846183,894164001336358,905078595321889,970165616477383,1017856994095351,1028412790624984,1050795138079130,1139932302760059,1186388344819559,1192818915503467,1198212766664640,1216892715424691,1219438185071604,1237070334261946,1387123735448234,1434001421971507,1485329804804021,1506460396271450,1557881761202533,1621319275541247,1652996636263092,1759458369106525,1775445012461498,1778569708504615,1790264270235488,1814476471419496,1836560547212168,1860873491028851,18965698
81953525,1897420327685705,1923345118834304,1936637912433824,1937255081379236,1947138500658091,1975618333674647,2144882621679500,2149108682537798,2161921385516085,2246818310352884,2290340663993180,2304543670495485,2364205861776990,2412514993466726,2452585910741483,2509528096806449,2539884144415183,2541107353815135,2572053948720929,2622479875739911,2647527190109029,2713096700045735,2761919595787895,2867643251943479,2925231867832341,3025758098634438,3033763278178401,3058704182312610,3115261468695424,3166876962072314,3233211247123021,3235567354197305,3243245537683617,3259305016866742,3337410579699057,3418776874767941,3423000771650184,3507216917735704,3595029584959389,3634370995551791,3768235701772236,3775046865943208,3817698542949254,3828452511096857,3833691931514236,4038649211585583,4067919805126295,4097280795487840,4221222492196035,4343951716797518,4367658693942699,4450087885545040,4509903538197388,4513107976808259,4527368785345773,4562209560900537,4639168970839038,4668382471614239,4669352267366042,4679521416287799,4686839507440048,4709562456621538,4724594069568581,4736749713837036,4738083948924482,4759958931942220,4789425451192135,4800497165006063,4820147941509539,4850297769952409,4850600607797904,4868447772682418,4905552855041575,4971930424454368,4985072012605862,4995739741362592,5001422416309153,5009161631707149,5030854929253622,5035217786452507,5065729136452008,5077023046760465,5082191099848633,5176553408615803,5187575110109033,5315950415531956,5380346145125625,5432557230189792,5443962862203433,5504634862131301,5523415209316411,5539462263412859,5578575768148920,5594872593367517,5602170740228452,5627969550666066,5645214919396423,5715244891476417,5716776196844718,5756975294595510,5791158195561112,5804298783161609,5859298762196680,5941703205411035,5984207756172447,6006605546729755,6040624656956813,6079431920736680,6120850189740657,6143463234120488,6403824852243549,6434823327466671,6452924095770729,6486607607852558,6493745783442873,6609514638731456,6621397331436691,66
39512072137206,6651392661523258,6663467668433959,6746635771795261,6774460282204914,6798792706747163,6818258524917823,6853148606664897,6862857851511771,6863419022655845,6884221467246173,6960794347010868,6971091824735669,6996177331699958,7020119488788226,7041050337195108,7093433891425227,7105809197035780,7123502524719655,7152363307939419,7180066834856704,7230192109807876,7268204104636229,7369432586890070,7369545607316536,7452432790013634,7475586378544421,7518091663474830,7555974865996135,7567184934833446,7598164152364795,7691459572863761,7709831060098954,7733841383672718,7823392534007031,7863676561152009,7870327771274750,7886730508543846,7920363556752832,7939090045786917,7956077051018542,8006111745484961,8014778875958321,8022584532939757,8066519196340294,8068684551622667,8139161380118085,8156857187140640,8161593522327026,8243833392562765,8267006323898637,8308586921720912,8354821843549256,8375847106291424,8417181145027020,8463616731079599,8649752374463547,8676699653973820,8678492387990270,8746431528629889,8764389844740277,8806802453026237,8846826235070247,8937418166974287,8969593290271317,8982504059107137,9012010336229099,9027180495854695,9045704090610147,9104390213566336,9118607030191614,9198853303101481],"molecule":"protein","num":0,"seed":42},{"ksize":17,"max_hash":9223372036854776,"md5sum":"61b7ea9355dd355ea5d6670c919a45cf","mins":[1816309557216154,3456445761862780,7650346105396480],"molecule":"dayhoff","num":0,"seed":42},{"ksize":17,"max_hash":9223372036854776,"md5sum":"70efdf2ec9b086079795c442636b55fb","mins":[],"molecule":"hp","num":0,"seed":42}],"version":0.4}] diff --git a/tests/test_bugs.py b/tests/test_bugs.py new file mode 100644 index 0000000000..4635d2d7c9 --- /dev/null +++ b/tests/test_bugs.py @@ -0,0 +1,16 @@ +from __future__ import print_function, unicode_literals +from . 
import sourmash_tst_utils as utils + +def test_bug_781(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('protein_781.sig') + + status, out, err = utils.runscript('sourmash', + ['compare', + '--protein', '--no-dna', '--no-dayhoff', + '--no-hp', '-k', '11', '-o', 'testing', + testdata1], + in_directory=location) + print(out) + print(err) + assert status == 0 diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py index 56d948903b..ff389848e6 100644 --- a/tests/test_sourmash_compute.py +++ b/tests/test_sourmash_compute.py @@ -493,7 +493,7 @@ def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein(): assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2 assert sum(x.minhash.is_molecule_type('hp') for x in siglist) == 2 # 2 = dayhoff, 2 = hp = 4 protein - assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 4 + assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2 def test_do_sourmash_compute_multik_with_nothing(): From 03e5269b63853eac6a2fd36a5950c1eb91b30a57 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 14 Dec 2019 12:59:26 -0800 Subject: [PATCH 02/10] fix some bugs in rankinfo (#797) --- sourmash/lca/command_rankinfo.py | 27 ++++++++++++------ tests/test_lca.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/sourmash/lca/command_rankinfo.py b/sourmash/lca/command_rankinfo.py index d897527d0f..e9faa9b5ba 100644 --- a/sourmash/lca/command_rankinfo.py +++ b/sourmash/lca/command_rankinfo.py @@ -6,12 +6,12 @@ import sys from collections import defaultdict -from ..logging import error, debug, set_quiet +from ..logging import error, debug, set_quiet, notify from . import lca_utils from ..sourmash_args import SourmashArgumentParser -def make_lca_counts(dblist): +def make_lca_counts(dblist, min_num=0): """ Collect counts of all the LCAs in the list of databases. 
@@ -22,10 +22,14 @@ def make_lca_counts(dblist): assignments = defaultdict(set) for lca_db in dblist: for hashval, idx_list in lca_db.hashval_to_idx.items(): + if min_num and len(idx_list) < min_num: + continue + for idx in idx_list: - lid = lca_db.idx_to_lid[idx] - lineage = lca_db.lid_to_lineage[lid] - assignments[hashval].add(lineage) + lid = lca_db.idx_to_lid.get(idx) + if lid is not None: + lineage = lca_db.lid_to_lineage[lid] + assignments[hashval].add(lineage) # now convert to trees -> do LCA & counts counts = defaultdict(int) @@ -55,6 +59,8 @@ def rankinfo_main(args): help='suppress non-error output') p.add_argument('-d', '--debug', action='store_true', help='output debugging output') + p.add_argument('--minimum-num', type=int, default=0, + help='Minimum number of different lineages a k-mer must be in to be counted') args = p.parse_args(args) if not args.db: @@ -70,7 +76,7 @@ def rankinfo_main(args): dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled) # count all the LCAs across these databases - counts = make_lca_counts(dblist) + counts = make_lca_counts(dblist, args.minimum_num) # collect counts across all ranks counts_by_rank = defaultdict(int) @@ -81,9 +87,12 @@ def rankinfo_main(args): # output! 
total = float(sum(counts_by_rank.values())) - for rank in lca_utils.taxlist(): - count = counts_by_rank.get(rank, 0) - print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.)) + if total == 0: + notify("(no hashvals with lineages found)") + else: + for rank in lca_utils.taxlist(): + count = counts_by_rank.get(rank, 0) + print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.)) if __name__ == '__main__': diff --git a/tests/test_lca.py b/tests/test_lca.py index fb63d63ade..84febe5928 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -795,6 +795,54 @@ def test_rankinfo_on_single(): assert not lines +def test_rankinfo_no_tax(): + with utils.TempDirectory() as location: + taxcsv = utils.get_test_data('lca/delmont-1.csv') + input_sig = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') + lca_db = os.path.join(location, 'delmont-1.lca.json') + + cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + status, out, err = utils.runscript('sourmash', cmd) + + print(cmd) + print(out) + print(err) + + assert os.path.exists(lca_db) + + assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err + assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err + assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' 
in err + + cmd = ['lca', 'rankinfo', lca_db] + status, out, err = utils.runscript('sourmash', cmd) + + +def test_rankinfo_with_min(): + with utils.TempDirectory() as location: + db1 = utils.get_test_data('lca/dir1.lca.json') + db2 = utils.get_test_data('lca/dir2.lca.json') + + cmd = ['lca', 'rankinfo', db1, db2, '--minimum-num', '1'] + status, out, err = utils.runscript('sourmash', cmd) + + print(cmd) + print(out) + print(err) + + lines = out.splitlines() + lines.remove('superkingdom: 0 (0.0%)') + lines.remove('phylum: 464 (12.8%)') + lines.remove('class: 533 (14.7%)') + lines.remove('order: 1050 (29.0%)') + lines.remove('family: 695 (19.2%)') + lines.remove('genus: 681 (18.8%)') + lines.remove('species: 200 (5.5%)') + lines.remove('strain: 0 (0.0%)') + + assert not lines + + def test_compare_csv(): with utils.TempDirectory() as location: a = utils.get_test_data('lca/classify-by-both.csv') From ec8a00cf9105f716bb97d2e950403c26230ed889 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 16 Dec 2019 01:24:35 +0000 Subject: [PATCH 03/10] Create an Index abstract base class (#556) * start adding code for LinearIndex.search * an initial test of LinearIndex.search * implement save & load for LinearIndex * add test for LinearIndex.load * implemented & tested LinearIndex.gather * implement LinearIndex in load_databases and search functions * implemented LinearIndex for gather, too * implemented search in LCA db * implemented gather on LCA DBs * implemented gather on SBT * implemented search on SBTs * removed conditionals in search & gather in favor of Index interface * rely on 'Index.gather' returning actual matches * remove duplicate SearchResult, clean up & rationalize SearchResult and GatherResult * display full order of sigs in failed tests * add signatures() iterator to Index objects * move search, gather functions into base Index class * clean up signature loading * add signatures() method to both LCA and SBT indices --- setup.py | 3 +- sourmash/commands.py | 25 
++-- sourmash/index.py | 127 +++++++++++++++++++ sourmash/lca/lca_utils.py | 86 ++++++++++--- sourmash/sbt.py | 98 ++++++++++++-- sourmash/sbtmh.py | 19 ++- sourmash/search.py | 261 ++++++++++++++------------------------ sourmash/sourmash_args.py | 20 +-- tests/test_index.py | 217 +++++++++++++++++++++++++++++++ tests/test_lca.py | 78 ++++++++++++ tests/test_sbt.py | 67 ++++++---- 11 files changed, 750 insertions(+), 251 deletions(-) create mode 100644 sourmash/index.py create mode 100644 tests/test_index.py diff --git a/setup.py b/setup.py index c811ff8791..6b43497c4a 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,8 @@ language="c++", extra_compile_args=EXTRA_COMPILE_ARGS, extra_link_args=EXTRA_LINK_ARGS)], - "install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', 'matplotlib', 'scipy'], + "install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', + 'matplotlib', 'scipy', "deprecation>=2.0.6"], "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0", 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, diff --git a/sourmash/commands.py b/sourmash/commands.py index 3d6dfd12a9..ac81f816b5 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -450,8 +450,7 @@ def index(args): ss.minhash = ss.minhash.downsample_scaled(args.scaled) scaleds.add(ss.minhash.scaled) - leaf = SigLeaf(ss.md5sum(), ss) - tree.add_node(leaf) + tree.insert(ss) n += 1 if not ss: @@ -545,6 +544,10 @@ def search(args): not args.containment, args.traverse_directory) + # forcibly ignore abundances if query has no abundances + if not query.minhash.track_abundance: + args.ignore_abundance = True + if not len(databases): error('Nothing found to search!') sys.exit(-1) @@ -570,7 +573,7 @@ def search(args): print_results("---------- -----") for sr in results[:n_matches]: pct = '{:.1f}%'.format(sr.similarity*100) - name = sr.match_sig._display_name(60) + name = sr.match._display_name(60) 
print_results('{:>6} {}', pct, name) if args.best_only: @@ -583,14 +586,14 @@ def search(args): w.writeheader() for sr in results: d = dict(sr._asdict()) - del d['match_sig'] + del d['match'] w.writerow(d) # save matching signatures upon request if args.save_matches: outname = args.save_matches.name notify('saving all matched signatures to "{}"', outname) - sig.save_signatures([ sr.match_sig for sr in results ], + sig.save_signatures([ sr.match for sr in results ], args.save_matches) @@ -758,7 +761,7 @@ def gather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -786,13 +789,13 @@ def gather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. w.writerow(d) if found and args.save_matches: outname = args.save_matches.name notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], args.save_matches) + sig.save_signatures([ r.match for r in found ], args.save_matches) if args.output_unassigned: if not len(query.minhash): @@ -906,7 +909,7 @@ def multigather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -941,14 +944,14 @@ def multigather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. 
w.writerow(d) output_matches = output_base + '.matches.sig' with open(output_matches, 'wt') as fp: outname = output_matches notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], fp) + sig.save_signatures([ r.match for r in found ], fp) output_unassigned = output_base + '.unassigned.sig' with open(output_unassigned, 'wt') as fp: diff --git a/sourmash/index.py b/sourmash/index.py new file mode 100644 index 0000000000..df1f58e89b --- /dev/null +++ b/sourmash/index.py @@ -0,0 +1,127 @@ +"An Abstract Base Class for collections of signatures." + +from abc import ABCMeta, abstractmethod +from collections import namedtuple + +# compatible with Python 2 *and* 3: +ABC = ABCMeta("ABC", (object,), {"__slots__": ()}) + + +class Index(ABC): + @abstractmethod + def signatures(self): + "Return an iterator over all signatures in the Index object." + + @abstractmethod + def insert(self, signature): + """ """ + + @abstractmethod + def save(self, path, storage=None, sparseness=0.0, structure_only=False): + """ """ + + @classmethod + @abstractmethod + def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): + """ """ + + def find(self, search_fn, *args, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn(other_sig, *args) should return a boolean that indicates + whether other_sig is a match. + + Returns a list. + """ + + matches = [] + + for node in self.signatures(): + if search_fn(node, *args): + matches.append(node) + return matches + + def search(self, query, *args, **kwargs): + """Return set of matches with similarity above 'threshold'. + + Results will be sorted by similarity, highest to lowest. + + Optional arguments accepted by all Index subclasses: + * do_containment: default False. If True, use Jaccard containment. + * best_only: default False. If True, allow optimizations that + may. May discard matches better than threshold, but first match + is guaranteed to be best. 
+ * ignore_abundance: default False. If True, and query signature + and database support k-mer abundances, ignore those abundances. + + Note, the "best only" hint is ignored by LinearIndex. + """ + + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] + + do_containment = kwargs.get('do_containment', False) + ignore_abundance = kwargs.get('ignore_abundance', False) + + # configure search - containment? ignore abundance? + if do_containment: + query_match = lambda x: query.contained_by(x, downsample=True) + else: + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + + # do the actual search: + matches = [] + + for ss in self.signatures(): + similarity = query_match(ss) + if similarity >= threshold: + matches.append((similarity, ss, self.filename)) + + # sort! + matches.sort(key=lambda x: -x[0]) + return matches + + def gather(self, query, *args, **kwargs): + "Return the match with the best Jaccard containment in the Index." 
+ results = [] + for ss in self.signatures(): + cont = query.minhash.containment_ignore_maxhash(ss.minhash) + if cont: + results.append((cont, ss, self.filename)) + + results.sort(reverse=True, key=lambda x: (x[0], x[1].name())) + + return results + + +class LinearIndex(Index): + def __init__(self, _signatures=None, filename=None): + self._signatures = [] + if _signatures: + self._signatures = list(_signatures) + self.filename = filename + + def signatures(self): + return iter(self._signatures) + + def __len__(self): + return len(self._signatures) + + def insert(self, node): + self._signatures.append(node) + + def save(self, path): + from .signature import save_signatures + with open(path, 'wt') as fp: + save_signatures(self.signatures(), fp) + + @classmethod + def load(cls, location): + from .signature import load_signatures + si = load_signatures(location) + + lidx = LinearIndex(si, filename=location) + return lidx diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 06b54ffd1b..a72f0f71ee 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -19,6 +19,7 @@ from .._minhash import get_max_hash_for_scaled from ..logging import notify, error, debug +from ..index import Index # type to store an element in a taxonomic lineage LineagePair = namedtuple('LineagePair', ['rank', 'name']) @@ -138,7 +139,7 @@ def find_lca(tree): return tuple(lineage), len(node) -class LCA_Database(object): +class LCA_Database(Index): """ Wrapper class for taxonomic database. @@ -163,6 +164,12 @@ def __init__(self): def __repr__(self): return "LCA_Database('{}')".format(self.filename) + def signatures(self): + from .. import SourmashSignature + self._create_signatures() + for v in self._signatures.values(): + yield SourmashSignature(v) + def load(self, db_name): "Load from a JSON file." 
xopen = open @@ -261,10 +268,48 @@ def save(self, db_name): json.dump(save_d, fp) + def search(self, query, *args, **kwargs): + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] + do_containment = kwargs.get('do_containment', False) + ignore_abundance = kwargs.get('ignore_abundance', True) + if not ignore_abundance: + raise TypeError("'search' on LCA databases does not use abundance") + + results = [] + for x in self.find_signatures(query.minhash, threshold, do_containment): + (score, match, filename) = x + results.append((score, match, filename)) + + results.sort(key=lambda x: -x[0]) + return results + + def gather(self, query, *args, **kwargs): + results = [] + for x in self.find_signatures(query.minhash, 0.0, + containment=True, ignore_scaled=True): + (score, match, filename) = x + if score: + results.append((score, match, filename)) + + return results + + def insert(self, node): + raise NotImplementedError + + def find(self, search_fn, *args, **kwargs): + raise NotImplementedError + def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes that don't fall in the required range. + + NOTE: we probably need to invalidate some of the dynamically + calculated members of this object, like _signatures, when we do this. + But we aren't going to right now. """ if scaled == self.scaled: return @@ -294,17 +339,13 @@ def get_lineage_assignments(self, hashval): return x - def find(self, minhash, threshold, containment=False, ignore_scaled=False): - """ - Do a Jaccard similarity or containment search. 
- """ - # make sure we're looking at the same scaled value as database - if self.scaled > minhash.scaled: - minhash = minhash.downsample_scaled(self.scaled) - elif self.scaled < minhash.scaled and not ignore_scaled: - raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + def _create_signatures(self): + "Create a _signatures member dictionary that contains {idx: minhash}." + from .. import MinHash + + if not hasattr(self, '_signatures'): + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) - if not hasattr(self, 'signatures'): debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -312,9 +353,23 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): for vv in v: sigd[vv].add_hash(k) - self.signatures = sigd + self._signatures = sigd + + debug('=> {} signatures!', len(self._signatures)) + + def find_signatures(self, minhash, threshold, containment=False, + ignore_scaled=False): + """ + Do a Jaccard similarity or containment search. + """ + # make sure we're looking at the same scaled value as database + if self.scaled > minhash.scaled: + minhash = minhash.downsample_scaled(self.scaled) + elif self.scaled < minhash.scaled and not ignore_scaled: + # note that containment can be calculated w/o matching scaled. 
+ raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) - debug('=> {} signatures!', len(self.signatures)) + self._create_signatures() # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): @@ -340,7 +395,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): name = self.ident_to_name[ident] debug('looking at {} ({})', ident, name) - match_mh = self.signatures[idx] + match_mh = self._signatures[idx] match_size = len(match_mh) debug('count: {}; query_mins: {}; match size: {}', @@ -354,11 +409,10 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('score: {} (containment? {})', score, containment) if score >= threshold: - # reconstruct signature... ugh. from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) - yield score, match_sig, match_sig.md5sum(), self.filename, name + yield score, match_sig, self.filename def load_single_database(filename, verbose=False): diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 2a48874a39..ddc2f617b9 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -10,7 +10,7 @@ graph1 = factory() # ... add stuff to graph1 ... 
leaf1 = Leaf("a", graph1) - root.add_node(leaf1) + root.insert(leaf1) For example, :: @@ -26,7 +26,7 @@ graph = factory() graph.consume_fasta(filename) leaf = Leaf(filename, graph) - root.add_node(leaf) + root.insert(leaf) then define a search function, :: @@ -57,6 +57,7 @@ def search_transcript(node, seq, threshold): import sys from tempfile import NamedTemporaryFile +from deprecation import deprecated import khmer try: @@ -66,7 +67,7 @@ def search_transcript(node, seq, threshold): from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage from .logging import error, notify, debug - +from .index import Index STORAGES = { 'TarStorage': TarStorage, @@ -102,7 +103,7 @@ def init_args(self): return (self.ksize, self.starting_size, self.n_tables) -class SBT(object): +class SBT(Index): """A Sequence Bloom Tree implementation allowing generic internal nodes and leaves. The default node and leaf format is a Bloom Filter (like the original implementation), @@ -133,6 +134,10 @@ def __init__(self, factory, d=2, storage=None): self.next_node = 0 self.storage = storage + def signatures(self): + for k in self.leaves(): + yield k.data + def new_node_pos(self, node): if not self._nodes: self.next_node = 1 @@ -160,13 +165,20 @@ def new_node_pos(self, node): return self.next_node - def add_node(self, leaf): - pos = self.new_node_pos(leaf) + def insert(self, signature): + "Add a new SourmashSignature in to the SBT." + from .sbtmh import SigLeaf + + leaf = SigLeaf(signature.name(), signature) + self.add_node(leaf) + + def add_node(self, node): + pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. n = Node(self.factory, name="internal." 
+ str(pos)) self._nodes[0] = n - pos = self.new_node_pos(leaf) + pos = self.new_node_pos(node) # Cases: # 1) parent is a Leaf (already covered) @@ -186,26 +198,26 @@ def add_node(self, leaf): c1, c2 = self.children(p.pos)[:2] self._leaves[c1.pos] = p.node - self._leaves[c2.pos] = leaf + self._leaves[c2.pos] = node del self._leaves[p.pos] - for child in (p.node, leaf): + for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self._leaves[pos] = leaf - leaf.update(p.node) + self._leaves[pos] = node + node.update(p.node) elif p.node is None: n = Node(self.factory, name="internal." + str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] - self._leaves[c1.pos] = leaf - leaf.update(n) + self._leaves[c1.pos] = node + node.update(n) # update all parents! p = self.parent(p.pos) while p: self._rebuild_node(p.pos) - leaf.update(self._nodes[p.pos]) + node.update(self._nodes[p.pos]) p = self.parent(p.pos) def find(self, search_fn, *args, **kwargs): @@ -251,6 +263,64 @@ def find(self, search_fn, *args, **kwargs): queue.extend(c.pos for c in self.children(node_p)) return matches + def search(self, query, *args, **kwargs): + from .sbtmh import search_minhashes, search_minhashes_containment + from .sbtmh import SearchMinHashesFindBest + from .signature import SourmashSignature + + threshold = kwargs['threshold'] + ignore_abundance = kwargs['ignore_abundance'] + do_containment = kwargs['do_containment'] + best_only = kwargs['best_only'] + + search_fn = search_minhashes + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + if do_containment: + search_fn = search_minhashes_containment + query_match = lambda x: query.contained_by(x, downsample=True) + + if best_only: # this needs to be reset for each SBT + search_fn = SearchMinHashesFindBest().search + + # figure out scaled value of tree, downsample query if needed. 
+ leaf = next(iter(self.leaves())) + tree_mh = leaf.data.minhash + + tree_query = query + if tree_mh.scaled and query.minhash.scaled and \ + tree_mh.scaled > query.minhash.scaled: + resampled_query_mh = tree_query.minhash + resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) + tree_query = SourmashSignature(resampled_query_mh) + + # now, search! + results = [] + for leaf in self.find(search_fn, tree_query, threshold): + similarity = query_match(leaf.data) + + # tree search should always/only return matches above threshold + assert similarity >= threshold + + results.append((similarity, leaf.data, None)) + + return results + + + def gather(self, query, *args, **kwargs): + from .sbtmh import GatherMinHashes + # use a tree search function that keeps track of its best match. + search_fn = GatherMinHashes().search + + results = [] + for leaf in self.find(search_fn, query, 0.0): + leaf_e = leaf.data.minhash + similarity = query.minhash.containment_ignore_maxhash(leaf_e) + if similarity > 0.0: + results.append((similarity, leaf.data, None)) + + return results + def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 066b2a952d..44067d8965 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -203,9 +203,9 @@ def search_minhashes_containment(node, sig, threshold, return 0 -class GatherMinHashesFindBestIgnoreMaxHash(object): - def __init__(self, initial_best_match=0.0): - self.best_match = initial_best_match +class GatherMinHashes(object): + def __init__(self): + self.best_match = 0 def search(self, node, query, threshold, results=None): score = 0 @@ -235,12 +235,11 @@ def search(self, node, query, threshold, results=None): if results is not None: results[node.name] = score - if score >= threshold: - # have we done better than this? if no, truncate searches below. - if score >= self.best_match: - # update best if it's a leaf node... 
- if isinstance(node, SigLeaf): - self.best_match = score - return 1 + # have we done better than this? if no, truncate searches below. + if score >= self.best_match: + # update best if it's a leaf node... + if isinstance(node, SigLeaf): + self.best_match = score + return 1 return 0 diff --git a/sourmash/search.py b/sourmash/search.py index 2106405812..7694bdfdfc 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -4,14 +4,12 @@ from .logging import notify, error from .signature import SourmashSignature -from .sbtmh import search_minhashes, search_minhashes_containment -from .sbtmh import SearchMinHashesFindBest, GatherMinHashesFindBestIgnoreMaxHash from ._minhash import get_max_hash_for_scaled -# generic SearchResult across individual signatures + SBTs. +# generic SearchResult. SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') + 'similarity, match, md5, filename, name') def format_bp(bp): @@ -30,184 +28,113 @@ def format_bp(bp): def search_databases(query, databases, threshold, do_containment, best_only, ignore_abundance): - # set up the search & score function(s) - similarity vs containment - search_fn = search_minhashes - query_match = lambda x: query.similarity( - x, downsample=True, ignore_abundance=ignore_abundance) - if do_containment: - search_fn = search_minhashes_containment - query_match = lambda x: query.contained_by(x, downsample=True) - results = [] found_md5 = set() for (obj, filename, filetype) in databases: - if filetype == 'SBT': - if best_only: # this needs to be reset for each SBT - search_fn = SearchMinHashesFindBest().search - - tree = obj - - # figure out scaled value of tree, downsample query if needed. 
- leaf = next(iter(tree.leaves())) - tree_mh = leaf.data.minhash - - tree_query = query - if tree_mh.scaled and query.minhash.scaled and \ - tree_mh.scaled > query.minhash.scaled: - resampled_query_mh = tree_query.minhash - resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) - tree_query = SourmashSignature(resampled_query_mh) - - # now, search! - for leaf in tree.find(search_fn, tree_query, threshold): - similarity = query_match(leaf.data) - - # tree search should always/only return matches above threshold - assert similarity >= threshold - - if leaf.data.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=leaf.data, - md5=leaf.data.md5sum(), - filename=filename, - name=leaf.data.name()) - found_md5.add(sr.md5) - results.append(sr) - - elif filetype == 'LCA': - lca_db = obj - for x in lca_db.find(query.minhash, threshold, do_containment): - (score, match_sig, md5, filename, name) = x - if md5 not in found_md5: - sr = SearchResult(similarity=score, - match_sig=match_sig, - md5=md5, - filename=filename, - name=name) - found_md5.add(sr.md5) - results.append(sr) - - else: # list of signatures - for ss in obj: - similarity = query_match(ss) - if similarity >= threshold and \ - ss.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=ss, - md5=ss.md5sum(), - filename=filename, - name=ss.name()) - found_md5.add(sr.md5) - results.append(sr) - + search_iter = obj.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance, + best_only=best_only) + for (similarity, match, filename) in search_iter: + md5 = match.md5sum() + if md5 not in found_md5: + results.append((similarity, match, filename)) + found_md5.add(md5) # sort results on similarity (reverse) - results.sort(key=lambda x: -x.similarity) + results.sort(key=lambda x: -x[0]) - return results + x = [] + for (similarity, match, filename) in results: + x.append(SearchResult(similarity=similarity, + 
match=match, + md5=match.md5sum(), + filename=filename, + name=match.name())) + return x +### +### gather code +### -# define a function to build new query object -def build_new_query(to_remove, old_query, scaled=None): - e = old_query.minhash - e.remove_many(to_remove) - if scaled: - e = e.downsample_scaled(scaled) - return SourmashSignature(e) +GatherResult = namedtuple('GatherResult', + 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match') -GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') +# build a new query object, subtracting found mins and downsampling +def _subtract_and_downsample(to_remove, old_query, scaled=None): + mh = old_query.minhash + mh = mh.downsample_scaled(scaled) + mh.remove_many(to_remove) + + return SourmashSignature(mh) + + +def _find_best(dblist, query): + """ + Search for the best containment, return precisely one match. + """ + + best_cont = 0.0 + best_match = None + best_filename = None + + # search across all databases + for (obj, filename, filetype) in dblist: + for cont, match, fname in obj.gather(query): + assert cont + + # note, break ties based on name, to ensure consistent order. + if (cont == best_cont and match.name() < best_match.name()) or \ + cont > best_cont: + # update best match. + best_cont = cont + best_match = match + + # some objects may not have associated filename (e.g. SBTs) + best_filename = fname or filename + + if not best_match: + return None, None, None + + return best_cont, best_match, best_filename def gather_databases(query, databases, threshold_bp, ignore_abundance): - orig_query = query - orig_mins = orig_query.minhash.get_hashes() + """ + Iteratively find the best containment of `query` in all the `databases`, + until we find fewer than `threshold_bp` (estimated) bp in common. 
+ """ + # track original query information for later usage. + track_abundance = query.minhash.track_abundance and not ignore_abundance + orig_mh = query.minhash + orig_mins = orig_mh.get_hashes() orig_abunds = { k: 1 for k in orig_mins } # do we pay attention to abundances? - if orig_query.minhash.track_abundance and not ignore_abundance: + if track_abundance: import numpy as np - orig_abunds = orig_query.minhash.get_mins(with_abundance=True) - - # store the scaled value for the query - orig_scaled = orig_query.minhash.scaled - - # define a function to do a 'best' search and get only top match. - def find_best(dblist, query, remainder): - - # precompute best containment from all of the remainders - best_ctn_sofar = 0.0 - for x in remainder: - ctn = query.minhash.containment_ignore_maxhash(x.minhash) - if ctn > best_ctn_sofar: - best_ctn_sofar = ctn - - results = [] - for (obj, filename, filetype) in dblist: - # search a tree - if filetype == 'SBT': - tree = obj - search_fn = GatherMinHashesFindBestIgnoreMaxHash(best_ctn_sofar).search - - for leaf in tree.find(search_fn, query, best_ctn_sofar): - leaf_e = leaf.data.minhash - similarity = query.minhash.containment_ignore_maxhash(leaf_e) - if similarity > 0.0: - results.append((similarity, leaf.data, filename)) - # or an LCA database - elif filetype == 'LCA': - lca_db = obj - for x in lca_db.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): - (score, match_sig, md5, filename, name) = x - if score > 0.0: - results.append((score, match_sig, filename)) - - # search a signature - else: - for ss in obj: - similarity = query.minhash.containment_ignore_maxhash(ss.minhash) - if similarity > 0.0: - results.append((similarity, ss, filename)) - - if not results: - return None, None, None - - # take the best result - results.sort(key=lambda x: (-x[0], x[1].name())) # reverse sort on similarity, and then on name - best_similarity, best_leaf, filename = results[0] - - for x in results[1:]: - remainder.add(x[1]) - - 
return best_similarity, best_leaf, filename - - - # construct a new query that doesn't have the max_hash attribute set. - query = build_new_query([], orig_query) - - cmp_scaled = 0 - remainder = set() + orig_abunds = orig_mh.get_mins(with_abundance=True) + + cmp_scaled = query.minhash.scaled # initialize with resolution of query while 1: - best_similarity, best_leaf, filename = find_best(databases, query, remainder) - if not best_leaf: # no matches at all! + best_cont, best_match, filename = _find_best(databases, query) + if not best_match: # no matches at all! break # subtract found hashes from search hashes, construct new search query_mins = set(query.minhash.get_hashes()) - found_mins = best_leaf.minhash.get_hashes() - - # figure out what the resolution of the banding on the subject is - if not best_leaf.minhash.max_hash: - error('Best hash match in sbt_gather has no max_hash') - error('Please prepare database of sequences with --scaled') - sys.exit(-1) + found_mins = best_match.minhash.get_hashes() - match_scaled = best_leaf.minhash.scaled + # Is the best match computed with scaled? Die if not. + match_scaled = best_match.minhash.scaled + if not match_scaled: + error('Best match in gather is not scaled.') + error('Please prepare gather databases with --scaled') + raise Exception # pick the highest scaled / lowest resolution - cmp_scaled = max(cmp_scaled, match_scaled, orig_scaled) + cmp_scaled = max(cmp_scaled, match_scaled) # eliminate mins under this new resolution. 
# (CTB note: this means that if a high scaled/low res signature is @@ -234,7 +161,7 @@ def find_best(dblist, query, remainder): f_orig_query = len(intersect_orig_mins) / float(len(orig_mins)) # calculate fractions wrt second denominator - metagenome size - orig_mh = orig_query.minhash.downsample_scaled(cmp_scaled) + orig_mh = orig_mh.downsample_scaled(cmp_scaled) query_n_mins = len(orig_mh) f_unique_to_query = len(intersect_mins) / float(query_n_mins) @@ -242,9 +169,10 @@ def find_best(dblist, query, remainder): f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \ / sum_abunds - intersect_abunds = list(sorted(orig_abunds[k] for k in intersect_mins)) + # calculate stats on abundances, if desired. average_abund, median_abund, std_abund = 0, 0, 0 - if orig_query.minhash.track_abundance and not ignore_abundance: + if track_abundance: + intersect_abunds = list((orig_abunds[k] for k in intersect_mins)) average_abund = np.mean(intersect_abunds) median_abund = np.median(intersect_abunds) std_abund = np.std(intersect_abunds) @@ -259,14 +187,15 @@ def find_best(dblist, query, remainder): median_abund=median_abund, std_abund=std_abund, filename=filename, - md5=best_leaf.md5sum(), - name=best_leaf.name(), - leaf=best_leaf) + md5=best_match.md5sum(), + name=best_match.name(), + match=best_match) - # construct a new query, minus the previous one. - query = build_new_query(found_mins, orig_query, cmp_scaled) - query_mins -= set(found_mins) + # construct a new query, subtracting hashes found in previous one. + query = _subtract_and_downsample(found_mins, query, cmp_scaled) + # compute weighted_missed: + query_mins -= set(found_mins) weighted_missed = sum((orig_abunds[k] for k in query_mins)) \ / sum_abunds diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 5fdb92ea93..542559a9a5 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -7,6 +7,7 @@ from . 
import signature from .logging import notify, error +from .index import LinearIndex from . import signature as sig from .sbt import SBT from .sbtmh import SigLeaf @@ -315,12 +316,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): ksize=query_ksize, select_moltype=query_moltype) siglist = filter_compatible_signatures(query, siglist, 1) - siglist = list(siglist) - databases.append((siglist, sbt_or_sigfile, False)) - notify('loaded {} signatures from {}', len(siglist), + linear = LinearIndex(siglist, filename=sigfile) + databases.append((linear, sbt_or_sigfile, False)) + notify('loaded {} signatures from {}', len(linear), sigfile, end='\r') - n_signatures += len(siglist) - except Exception: # ignore errors with traverse + n_signatures += len(linear) + except Exception: # ignore errors with traverse pass # done! jump to beginning of main 'for' loop @@ -351,7 +352,6 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): assert query_ksize == lca_db.ksize query_scaled = query.minhash.scaled - assert query_scaled and query_scaled <= lca_db.scaled notify('loaded LCA {}', sbt_or_sigfile, end='\r') n_databases += 1 @@ -373,12 +373,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): raise ValueError siglist = filter_compatible_signatures(query, siglist, False) - siglist = list(siglist) + linear = LinearIndex(siglist, filename=sbt_or_sigfile) + databases.append((linear, sbt_or_sigfile, 'signature')) - databases.append((siglist, sbt_or_sigfile, 'signature')) - notify('loaded {} signatures from {}', len(siglist), + notify('loaded {} signatures from {}', len(linear), sbt_or_sigfile, end='\r') - n_signatures += len(siglist) + n_signatures += len(linear) except (EnvironmentError, ValueError): error("\nCannot open file '{}'", sbt_or_sigfile) sys.exit(-1) diff --git a/tests/test_index.py b/tests/test_index.py new file mode 100644 index 0000000000..cfcc5c976b --- /dev/null +++ 
b/tests/test_index.py @@ -0,0 +1,217 @@ +from __future__ import print_function, unicode_literals + +import os +import sourmash +from sourmash.index import LinearIndex +from sourmash_lib.sbt import SBT, GraphFactory, Leaf +from . import sourmash_tst_utils as utils + + +def test_simple_index(n_children): + factory = GraphFactory(5, 100, 3) + root = SBT(factory, d=n_children) + + leaf1 = Leaf("a", factory()) + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + leaf1.data.count("AAAAC") + + leaf2 = Leaf("b", factory()) + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") + + leaf3 = Leaf("c", factory()) + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") + + leaf4 = Leaf("d", factory()) + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") + + leaf5 = Leaf("e", factory()) + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") + + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) + + def search_kmer(obj, seq): + return obj.data.get(seq) + + kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"] + + linear = LinearIndex() + linear.insert(leaf1) + linear.insert(leaf2) + linear.insert(leaf3) + linear.insert(leaf4) + linear.insert(leaf5) + + for kmer in kmers: + assert set(root.find(search_kmer, kmer)) == set(linear.find(search_kmer, kmer)) + + print("-----") + print([x.metadata for x in root.find(search_kmer, "AAAAA")]) + print([x.metadata for x in root.find(search_kmer, "AAAAT")]) + print([x.metadata for x in root.find(search_kmer, "AAAAG")]) + print([x.metadata for x in root.find(search_kmer, "CAAAA")]) + print([x.metadata for x in root.find(search_kmer, "GAAAA")]) + + +def test_linear_index_search(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 
= sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) + + # now, search for sig2 + sr = lidx.search(ss2, threshold=1.0) + print([s[1].name() for s in sr]) + assert len(sr) == 1 + assert sr[0][1] == ss2 + + # search for sig47 with lower threshold; search order not guaranteed. + sr = lidx.search(ss47, threshold=0.1) + print([s[1].name() for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss47 + assert sr[1][1] == ss63 + + # search for sig63 with lower threshold; search order not guaranteed. + sr = lidx.search(ss63, threshold=0.1) + print([s[1].name() for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + assert sr[1][1] == ss47 + + # search for sig63 with high threshold => 1 match + sr = lidx.search(ss63, threshold=0.8) + print([s[1].name for s in sr]) + assert len(sr) == 1 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + + +def test_linear_index_gather(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) + + matches = lidx.gather(ss2) + assert len(matches) == 1 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss2 + + matches = lidx.gather(ss47) + assert len(matches) == 2 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss47 + assert round(matches[1][0], 2) == 0.49 + assert matches[1][1] == ss63 + + +def test_linear_index_save(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = 
sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + + from sourmash import load_signatures + si = set(load_signatures(filename)) + + x = {ss2, ss47, ss63} + + print(len(si)) + print(len(x)) + + print(si) + print(x) + + assert si == x, si + + +def test_linear_index_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + with utils.TempDirectory() as location: + from sourmash import save_signatures + + filename = os.path.join(location, 'foo') + with open(filename, 'wt') as fp: + sourmash.save_signatures([ss2, ss47, ss63], fp) + + linear = LinearIndex.load(filename) + + x = {ss2, ss47, ss63} + assert set(linear.signatures()) == x, linear.signatures + assert linear.filename == filename + + +def test_linear_index_save_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + linear2 = LinearIndex.load(filename) + + # now, search for sig2 + sr = linear2.search(ss2, threshold=1.0) + print([s[1].name() for s in sr]) + assert len(sr) == 1 + assert sr[0][1] == ss2 diff --git a/tests/test_lca.py b/tests/test_lca.py index 84febe5928..5e0b17bce7 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -133,6 +133,84 @@ def test_db_repr(): assert repr(db) 
== "LCA_Database('{}')".format(filename) +def test_lca_index_signatures_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + siglist = list(db.signatures()) + assert len(siglist) == 2 + + +def test_lca_index_insert_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.insert(sig) + + +def test_lca_index_find_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.find(None) + + +def test_search_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + results = db.search(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_search_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + with pytest.raises(ValueError) as e: + results = db.search(sig, threshold=.01, ignore_abundance=True) + + +def test_gather_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + 
results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_gather_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + match_sig.minhash = match_sig.minhash.downsample_scaled(100000) + assert sig.minhash == match_sig.minhash + + ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 69824caab9..3f2dfd51c5 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,12 +4,12 @@ import pytest -from sourmash import signature +from sourmash import load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, - search_minhashes_containment) + search_minhashes_containment) from sourmash.sbt_storage import (FSStorage, TarStorage, - RedisStorage, IPFSStorage) + RedisStorage, IPFSStorage) from . 
import sourmash_tst_utils as utils @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,7 +211,7 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,7 +272,7 @@ def test_search_minhashes(): n_leaves = 0 for f in 
utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) @@ -295,7 +295,7 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): tree.add_node(leaf) @@ -323,7 +323,7 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) if n_leaves < 4: @@ -341,8 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(signature.load_signatures( - utils.get_test_data(utils.SIG_FILES[0]))) + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} tree_result = {str(s) for s in tree.find(search_minhashes, @@ -360,7 +359,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.add_node(leaf) + tree_1.add_node(SigLeaf(to_search.name(), to_search)) assert tree_1.next_node == next_empty @@ -370,7 +369,8 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -403,7 +403,8 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -439,7 +440,8 @@ 
def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -477,7 +479,8 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -516,7 +519,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -527,12 +530,12 @@ def test_tree_repair(): assert len(results_repair) == 2 -def test_tree_repair_add_node(): +def test_tree_repair_insert(): tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'), leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree_repair.add_node(leaf) @@ -552,7 +555,7 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -586,3 +589,21 @@ def test_save_sparseness(n_children): # Leaf nodes can't have children if isinstance(node, Leaf): assert all(c.node is None for c in tree_loaded.children(pos)) + + +def test_sbt_as_index_signatures(): + # test 'signatures' method from Index base class. 
+ factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + xx = list(tree.signatures()) + assert len(xx) == 2 + + assert sig47 in xx + assert sig63 in xx From 93b9b90692c5beb371f80bc6d125b3976567e112 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 16 Dec 2019 01:40:48 +0000 Subject: [PATCH 04/10] Indexing in Rust (#773) * working on keeping Index trait close to Index abc add coverage CI job initial index cmd impl add batch_insert method use needletail for fastx parsing fix finch feature lint clippy fix benchmarks comment out code that depends on unreleased crates... another ci check, can we publish it\? add windows, macos, beta and nightly tests tests move ocf to another repo needletail 0.3.2 update for finch 0.3.0 fix warnings initial search test oracle tests for search install sourmash for oracle tests keep nodegraph working on wasm32 use only hash_functions internally niffler instead of ocf test all features for coverage compatibility with sourmash-py SBT note about md5sum and filenames move tests into specific SBTs remove unwrap add note to nodegraph parsing update typed-builder fix nodegraph behavior to match khmer ignore roundtrip_sbt while figuring out heisenbug --- .github/actions-rs/grcov.yml | 7 + .github/workflows/rust.yml | 231 +++++++++++++++++++++ .gitignore | 10 +- .travis.yml | 20 -- Cargo.toml | 79 ++++---- Makefile | 5 +- benches/index.rs | 85 +++----- include/sourmash.h | 51 +++++ ocf/Cargo.toml | 18 -- ocf/src/lib.rs | 275 ------------------------- requirements.txt | 19 +- setup.py | 8 +- sourmash/sbtmh.py | 3 +- src/bin/smrs.rs | 157 ++++++++++----- src/bin/sourmash.yml | 19 ++ src/cmd.rs | 118 ++++++----- src/errors.rs | 13 +- src/ffi/minhash.rs | 96 +++++++-- src/ffi/mod.rs | 5 + src/ffi/nodegraph.rs | 181 +++++++++++++++++ src/ffi/signature.rs | 18 +- src/{ 
=> ffi}/utils.rs | 4 +- src/from.rs | 55 +++-- src/index/bigsi.rs | 72 +++---- src/index/linear.rs | 95 +++++---- src/index/mod.rs | 129 +++++++++--- src/index/sbt/mhbt.rs | 295 +++++++++++++++++++++++++-- src/index/sbt/mhmt.rs | 122 +++++++++-- src/index/sbt/mod.rs | 381 ++++++++++++----------------------- src/index/sbt/ukhs.rs | 24 +-- src/index/storage.rs | 4 +- src/lib.rs | 3 - src/signature.rs | 62 +++--- src/sketch/minhash.rs | 182 ++++++++++------- src/sketch/mod.rs | 3 +- src/sketch/nodegraph.rs | 93 +++++++-- src/sketch/ukhs.rs | 202 +++++++++++-------- src/wasm.rs | 21 +- tests/minhash.rs | 18 +- tests/smrs_cmd.rs | 139 +++++++++++++ tox.ini | 18 +- 41 files changed, 2118 insertions(+), 1222 deletions(-) create mode 100644 .github/actions-rs/grcov.yml create mode 100644 .github/workflows/rust.yml delete mode 100644 ocf/Cargo.toml delete mode 100644 ocf/src/lib.rs create mode 100644 src/ffi/nodegraph.rs rename src/{ => ffi}/utils.rs (97%) create mode 100644 tests/smrs_cmd.rs diff --git a/.github/actions-rs/grcov.yml b/.github/actions-rs/grcov.yml new file mode 100644 index 0000000000..d8822ccdb7 --- /dev/null +++ b/.github/actions-rs/grcov.yml @@ -0,0 +1,7 @@ +branch: true +ignore-not-existing: true +llvm: true +filter: covered +output-type: lcov +output-file: ./lcov.info +prefix-dir: /home/user/build/ diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000000..2972ac0b8a --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,231 @@ +name: Rust checks + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + check: + name: Check + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v1 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run cargo check + uses: actions-rs/cargo@v1 + with: + command: check + + test: + runs-on: ${{ matrix.os }} + strategy: + 
fail-fast: false + matrix: + build: [beta, stable, windows, macos] + include: + - build: macos + os: macos-latest + rust: stable + - build: windows + os: windows-latest + rust: stable + - build: beta + os: ubuntu-latest + rust: beta + - build: stable + os: ubuntu-latest + rust: stable + steps: + - uses: actions/checkout@v1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + override: true + + - name: Set up Python 3.8 + if: matrix.os != 'windows-latest' + uses: actions/setup-python@v1 + with: + python-version: "3.8" + + - name: Install dependencies + if: matrix.os != 'windows-latest' + run: | + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Run tests + uses: actions-rs/cargo@v1 + with: + command: test + args: --no-fail-fast + + test_all_features: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Run tests + uses: actions-rs/cargo@v1 + with: + command: test + args: --no-fail-fast --all --all-features + + coverage: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + override: true + + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . 
+ + - name: Run tests + uses: actions-rs/cargo@v1 + with: + command: test + args: --no-fail-fast --all --all-features + env: + 'CARGO_INCREMENTAL': '0' + 'RUSTFLAGS': '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Zno-landing-pads' + + - name: Collect coverage and generate report with grcov + uses: actions-rs/grcov@v0.1.4 + id: coverage + + - name: Upload coverage to codecov + uses: codecov/codecov-action@v1.0.3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage.outputs.report }} + + lints: + name: Lints + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v1 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + components: rustfmt, clippy + + - name: Run cargo fmt + uses: actions-rs/cargo@v1 + with: + command: fmt + args: --all -- --check + + - name: Run cargo clippy + uses: actions-rs/cargo@v1 + with: + command: clippy + args: -- -D warnings + + wasm-pack: + name: Check if wasm-pack builds a valid package + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + target: wasm32-unknown-unknown + - uses: actions-rs/cargo@v1 + with: + command: install + args: --force wasm-pack + - name: run wasm-pack + run: wasm-pack build + + wasm32-wasi: + name: Run tests under wasm32-wasi + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + - name: Install wasm32-wasi target + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + target: wasm32-wasi + - name: Install wasmtime + run: "curl https://wasmtime.dev/install.sh -sSf | bash" + - name: Add wasmtime to PATH + run: echo "::add-path::$HOME/.wasmtime/bin" + - name: Install cargo-wasi command + uses: actions-rs/cargo@v1 + with: + command: install + args: --force cargo-wasi + - name: Build code with cargo-wasi + uses: actions-rs/cargo@v1 + with: + command: 
wasi + args: build + - name: Run tests under wasm32-wasi + uses: actions-rs/cargo@v1 + continue-on-error: true ## TODO: remove this when tests work... + with: + command: wasi + args: test + + publish: + name: Publish (dry-run) + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v1 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Make sure we can publish the crate + uses: actions-rs/cargo@v1 + with: + command: publish + args: --dry-run diff --git a/.gitignore b/.gitignore index db06c2c4ae..0aa0e7a6f1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,12 @@ sourmash/_minhash.cpp .pytest_cache .python-version sourmash/version.py -*.DS_Store \ No newline at end of file +*.DS_Store +.tox +sourmash/_lowlevel*.py +.env +Pipfile +Pipfile.lock +ocf/target/ +target/ +Cargo.lock diff --git a/.travis.yml b/.travis.yml index a971a870f1..937b4b6c0b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,26 +48,6 @@ jobs: python: 3.6 - <<: *test python: 3.5 - - <<: *test - name: wasm-pack - language: rust - rust: stable - before_script: skip - install: skip - script: - - rustup target add wasm32-unknown-unknown - - cargo install --force wasm-pack - - wasm-pack build - - <<: *test - name: wasi target - language: rust - rust: stable - before_script: skip - install: skip - script: - - rustup target add wasm32-wasi - - cargo install --force cargo-wasi - - cargo wasi build - &wheel stage: build wheel and send to github releases diff --git a/Cargo.toml b/Cargo.toml index 7c41ba6a2c..f188cc1ad4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,9 @@ keywords = ["minhash", "bioinformatics"] categories = ["science", "algorithms", "data-structures"] license = "BSD-3-Clause" edition = "2018" +default-run = "smrs" +autoexamples = false +autobins = false [lib] name = "sourmash" @@ -18,7 +21,7 @@ bench = false lto=true [features] -from-finch = ["finch", "needletail"] 
+from-finch = ["finch"] [workspace] @@ -26,53 +29,59 @@ from-finch = ["finch", "needletail"] #cbindgen = "~0.6.7" [dependencies] -byteorder = "^1.2" -cfg-if = "0.1" -clap = { version = "~2.32", features = ["yaml"] } -env_logger = "0.6.0" +byteorder = "1.3.2" +cfg-if = "0.1.10" +clap = { version = "2.33.0", features = ["yaml"] } +env_logger = "0.7.1" exitfailure = "0.5.1" -failure = "0.1.3" -failure_derive = "0.1.3" -finch = { version = "~0.1.6", optional = true } -fixedbitset = "^0.1.9" -human-panic = "1.0.1" -lazy_static = "1.0.0" +failure = "0.1.6" +failure_derive = "0.1.6" +finch = { version = "0.3.0", optional = true } +fixedbitset = "0.2.0" +lazy_static = "1.4.0" lazy-init = "0.3.0" -log = "0.4.0" -md5 = "0.6.0" -murmurhash3 = "~0.0.5" -needletail = { version = "~0.2.1", optional = true } -serde = "1.0" -serde_derive = "~1.0.58" -serde_json = "1.0.2" -ukhs = { git = "https://github.com/luizirber/ukhs", branch = "feature/alternative_backends", features = ["boomphf_mphf"], default-features = false} -bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" } +log = "0.4.8" +md5 = "0.7.0" +murmurhash3 = "0.0.5" +serde = "1.0.103" +serde_derive = "1.0.103" +serde_json = "1.0.44" +#ukhs = { git = "https://github.com/luizirber/ukhs", branch = "feature/alternative_backends", features = ["boomphf_mphf"], default-features = false} primal = "0.2.3" -pdatastructs = { git = "https://github.com/luizirber/pdatastructs.rs", branch = "succinct_wasm" } -itertools = "0.8.0" -typed-builder = "0.3.0" -csv = "1.0.7" +#pdatastructs = { git = "https://github.com/luizirber/pdatastructs.rs", branch = "succinct_wasm" } +itertools = "0.8.2" +typed-builder = "0.4.0" +csv = "1.1.1" tempfile = "3.1.0" +[dependencies.needletail] +version = "0.3.2" +default-features = false +#features = ["compression"] + [target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen] -version = "^0.2" +version = "0.2.55" features = 
["serde-serialize"] -[target.'cfg(not(all(target_arch = "wasm32", target_vendor="unknown")))'.dependencies.ocf] -version = "0.1" -path = "ocf" +[target.'cfg(not(all(target_arch = "wasm32", target_vendor="unknown")))'.dependencies.niffler] +version = "1.0" default-features = false -[target.'cfg(not(target_arch = "wasm32"))'.dependencies.mqf] -version = "1.0.0" +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +proptest = "0.9.4" [dev-dependencies] -proptest = "^0.8" -criterion = "^0.2" -rand = "^0.5" -tempfile = "3" -assert_matches = "1.2" +criterion = "0.3.0" +rand = "0.7.2" +tempfile = "3.1.0" +assert_matches = "1.3.0" +assert_cmd = "0.12.0" +predicates = "1.0.2" [[bench]] name = "index" harness = false + +[[bin]] +name = "smrs" +path = "src/bin/smrs.rs" diff --git a/Makefile b/Makefile index 482f497302..8bb87d29dd 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ all: clean: $(PYTHON) setup.py clean --all + rm -f sourmash/*.so cd doc && make clean install: all @@ -22,7 +23,7 @@ test: all doc: .PHONY cd doc && make html -include/sourmash.h: src/lib.rs src/ffi/minhash.rs src/ffi/signature.rs src/errors.rs +include/sourmash.h: src/lib.rs src/ffi/minhash.rs src/ffi/signature.rs src/ffi/nodegraph.rs src/errors.rs rustup override set nightly RUST_BACKTRACE=1 cbindgen --clean -c cbindgen.toml -o $@ rustup override set stable @@ -33,7 +34,7 @@ coverage: all $(PYTHON) -m pytest --cov=. 
--cov-report term-missing benchmark: - asv continuous master $(git rev-parse HEAD) + asv continuous master `git rev-parse HEAD` check: cargo build diff --git a/benches/index.rs b/benches/index.rs index 2ea53a5c72..e2f48a724f 100644 --- a/benches/index.rs +++ b/benches/index.rs @@ -6,9 +6,8 @@ use std::path::PathBuf; use criterion::{Bencher, Criterion, Fun}; use sourmash::index::bigsi::BIGSI; use sourmash::index::linear::LinearIndex; -use sourmash::index::storage::ReadData; +use sourmash::index::Index; use sourmash::index::MHBT; -use sourmash::index::{Dataset, Index}; use sourmash::signature::Signature; fn find_small_bench(c: &mut Criterion) { @@ -17,39 +16,30 @@ fn find_small_bench(c: &mut Criterion) { let sbt = MHBT::from_path(filename).expect("Loading error"); - let leaf: Dataset = (*sbt.datasets().first().unwrap()).clone(); + let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.datasets() { - linear.insert(l); + for l in sbt.signatures() { + linear.insert(l).unwrap(); } let mut bigsi = BIGSI::new(10000, 10); - for l in &sbt.datasets() { - let data = l.data().unwrap(); - bigsi.insert(data); + for l in sbt.signatures() { + bigsi.insert(l).unwrap(); } - let sbt_find = Fun::new( - "sbt_search", - move |b: &mut Bencher, leaf: &Dataset| b.iter(|| sbt.search(leaf, 0.1, false)), - ); - - let linear_find = Fun::new( - "linear_search", - move |b: &mut Bencher, leaf: &Dataset| { - b.iter(|| linear.search(leaf, 0.1, false)) - }, - ); - - let bigsi_find = Fun::new( - "bigsi_search", - move |b: &mut Bencher, leaf: &Dataset| { - let data = leaf.data().unwrap(); - b.iter(|| bigsi.search(data, 0.1, false)) - }, - ); + let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| sbt.search(leaf, 0.1, false)) + }); + + let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| linear.search(leaf, 0.1, false)) 
+ }); + + let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| bigsi.search(leaf, 0.1, false)) + }); let functions = vec![sbt_find, linear_find, bigsi_find]; c.bench_functions("search_small", functions, leaf); @@ -61,38 +51,29 @@ fn find_subset_bench(c: &mut Criterion) { let sbt = MHBT::from_path(filename).expect("Loading error"); - let leaf: Dataset = (*sbt.datasets().first().unwrap()).clone(); + let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.datasets() { - linear.insert(l); + for l in sbt.signatures() { + linear.insert(l).unwrap(); } let mut bigsi = BIGSI::new(10000, 10); - for l in &sbt.datasets() { - let data = l.data().unwrap(); - bigsi.insert(data); + for l in sbt.signatures() { + bigsi.insert(l).unwrap(); } - let sbt_find = Fun::new( - "sbt_search", - move |b: &mut Bencher, leaf: &Dataset| b.iter(|| sbt.search(leaf, 0.1, false)), - ); - - let linear_find = Fun::new( - "linear_search", - move |b: &mut Bencher, leaf: &Dataset| { - b.iter(|| linear.search(leaf, 0.1, false)) - }, - ); - - let bigsi_find = Fun::new( - "bigsi_search", - move |b: &mut Bencher, leaf: &Dataset| { - let data = leaf.data().unwrap(); - b.iter(|| bigsi.search(data, 0.1, false)) - }, - ); + let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| sbt.search(leaf, 0.1, false)) + }); + + let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| linear.search(leaf, 0.1, false)) + }); + + let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { + b.iter(|| bigsi.search(leaf, 0.1, false)) + }); let functions = vec![sbt_find, linear_find, bigsi_find]; c.bench_functions("search_subset", functions, leaf); diff --git a/include/sourmash.h b/include/sourmash.h index ed01e1cb3c..607b37e385 100644 --- a/include/sourmash.h +++ b/include/sourmash.h 
@@ -8,6 +8,14 @@ #include #include +enum HashFunctions { + HASH_FUNCTIONS_MURMUR64_DNA = 1, + HASH_FUNCTIONS_MURMUR64_PROTEIN = 2, + HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3, + HASH_FUNCTIONS_MURMUR64_HP = 4, +}; +typedef uint32_t HashFunctions; + enum SourmashErrorCode { SOURMASH_ERROR_CODE_NO_ERROR = 0, SOURMASH_ERROR_CODE_PANIC = 1, @@ -32,6 +40,8 @@ typedef uint32_t SourmashErrorCode; typedef struct KmerMinHash KmerMinHash; +typedef struct Nodegraph Nodegraph; + typedef struct Signature Signature; /** @@ -57,6 +67,8 @@ void kmerminhash_add_word(KmerMinHash *ptr, const char *word); double kmerminhash_compare(KmerMinHash *ptr, const KmerMinHash *other); +double kmerminhash_containment_ignore_maxhash(KmerMinHash *ptr, const KmerMinHash *other); + uint64_t kmerminhash_count_common(KmerMinHash *ptr, const KmerMinHash *other); bool kmerminhash_dayhoff(KmerMinHash *ptr); @@ -79,6 +91,12 @@ const uint64_t *kmerminhash_get_mins(KmerMinHash *ptr); uintptr_t kmerminhash_get_mins_size(KmerMinHash *ptr); +HashFunctions kmerminhash_hash_function(KmerMinHash *ptr); + +void kmerminhash_hash_function_set(KmerMinHash *ptr, HashFunctions hash_function); + +bool kmerminhash_hp(KmerMinHash *ptr); + uint64_t kmerminhash_intersection(KmerMinHash *ptr, const KmerMinHash *other); bool kmerminhash_is_protein(KmerMinHash *ptr); @@ -95,6 +113,7 @@ KmerMinHash *kmerminhash_new(uint32_t n, uint32_t k, bool prot, bool dayhoff, + bool hp, uint64_t seed, uint64_t mx, bool track_abundance); @@ -109,6 +128,36 @@ uint64_t kmerminhash_seed(KmerMinHash *ptr); bool kmerminhash_track_abundance(KmerMinHash *ptr); +bool nodegraph_count(Nodegraph *ptr, uint64_t h); + +double nodegraph_expected_collisions(Nodegraph *ptr); + +void nodegraph_free(Nodegraph *ptr); + +Nodegraph *nodegraph_from_buffer(const char *ptr, uintptr_t insize); + +Nodegraph *nodegraph_from_path(const char *filename); + +uintptr_t nodegraph_get(Nodegraph *ptr, uint64_t h); + +uintptr_t nodegraph_ksize(Nodegraph *ptr); + +uintptr_t 
nodegraph_matches(Nodegraph *ptr, KmerMinHash *mh_ptr); + +Nodegraph *nodegraph_new(void); + +uintptr_t nodegraph_noccupied(Nodegraph *ptr); + +uintptr_t nodegraph_ntables(Nodegraph *ptr); + +void nodegraph_save(Nodegraph *ptr, const char *filename); + +uintptr_t nodegraph_tablesize(Nodegraph *ptr); + +void nodegraph_update(Nodegraph *ptr, Nodegraph *optr); + +Nodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); + bool signature_eq(Signature *ptr, Signature *other); KmerMinHash *signature_first_mh(Signature *ptr); @@ -152,6 +201,8 @@ SourmashStr signatures_save_buffer(Signature **ptr, uintptr_t size); char sourmash_aa_to_dayhoff(char aa); +char sourmash_aa_to_hp(char aa); + /** * Clears the last error. */ diff --git a/ocf/Cargo.toml b/ocf/Cargo.toml deleted file mode 100644 index 1752b25b6d..0000000000 --- a/ocf/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "ocf" -version = "0.1.0" -authors = ["Luiz Irber "] -edition = "2018" - -[features] -default = ["bzip2", "xz2"] -bz2 = ["bzip2"] -lzma = ["xz2"] - -[dependencies] -bzip2 = { version = "0.3.3", optional = true } -cfg-if = "0.1" -failure = "0.1.3" -flate2 = "1.0" -enum_primitive = "0.1.1" -xz2 = { version = "0.1", optional = true } diff --git a/ocf/src/lib.rs b/ocf/src/lib.rs deleted file mode 100644 index f9662244fc..0000000000 --- a/ocf/src/lib.rs +++ /dev/null @@ -1,275 +0,0 @@ -/* -Copyright (c) 2018 Pierre Marijon - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial 
portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -Originally from https://github.com/natir/yacrd/blob/3fc6ef8b5b51256f0c4bc45b8056167acf34fa58/src/file.rs -Changes: - - make bzip2 and lzma support optional -*/ - -/* crates use */ -use cfg_if::cfg_if; -use enum_primitive::{ - enum_from_primitive, enum_from_primitive_impl, enum_from_primitive_impl_ty, FromPrimitive, -}; -use failure::{Error, Fail}; -use flate2; - -/* standard use */ -use std::fs::File; -use std::io; -use std::io::{BufReader, BufWriter}; - -enum_from_primitive! { - #[repr(u64)] - #[derive(Debug, PartialEq)] - pub enum CompressionFormat { - Gzip = 0x1F8B, - Bzip = 0x425A, - Lzma = 0xFD377A585A, - No, - } -} - -#[derive(Debug, Fail)] -pub enum OCFError { - #[fail(display = "Feature disabled, enabled it during compilation")] - FeatureDisabled, -} - -pub fn get_input(input_name: &str) -> Result<(Box, CompressionFormat), Error> { - // choose std::io::stdin or open file - if input_name == "-" { - Ok((Box::new(get_readable(input_name)), CompressionFormat::No)) - } else { - get_readable_file(input_name) - } -} - -pub fn get_readable_file( - input_name: &str, -) -> Result<(Box, CompressionFormat), Error> { - let raw_input = get_readable(input_name); - - // check compression - let compression = get_compression(raw_input); - - // return readable and compression status - match compression { - CompressionFormat::Gzip => Ok(( - Box::new(flate2::read::GzDecoder::new(get_readable(input_name))), - CompressionFormat::Gzip, - )), - CompressionFormat::Bzip => 
new_bz2_decoder(get_readable(input_name)), - CompressionFormat::Lzma => new_lzma_decoder(get_readable(input_name)), - CompressionFormat::No => Ok((Box::new(get_readable(input_name)), CompressionFormat::No)), - } -} - -pub fn get_readable(input_name: &str) -> Box { - match input_name { - "-" => Box::new(BufReader::new(io::stdin())), - _ => Box::new(BufReader::new( - File::open(input_name) - .unwrap_or_else(|_| panic!("Can't open input file {}", input_name)), - )), - } -} - -fn get_compression(mut in_stream: Box) -> CompressionFormat { - let mut buf = vec![0u8; 5]; - - in_stream - .read_exact(&mut buf) - .expect("Error durring reading first bit of file"); - - let mut five_bit_val: u64 = 0; - for (i, item) in buf.iter().enumerate().take(5) { - five_bit_val |= (u64::from(*item)) << (8 * (4 - i)); - } - if CompressionFormat::from_u64(five_bit_val) == Some(CompressionFormat::Lzma) { - return CompressionFormat::Lzma; - } - - let mut two_bit_val: u64 = 0; - for (i, item) in buf.iter().enumerate().take(2) { - two_bit_val |= (u64::from(*item)) << (8 * (1 - i)); - } - - match CompressionFormat::from_u64(two_bit_val) { - e @ Some(CompressionFormat::Gzip) | e @ Some(CompressionFormat::Bzip) => e.unwrap(), - _ => CompressionFormat::No, - } -} - -cfg_if! { - if #[cfg(feature = "bz2")] { - use bzip2; - - fn new_bz2_encoder(out: Box) -> Result, Error> { - Ok(Box::new(bzip2::write::BzEncoder::new( - out, - bzip2::Compression::Best, - ))) - } - - fn new_bz2_decoder( - inp: Box, - ) -> Result<(Box, CompressionFormat), Error> { - use bzip2; - Ok(( - Box::new(bzip2::read::BzDecoder::new(inp)), - CompressionFormat::Bzip, - )) - } - } else { - fn new_bz2_encoder(_: Box) -> Result, Error> { - Err(OCFError::FeatureDisabled.into()) - } - - fn new_bz2_decoder(_: Box) -> Result<(Box, CompressionFormat), Error> { - Err(OCFError::FeatureDisabled.into()) - } - } -} - -cfg_if! 
{ - if #[cfg(feature = "lzma")] { - use xz2; - - fn new_lzma_encoder(out: Box) -> Result, Error> { - Ok(Box::new(xz2::write::XzEncoder::new(out, 9))) - } - - fn new_lzma_decoder( - inp: Box, - ) -> Result<(Box, CompressionFormat), Error> { - use xz2; - Ok(( - Box::new(xz2::read::XzDecoder::new(inp)), - CompressionFormat::Lzma, - )) - } - } else { - fn new_lzma_encoder(_: Box) -> Result, Error> { - Err(OCFError::FeatureDisabled.into()) - } - - fn new_lzma_decoder(_: Box) -> Result<(Box, CompressionFormat), Error> { - Err(OCFError::FeatureDisabled.into()) - } - } -} - -pub fn get_output( - output_name: &str, - format: CompressionFormat, -) -> Result, Error> { - match format { - CompressionFormat::Gzip => Ok(Box::new(flate2::write::GzEncoder::new( - get_writable(output_name), - flate2::Compression::best(), - ))), - CompressionFormat::Bzip => new_bz2_encoder(get_writable(output_name)), - CompressionFormat::Lzma => new_lzma_encoder(get_writable(output_name)), - CompressionFormat::No => Ok(Box::new(get_writable(output_name))), - } -} - -pub fn choose_compression( - input_compression: CompressionFormat, - compression_set: bool, - compression_value: &str, -) -> CompressionFormat { - if !compression_set { - return input_compression; - } - - match compression_value { - "gzip" => CompressionFormat::Gzip, - "bzip2" => CompressionFormat::Bzip, - "lzma" => CompressionFormat::Lzma, - _ => CompressionFormat::No, - } -} - -fn get_writable(output_name: &str) -> Box { - match output_name { - "-" => Box::new(BufWriter::new(io::stdout())), - _ => Box::new(BufWriter::new( - File::create(output_name) - .unwrap_or_else(|_| panic!("Can't open output file {}", output_name)), - )), - } -} - -#[cfg(test)] -mod test { - - use super::*; - - const GZIP_FILE: &'static [u8] = &[0o037, 0o213, 0o0, 0o0, 0o0]; - const BZIP_FILE: &'static [u8] = &[0o102, 0o132, 0o0, 0o0, 0o0]; - const LZMA_FILE: &'static [u8] = &[0o375, 0o067, 0o172, 0o130, 0o132]; - - #[test] - fn compression_from_file() { - 
assert_eq!( - get_compression(Box::new(GZIP_FILE)), - CompressionFormat::Gzip - ); - assert_eq!( - get_compression(Box::new(BZIP_FILE)), - CompressionFormat::Bzip - ); - assert_eq!( - get_compression(Box::new(LZMA_FILE)), - CompressionFormat::Lzma - ); - } - - #[test] - fn compression_from_input_or_cli() { - assert_eq!( - choose_compression(CompressionFormat::Gzip, false, "_"), - CompressionFormat::Gzip - ); - assert_eq!( - choose_compression(CompressionFormat::Bzip, false, "_"), - CompressionFormat::Bzip - ); - assert_eq!( - choose_compression(CompressionFormat::Lzma, false, "_"), - CompressionFormat::Lzma - ); - assert_eq!( - choose_compression(CompressionFormat::No, true, "gzip"), - CompressionFormat::Gzip - ); - assert_eq!( - choose_compression(CompressionFormat::No, true, "bzip2"), - CompressionFormat::Bzip - ); - assert_eq!( - choose_compression(CompressionFormat::No, true, "lzma"), - CompressionFormat::Lzma - ); - } -} diff --git a/requirements.txt b/requirements.txt index 7def5908b1..5be6ef7bce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,4 @@ -pytest -screed -numpy -matplotlib -scipy -Cython -khmer>=2.1,<3 -sphinx -alabaster -recommonmark -sphinxcontrib-napoleon -setuptools_scm -setuptools_scm_git_archive -nbsphinx -bam2fasta +-e .[test] +-e .[doc] +-e .[10x] +-e .[storage] diff --git a/setup.py b/setup.py index 6b43497c4a..1f1720821d 100644 --- a/setup.py +++ b/setup.py @@ -70,10 +70,12 @@ 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, "extras_require": { - 'test' : ['pytest', 'pytest-cov', 'recommonmark'], + 'test' : ['pytest', 'pytest-cov'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], - 'doc' : ['sphinx'], - '10x': ['bam2fasta==1.0.1'] + 'doc' : ['sphinx', 'recommonmark', 'alabaster', + "sphinxcontrib-napoleon", "nbsphinx"], + '10x': ['bam2fasta==1.0.1'], + 'storage': ["ipfshttpclient", "redis"] }, "include_package_data": True, "package_data": { diff --git 
a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 44067d8965..2289e0cb94 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -57,8 +57,7 @@ def update(self, parent): for v in self.data.minhash.get_mins(): parent.data.count(v) min_n_below = parent.metadata.get('min_n_below', sys.maxsize) - min_n_below = min(len(self.data.minhash.get_mins()), - min_n_below) + min_n_below = min(len(self.data.minhash), min_n_below) if min_n_below == 0: min_n_below = 1 diff --git a/src/bin/smrs.rs b/src/bin/smrs.rs index d0ea0b0847..b58f2c44ee 100644 --- a/src/bin/smrs.rs +++ b/src/bin/smrs.rs @@ -6,25 +6,82 @@ use std::rc::Rc; use clap::{load_yaml, App}; use exitfailure::ExitFailure; use failure::Error; -//use human_panic::setup_panic; -use lazy_init::Lazy; use log::{info, LevelFilter}; -use ocf::{get_output, CompressionFormat}; +use niffler::{get_output, CompressionFormat}; use serde::ser::SerializeStruct; use serde::{Serialize, Serializer}; -use sourmash::cmd::{ - count_unique, draff_compare, draff_index, draff_search, draff_signature, prepare, -}; +/* FIXME bring back after succint-rs changes +use sourmash::cmd::{count_unique, draff_compare, draff_search, draff_signature, prepare}; +*/ +use sourmash::cmd::prepare; + use sourmash::index::linear::LinearIndex; use sourmash::index::sbt::scaffold; use sourmash::index::search::{ search_minhashes, search_minhashes_containment, search_minhashes_find_best, }; -use sourmash::index::{Comparable, Dataset, Index, MHBT}; +use sourmash::index::storage::{FSStorage, Storage}; +use sourmash::index::{Comparable, Index, MHBT}; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; +pub fn index( + sig_files: Vec<&str>, + storage: Rc, + outfile: &str, +) -> Result { + let mut index = MHBT::builder().storage(Rc::clone(&storage)).build(); + + for filename in sig_files { + // TODO: check for stdin? can also use get_input()? 
+ + let mut sig = Signature::from_path(filename)?; + + if sig.len() > 1 { + unimplemented!(); + }; + + index.insert(sig.pop().unwrap())?; + } + + // TODO: implement to_writer and use this? + //let mut output = get_output(outfile, CompressionFormat::No)?; + //index.to_writer(&mut output)? + + index.save_file(outfile, Some(storage))?; + + Ok(Indices::MHBT(index)) + + /* + let mut lindex = LinearIndex::::builder() + .storage(Rc::clone(&storage)) + .build(); + + for filename in sig_files { + // TODO: check for stdin? can also use get_input()? + + let mut sig = Signature::from_path(filename)?; + + if sig.len() > 1 { + unimplemented!(); + }; + + lindex.insert(sig.pop().unwrap())?; + } + + let mut index: MHBT = lindex.into(); + + // TODO: implement to_writer and use this? + //let mut output = get_output(outfile, CompressionFormat::No)?; + //index.to_writer(&mut output)? + + index.save_file(outfile, Some(storage))?; + + Ok(Indices::MHBT(index)) + */ +} + struct Query { data: T, } @@ -53,28 +110,19 @@ impl Query { } fn name(&self) -> String { - self.data.name().clone() + self.data.name() } } -impl From> for Dataset { - fn from(other: Query) -> Dataset { - let data = Lazy::new(); - data.get_or_create(|| other.data); - - Dataset::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build() +impl From> for Signature { + fn from(other: Query) -> Signature { + other.data } } fn load_query_signature( query: &str, - ksize: usize, + ksize: Option, moltype: Option<&str>, scaled: Option, ) -> Result, Error> { @@ -93,13 +141,13 @@ struct Database { path: String, } -enum Indices { +pub enum Indices { MHBT(MHBT), - LinearIndex(LinearIndex>), + LinearIndex(LinearIndex), } -impl Index for Database { - type Item = Dataset; +impl Index<'_> for Database { + type Item = Signature; fn find( &self, @@ -116,7 +164,7 @@ impl Index for Database { } } - fn insert(&mut self, node: &Self::Item) -> Result<(), Error> { + fn insert(&mut self, node: Self::Item) 
-> Result<(), Error> { match &mut self.data { Indices::MHBT(data) => data.insert(node), Indices::LinearIndex(data) => data.insert(node), @@ -134,10 +182,17 @@ impl Index for Database { unimplemented!(); } - fn datasets(&self) -> Vec { + fn signatures(&self) -> Vec { + match &self.data { + Indices::MHBT(data) => data.signatures(), + Indices::LinearIndex(data) => data.signatures(), + } + } + + fn signature_refs(&self) -> Vec<&Self::Item> { match &self.data { - Indices::MHBT(data) => data.datasets(), - Indices::LinearIndex(data) => data.datasets(), + Indices::MHBT(data) => data.signature_refs(), + Indices::LinearIndex(data) => data.signature_refs(), } } } @@ -170,7 +225,7 @@ fn load_sbts_and_sigs( info!("loaded SBT {}", path); n_databases += 1; continue; - } else if let Ok(data) = LinearIndex::>::from_path(path) { + } else if let Ok(data) = LinearIndex::::from_path(path) { // TODO: check compatible dbs.push(Database { data: Indices::LinearIndex(data), @@ -251,7 +306,7 @@ fn search_databases( if similarity >= threshold { results.push(Results { similarity, - match_sig: dataset.clone().into(), + match_sig: dataset.clone(), db: db.path.clone(), }) } @@ -263,7 +318,7 @@ fn search_databases( } fn main() -> Result<(), ExitFailure> { - //setup_panic!(); + //better_panic::install(); env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); @@ -271,6 +326,7 @@ fn main() -> Result<(), ExitFailure> { let m = App::from_yaml(yml).get_matches(); match m.subcommand_name() { + /* FIXME bring back after succint-rs changes Some("draff") => { let cmd = m.subcommand_matches("draff").unwrap(); let inputs = cmd @@ -300,6 +356,14 @@ fn main() -> Result<(), ExitFailure> { draff_compare(inputs)?; } + Some("count_unique") => { + let cmd = m.subcommand_matches("count_unique").unwrap(); + + let index: &str = cmd.value_of("index").unwrap(); + + count_unique(index)?; + } + */ Some("prepare") => { let cmd = m.subcommand_matches("prepare").unwrap(); let index: &str = 
cmd.value_of("index").unwrap(); @@ -311,29 +375,29 @@ fn main() -> Result<(), ExitFailure> { let inputs = cmd .values_of("inputs") .map(|vals| vals.collect::>()) - .unwrap(); + .expect("Missing inputs"); - let output: &str = cmd.value_of("output").unwrap(); + let output: &str = cmd.value_of("output").expect("Missing output"); + let (output, base) = if output.ends_with(".sbt.json") { + (output.to_owned(), output.trim_end_matches(".sbt.json")) + } else { + (output.to_owned() + ".sbt.json", output) + }; + + let storage: Rc = Rc::new(FSStorage::new(".", &format!(".sbt.{}", base))); - draff_index(inputs, output)?; + index(inputs, storage, &output)?; } Some("scaffold") => { let cmd = m.subcommand_matches("scaffold").unwrap(); let sbt_file = cmd.value_of("current_sbt").unwrap(); let sbt = MHBT::from_path(sbt_file)?; - let mut new_sbt: MHBT = scaffold(sbt.datasets(), sbt.storage()); + let mut new_sbt: MHBT = scaffold(sbt.leaves(), sbt.storage()); new_sbt.save_file("test", None)?; - assert_eq!(new_sbt.datasets().len(), sbt.datasets().len()); - } - Some("count_unique") => { - let cmd = m.subcommand_matches("count_unique").unwrap(); - - let index: &str = cmd.value_of("index").unwrap(); - - count_unique(index)?; + assert_eq!(new_sbt.leaves().len(), sbt.leaves().len()); } Some("search") => { let cmd = m.subcommand_matches("search").unwrap(); @@ -345,10 +409,9 @@ fn main() -> Result<(), ExitFailure> { let query = load_query_signature( cmd.value_of("query").unwrap(), if cmd.is_present("ksize") { - cmd.value_of("ksize").unwrap().parse().unwrap() + Some(cmd.value_of("ksize").unwrap().parse().unwrap()) } else { - // TODO default k - unimplemented!() + None }, Some("dna"), // TODO: select moltype, if cmd.is_present("scaled") { diff --git a/src/bin/sourmash.yml b/src/bin/sourmash.yml index 765358e2a1..ecc072e228 100644 --- a/src/bin/sourmash.yml +++ b/src/bin/sourmash.yml @@ -108,6 +108,25 @@ subcommands: args: - index: help: SBT index + - index: + about: create an index + settings: + 
- ArgRequiredElseHelp + args: + - ksize: + help: "k-mer size for which to build the SBT." + short: k + long: "ksize" + takes_value: true + required: false + - output: + help: alternative output file + short: o + takes_value: true + required: false + - inputs: + help: signatures + multiple: true # groups: # - protein: diff --git a/src/cmd.rs b/src/cmd.rs index 79538ff82d..c16ea49c3c 100644 --- a/src/cmd.rs +++ b/src/cmd.rs @@ -1,26 +1,29 @@ +use failure::Error; + +use crate::index::MHBT; + +/* FIXME: bring back after boomphf changes use std::path::{Path, PathBuf}; use std::rc::Rc; -use bio::io::fastx; -use failure::Error; use log::info; -use ocf::{get_input, get_output, CompressionFormat}; -use pdatastructs::hyperloglog::HyperLogLog; +use needletail::parse_sequence_path; +use crate::index::{Comparable, Index, MHBT}; use crate::index::linear::LinearIndex; use crate::index::storage::{FSStorage, Storage}; -use crate::index::{Comparable, Dataset, Index, UKHSTree, MHBT}; use crate::signature::{Signature, SigsTrait}; -use crate::sketch::ukhs::{FlatUKHS, UKHSTrait, UniqueUKHS}; use crate::sketch::Sketch; +use crate::index::{UKHSTree}; +use crate::sketch::ukhs::{FlatUKHS, UKHSTrait, UniqueUKHS}; pub fn draff_index(sig_files: Vec<&str>, outfile: &str) -> Result<(), Error> { let storage: Rc = Rc::new( - FSStorage::new(".".into(), ".draff".into()), // TODO: use outfile + FSStorage::new(".", ".draff"), // TODO: use outfile ); //let mut index = UKHSTree::builder().storage(Rc::clone(&storage)).build(); - let mut index = LinearIndex::>::builder() + let mut index = LinearIndex::::builder() .storage(Rc::clone(&storage)) .build(); @@ -40,9 +43,7 @@ pub fn draff_index(sig_files: Vec<&str>, outfile: &str) -> Result<(), Error> { .signatures(vec![Sketch::UKHS(ukhs_sig)]) .build(); - let dataset = sig.into(); - - index.insert(&dataset)?; + index.insert(sig)?; } // TODO: implement to_writer and use this? 
@@ -85,26 +86,13 @@ pub fn draff_search(index: &str, query: &str) -> Result<(), Error> { .signatures(vec![Sketch::UKHS(ukhs_sig)]) .build(); - let dataset = sig.into(); - - for found in index.search(&dataset, 0.9, false)? { - println!("{:.2}: {:?}", dataset.similarity(found), found); + for found in index.search(&sig, 0.9, false)? { + println!("{:.2}: {:?}", sig.similarity(found), found); } Ok(()) } -pub fn prepare(index_path: &str) -> Result<(), Error> { - let mut index = MHBT::from_path(index_path)?; - - // TODO equivalent to fill_internal in python - //unimplemented!(); - - index.save_file(index_path, None)?; - - Ok(()) -} - pub fn draff_signature(files: Vec<&str>, k: usize, w: usize) -> Result<(), Error> { for filename in files { // TODO: check for stdin? @@ -113,62 +101,68 @@ pub fn draff_signature(files: Vec<&str>, k: usize, w: usize) -> Result<(), Error info!("Build signature for {} with W={}, K={}...", filename, w, k); - let (input, _) = get_input(filename)?; - let reader = fastx::Reader::new(input); - - for record in reader.records() { - let record = record?; - - // if there is anything other than ACGT in sequence, - // it is replaced with A. - // This matches khmer and screed behavior - // - // NOTE: sourmash is different! It uses the force flag to drop - // k-mers that are not ACGT - let seq: Vec = record - .seq() - .iter() - .map(|&x| match x as char { - 'A' | 'C' | 'G' | 'T' => x, - 'a' | 'c' | 'g' | 't' => x.to_ascii_uppercase(), - _ => 'A' as u8, - }) - .collect(); - - ukhs.add_sequence(&seq, false)?; - } + parse_sequence_path( + filename, + |_| {}, + |record| { + // if there is anything other than ACGT in sequence, + // it is replaced with A. + // This matches khmer and screed behavior + // + // NOTE: sourmash is different! 
It uses the force flag to drop + // k-mers that are not ACGT + let seq: Vec = record + .seq + .iter() + .map(|&x| match x as char { + 'A' | 'C' | 'G' | 'T' => x, + 'a' | 'c' | 'g' | 't' => x.to_ascii_uppercase(), + _ => b'A', + }) + .collect(); + + ukhs.add_sequence(&seq, false) + .expect("Error adding sequence"); + }, + )?; let mut outfile = PathBuf::from(filename); outfile.set_extension("sig"); + /* let mut output = get_output(outfile.to_str().unwrap(), CompressionFormat::No)?; let flat: FlatUKHS = ukhs.into(); flat.to_writer(&mut output)? + */ } info!("Done."); Ok(()) } +*/ +/* FIXME bring back after succint-rs changes pub fn count_unique(index_path: &str) -> Result<(), Error> { let index = MHBT::from_path(index_path)?; info!("Loaded index: {}", index_path); - let mut hll = HyperLogLog::new(16); + let mut hll = pdatastructs::hyperloglog::HyperLogLog::new(16); let mut total_hashes = 0u64; - for (n, dataset) in index.datasets().iter().enumerate() { + for (n, sig) in index.signatures().iter().enumerate() { if n % 1000 == 0 { - info!("Processed {} datasets", n); + info!("Processed {} signatures", n); info!("Unique hashes in {}: {}", index_path, hll.count()); info!("Total hashes in {}: {}", index_path, total_hashes); }; - for hash in dataset.mins() { - hll.add(&hash); - total_hashes += 1; + if let Sketch::MinHash(mh) = &sig.signatures[0] { + for hash in mh.mins() { + hll.add(&hash); + total_hashes += 1; + } } } @@ -177,3 +171,15 @@ pub fn count_unique(index_path: &str) -> Result<(), Error> { Ok(()) } +*/ + +pub fn prepare(index_path: &str) -> Result<(), Error> { + let mut index = MHBT::from_path(index_path)?; + + // TODO equivalent to fill_internal in python + //unimplemented!(); + + index.save_file(index_path, None)?; + + Ok(()) +} diff --git a/src/errors.rs b/src/errors.rs index 23c95de049..a539deb895 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,4 +1,4 @@ -use failure::{Error, Fail}; +use failure::Fail; #[derive(Debug, Fail)] pub enum SourmashError { @@ 
-21,8 +21,8 @@ pub enum SourmashError { #[fail(display = "different signatures cannot be compared")] MismatchSignatureType, - #[fail(display = "Can only set track_abundance=True if the MinHash is empty")] - NonEmptyMinHash, + #[fail(display = "Can only set {} if the MinHash is empty", message)] + NonEmptyMinHash { message: String }, #[fail(display = "invalid DNA character in input k-mer: {}", message)] InvalidDNA { message: String }, @@ -64,10 +64,11 @@ pub enum SourmashErrorCode { SerdeError = 100_004, } +#[cfg(not(all(target_arch = "wasm32", target_vendor = "unknown")))] impl SourmashErrorCode { - pub fn from_error(error: &Error) -> SourmashErrorCode { + pub fn from_error(error: &failure::Error) -> SourmashErrorCode { for cause in error.iter_chain() { - use crate::utils::Panic; + use crate::ffi::utils::Panic; if cause.downcast_ref::().is_some() { return SourmashErrorCode::Panic; } @@ -82,7 +83,7 @@ impl SourmashErrorCode { SourmashError::MismatchSignatureType => { SourmashErrorCode::MismatchSignatureType } - SourmashError::NonEmptyMinHash => SourmashErrorCode::NonEmptyMinHash, + SourmashError::NonEmptyMinHash { .. } => SourmashErrorCode::NonEmptyMinHash, SourmashError::InvalidDNA { .. } => SourmashErrorCode::InvalidDNA, SourmashError::InvalidProt { .. } => SourmashErrorCode::InvalidProt, SourmashError::InvalidCodonLength { .. 
} => { diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs index 68aa5264db..2f84fab499 100644 --- a/src/ffi/minhash.rs +++ b/src/ffi/minhash.rs @@ -1,12 +1,13 @@ use std::ffi::CStr; -use std::mem; use std::os::raw::c_char; use std::ptr; use std::slice; use crate::errors::SourmashError; use crate::signature::SigsTrait; -use crate::sketch::minhash::{aa_to_dayhoff, translate_codon, KmerMinHash}; +use crate::sketch::minhash::{ + aa_to_dayhoff, aa_to_hp, translate_codon, HashFunctions, KmerMinHash, +}; #[no_mangle] pub unsafe extern "C" fn kmerminhash_new( @@ -14,19 +15,31 @@ pub unsafe extern "C" fn kmerminhash_new( k: u32, prot: bool, dayhoff: bool, + hp: bool, seed: u64, mx: u64, track_abundance: bool, ) -> *mut KmerMinHash { - mem::transmute(Box::new(KmerMinHash::new( + // TODO: at most one of (prot, dayhoff, hp) should be true + + let hash_function = if dayhoff { + HashFunctions::murmur64_dayhoff + } else if hp { + HashFunctions::murmur64_hp + } else if prot { + HashFunctions::murmur64_protein + } else { + HashFunctions::murmur64_DNA + }; + + Box::into_raw(Box::new(KmerMinHash::new( n, k, - prot, - dayhoff, + hash_function, seed, mx, track_abundance, - ))) + ))) as _ } #[no_mangle] @@ -97,8 +110,13 @@ pub unsafe extern "C" fn sourmash_aa_to_dayhoff(aa: c_char) -> c_char { } #[no_mangle] -pub extern "C" fn kmerminhash_remove_hash(ptr: *mut KmerMinHash, h: u64) { - let mh = unsafe { +pub unsafe extern "C" fn sourmash_aa_to_hp(aa: c_char) -> c_char { + aa_to_hp(aa as u8) as c_char +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_remove_hash(ptr: *mut KmerMinHash, h: u64) { + let mh = { assert!(!ptr.is_null()); &mut *ptr }; @@ -107,17 +125,17 @@ pub extern "C" fn kmerminhash_remove_hash(ptr: *mut KmerMinHash, h: u64) { } #[no_mangle] -pub extern "C" fn kmerminhash_remove_many( +pub unsafe extern "C" fn kmerminhash_remove_many( ptr: *mut KmerMinHash, hashes_ptr: *const u64, insize: usize, ) { - let mh = unsafe { + let mh = { assert!(!ptr.is_null()); &mut *ptr 
}; - let hashes = unsafe { + let hashes = { assert!(!hashes_ptr.is_null()); slice::from_raw_parts(hashes_ptr as *mut u64, insize) }; @@ -236,6 +254,15 @@ pub unsafe extern "C" fn kmerminhash_dayhoff(ptr: *mut KmerMinHash) -> bool { mh.dayhoff() } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_hp(ptr: *mut KmerMinHash) -> bool { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.hp() +} + #[no_mangle] pub unsafe extern "C" fn kmerminhash_seed(ptr: *mut KmerMinHash) -> u64 { let mh = { @@ -270,8 +297,8 @@ unsafe fn kmerminhash_enable_abundance(ptr: *mut KmerMinHash) -> Result<()> { &mut *ptr }; - if mh.mins.len() != 0 { - return Err(SourmashError::NonEmptyMinHash.into()); + if !mh.mins.is_empty() { + return Err(SourmashError::NonEmptyMinHash { message: "track_abundance=True".into()}.into()); } mh.abunds = Some(vec![]); @@ -306,6 +333,31 @@ pub unsafe extern "C" fn kmerminhash_max_hash(ptr: *mut KmerMinHash) -> u64 { mh.max_hash() } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_hash_function(ptr: *mut KmerMinHash) -> HashFunctions { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.hash_function() +} + +ffi_fn! { +unsafe fn kmerminhash_hash_function_set(ptr: *mut KmerMinHash, hash_function: HashFunctions) -> Result<()> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + if !mh.mins.is_empty() { + return Err(SourmashError::NonEmptyMinHash { message: "hash_function".into()}.into()); + } + + mh.hash_function = hash_function; + Ok(()) +} +} + ffi_fn! { unsafe fn kmerminhash_merge(ptr: *mut KmerMinHash, other: *const KmerMinHash) -> Result<()> { let mh = { @@ -366,13 +418,29 @@ unsafe fn kmerminhash_intersection(ptr: *mut KmerMinHash, other: *const KmerMinH &*other }; - if let Ok((_, size)) = mh.intersection(other_mh) { + if let Ok((_, size)) = mh.intersection_size(other_mh) { return Ok(size); } Ok(0) } } +ffi_fn! 
{ +unsafe fn kmerminhash_containment_ignore_maxhash(ptr: *mut KmerMinHash, other: *const KmerMinHash) + -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + mh.containment_ignore_maxhash(&other_mh) +} +} + ffi_fn! { unsafe fn kmerminhash_compare(ptr: *mut KmerMinHash, other: *const KmerMinHash) -> Result { diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs index 9c1d5a982e..ec84d16e5a 100644 --- a/src/ffi/mod.rs +++ b/src/ffi/mod.rs @@ -1,8 +1,13 @@ //! # Foreign Function Interface for calling sourmash from a C API //! //! Primary client for now is the Python version, using CFFI and milksnake. +#![allow(clippy::missing_safety_doc)] + +#[macro_use] +pub mod utils; pub mod minhash; +pub mod nodegraph; pub mod signature; use std::ffi::CStr; diff --git a/src/ffi/nodegraph.rs b/src/ffi/nodegraph.rs new file mode 100644 index 0000000000..734ab26286 --- /dev/null +++ b/src/ffi/nodegraph.rs @@ -0,0 +1,181 @@ +use std::ffi::CStr; +use std::os::raw::c_char; +use std::slice; + +use niffler::get_input; + +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::nodegraph::Nodegraph; + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_new() -> *mut Nodegraph { + Box::into_raw(Box::new(Nodegraph::default())) as _ +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_free(ptr: *mut Nodegraph) { + if ptr.is_null() { + return; + } + Box::from_raw(ptr); +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_with_tables( + ksize: usize, + starting_size: usize, + n_tables: usize, +) -> *mut Nodegraph { + Box::into_raw(Box::new(Nodegraph::with_tables( + starting_size, + n_tables, + ksize, + ))) as _ +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_count(ptr: *mut Nodegraph, h: u64) -> bool { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.count(h) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_get(ptr: *mut Nodegraph, h: u64) -> usize { + let ng = { + assert!(!ptr.is_null()); 
+ &mut *ptr + }; + + ng.get(h) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_expected_collisions(ptr: *mut Nodegraph) -> f64 { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.expected_collisions() +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_ksize(ptr: *mut Nodegraph) -> usize { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.ksize() +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_tablesize(ptr: *mut Nodegraph) -> usize { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.tablesize() +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_ntables(ptr: *mut Nodegraph) -> usize { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.ntables() +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_noccupied(ptr: *mut Nodegraph) -> usize { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + ng.noccupied() +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_matches(ptr: *mut Nodegraph, mh_ptr: *mut KmerMinHash) -> usize { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let mh = { + assert!(!mh_ptr.is_null()); + &mut *mh_ptr + }; + + ng.matches(mh) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_update(ptr: *mut Nodegraph, optr: *mut Nodegraph) { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let ong = { + assert!(!optr.is_null()); + &mut *optr + }; + + ng.update(ong); +} + +ffi_fn! { +unsafe fn nodegraph_from_path(filename: *const c_char) -> Result<*mut Nodegraph> { + let c_str = { + assert!(!filename.is_null()); + + CStr::from_ptr(filename) + }; + + let (mut input, _) = get_input(c_str.to_str()?)?; + let ng = Nodegraph::from_reader(&mut input)?; + + Ok(Box::into_raw(Box::new(ng))) +} +} + +ffi_fn! 
{ +unsafe fn nodegraph_from_buffer(ptr: *const c_char, insize: usize) -> Result<*mut Nodegraph> { + let buf = { + assert!(!ptr.is_null()); + slice::from_raw_parts(ptr as *mut u8, insize) + }; + + let ng = Nodegraph::from_reader(&mut &buf[..])?; + + Ok(Box::into_raw(Box::new(ng))) +} +} + +ffi_fn! { +unsafe fn nodegraph_save(ptr: *mut Nodegraph, filename: *const c_char) -> Result<()> { + let ng = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let c_str = { + assert!(!filename.is_null()); + + CStr::from_ptr(filename) + }; + + ng.save(c_str.to_str()?)?; + + Ok(()) +} +} diff --git a/src/ffi/signature.rs b/src/ffi/signature.rs index 093ff95d0c..56e147ebe3 100644 --- a/src/ffi/signature.rs +++ b/src/ffi/signature.rs @@ -3,13 +3,13 @@ use std::io; use std::os::raw::c_char; use std::slice; -use ocf::get_input; +use niffler::get_input; use serde_json; +use crate::ffi::utils::SourmashStr; use crate::signature::Signature; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; -use crate::utils::SourmashStr; // Signature methods @@ -243,8 +243,13 @@ unsafe fn signatures_load_path(ptr: *const c_char, // TODO: implement ignore_md5sum + let k = match ksize { + 0 => None, + x => Some(x) + }; + let (mut input, _) = get_input(buf.to_str()?)?; - let filtered_sigs = Signature::load_signatures(&mut input, ksize, moltype, None)?; + let filtered_sigs = Signature::load_signatures(&mut input, k, moltype, None)?; let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { Box::into_raw(Box::new(x)) as *mut Signature @@ -277,10 +282,15 @@ unsafe fn signatures_load_buffer(ptr: *const c_char, } }; + let k = match ksize { + 0 => None, + x => Some(x) + }; + // TODO: implement ignore_md5sum let mut reader = io::BufReader::new(buf); - let filtered_sigs = Signature::load_signatures(&mut reader, ksize, moltype, None)?; + let filtered_sigs = Signature::load_signatures(&mut reader, k, moltype, None)?; let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { 
Box::into_raw(Box::new(x)) as *mut Signature diff --git a/src/utils.rs b/src/ffi/utils.rs similarity index 97% rename from src/utils.rs rename to src/ffi/utils.rs index 999dd21e4c..68c883c77e 100644 --- a/src/utils.rs +++ b/src/ffi/utils.rs @@ -25,7 +25,7 @@ macro_rules! ffi_fn ( $(#[$attr])* pub unsafe extern "C" fn $name($($aname: $aty,)*) -> $rv { - $crate::utils::landingpad(|| $body) + $crate::ffi::utils::landingpad(|| $body) } ); @@ -39,7 +39,7 @@ macro_rules! ffi_fn ( pub unsafe extern "C" fn $name($($aname: $aty,)*) { // this silences panics and stuff - $crate::utils::landingpad(|| { $body; Ok(0 as ::std::os::raw::c_int) }); + $crate::ffi::utils::landingpad(|| { $body; Ok(0 as ::std::os::raw::c_int) }); } } ); diff --git a/src/from.rs b/src/from.rs index e0846a11d3..67821927a3 100644 --- a/src/from.rs +++ b/src/from.rs @@ -1,15 +1,22 @@ -use finch::minhashes::MinHashKmers; +use finch::sketch_schemes::mash::MashSketcher; +use finch::sketch_schemes::SketchScheme; -use crate::signatures::minhash::KmerMinHash; +use crate::sketch::minhash::{HashFunctions, KmerMinHash}; -impl From for KmerMinHash { - fn from(other: MinHashKmers) -> KmerMinHash { - let values = other.into_vec(); +/* + TODO: + - also convert scaled sketches + - sourmash Signature equivalent is the finch Sketch, write conversions for that too +*/ + +impl From for KmerMinHash { + fn from(other: MashSketcher) -> KmerMinHash { + let values = other.to_vec(); let mut new_mh = KmerMinHash::new( values.len() as u32, values.get(0).unwrap().kmer.len() as u32, - false, + HashFunctions::murmur64_DNA, 42, 0, true, @@ -20,7 +27,9 @@ impl From for KmerMinHash { .map(|x| (x.hash as u64, x.count as u64)) .collect(); - new_mh.add_many_with_abund(&hash_with_abunds); + new_mh + .add_many_with_abund(&hash_with_abunds) + .expect("Error adding hashes with abund"); new_mh } @@ -32,27 +41,30 @@ mod test { use std::collections::HashSet; use std::iter::FromIterator; - use crate::signatures::minhash::KmerMinHash; + use 
crate::signature::SigsTrait; + use crate::sketch::minhash::{HashFunctions, KmerMinHash}; - use finch::minhashes::MinHashKmers; - use needletail::kmer::canonical; + use finch::sketch_schemes::mash::MashSketcher; + use needletail::kmer::CanonicalKmers; + use needletail::Sequence; use super::*; #[test] fn finch_behavior() { - let mut a = KmerMinHash::new(20, 10, false, 42, 0, true); - let mut b = MinHashKmers::new(20, 42); + let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true); + let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; + let rc = seq.reverse_complement(); - a.add_sequence(seq, false); + a.add_sequence(seq, false).unwrap(); - for kmer in seq.windows(10) { - b.push(&canonical(kmer), 0); + for (_, kmer, _) in CanonicalKmers::new(seq, &rc, 10) { + b.push(&kmer, 0); } - let b_hashes = b.into_vec(); + let b_hashes = b.to_vec(); let s1: HashSet<_> = HashSet::from_iter(a.mins.iter().map(|x| *x)); let s2: HashSet<_> = HashSet::from_iter(b_hashes.iter().map(|x| x.hash as u64)); @@ -76,15 +88,16 @@ mod test { #[test] fn from_finch() { - let mut a = KmerMinHash::new(20, 10, false, 42, 0, true); - let mut b = MinHashKmers::new(20, 42); + let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true); + let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; + let rc = seq.reverse_complement(); - a.add_sequence(seq, false); + a.add_sequence(seq, false).unwrap(); - for kmer in seq.windows(10) { - b.push(&canonical(kmer), 0); + for (_, kmer, _) in CanonicalKmers::new(seq, &rc, 10) { + b.push(&kmer, 0); } let c = KmerMinHash::from(b); diff --git a/src/index/bigsi.rs b/src/index/bigsi.rs index 07f67304d4..6fac529a05 100644 --- a/src/index/bigsi.rs +++ b/src/index/bigsi.rs @@ -5,7 +5,7 @@ use failure::{Error, Fail}; use fixedbitset::FixedBitSet; use typed_builder::TypedBuilder; -use crate::index::{Comparable, Index}; +use 
crate::index::Index; use crate::signature::{Signature, SigsTrait}; use crate::sketch::nodegraph::Nodegraph; use crate::sketch::Sketch; @@ -79,22 +79,9 @@ impl BIGSI { } } -impl Index for BIGSI { +impl<'a> Index<'a> for BIGSI { type Item = Signature; - - fn find( - &self, - _search_fn: F, - _sig: &Self::Item, - _threshold: f64, - ) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - // TODO: is there a better way than making this a runtime check? - //Err(BIGSIError::MethodDisabled.into()) - unimplemented!(); - } + //type SignatureIterator = std::slice::Iter<'a, Self::Item>; fn search( &self, @@ -109,12 +96,10 @@ impl Index for BIGSI { let mut counter: HashMap = HashMap::with_capacity(hashes.size()); for hash in &hashes.mins { - self.query(*hash) - .map(|dataset_idx| { - let idx = counter.entry(dataset_idx).or_insert(0); - *idx += 1; - }) - .count(); + self.query(*hash).for_each(|dataset_idx| { + let idx = counter.entry(dataset_idx).or_insert(0); + *idx += 1; + }); } for (idx, count) in counter { @@ -140,8 +125,8 @@ impl Index for BIGSI { } } - fn insert(&mut self, node: &Self::Item) -> Result<(), Error> { - self.add(node.clone()); + fn insert(&mut self, node: Self::Item) -> Result<(), Error> { + self.add(node); Ok(()) } @@ -153,9 +138,19 @@ impl Index for BIGSI { unimplemented!() } - fn datasets(&self) -> Vec { + fn signatures(&self) -> Vec { + unimplemented!() + } + + fn signature_refs(&self) -> Vec<&Self::Item> { unimplemented!() } + + /* + fn iter_signatures(&'a self) -> Self::SignatureIterator { + self.datasets.iter() + } + */ } #[cfg(test)] @@ -163,14 +158,10 @@ mod test { use std::fs::File; use std::io::BufReader; use std::path::PathBuf; - use std::rc::Rc; - - use lazy_init::Lazy; use super::BIGSI; - use crate::index::storage::ReadData; - use crate::index::Dataset; + use crate::index::SigStore; use crate::index::{Index, MHBT}; use crate::signature::Signature; @@ -182,29 +173,20 @@ mod test { let sbt = 
MHBT::from_path(filename).expect("Loading error"); let mut bigsi = BIGSI::new(10000, 10); - let datasets = sbt.datasets(); + let datasets = sbt.signatures(); let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let sigs = + Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); let sig_data = sigs[0].clone(); - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = Dataset::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); + let leaf: SigStore<_> = sig_data.into(); - for l in &datasets { - let data = l.data().unwrap(); - bigsi.insert(data).expect("insertion error!"); + for l in datasets { + bigsi.insert(l).expect("insertion error!"); } let results_sbt = sbt.search(&leaf, 0.5, false).unwrap(); diff --git a/src/index/linear.rs b/src/index/linear.rs index 6def3bd82b..76ce5b6918 100644 --- a/src/index/linear.rs +++ b/src/index/linear.rs @@ -11,15 +11,18 @@ use serde_derive::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; -use crate::index::{Comparable, Dataset, DatasetInfo, Index}; +use crate::index::{Comparable, DatasetInfo, Index, SigStore}; #[derive(TypedBuilder)] -pub struct LinearIndex { +pub struct LinearIndex +where + L: Sync, +{ #[builder(default)] storage: Option>, #[builder(default)] - pub(crate) datasets: Vec, + pub(crate) datasets: Vec>, } #[derive(Serialize, Deserialize)] @@ -29,36 +32,16 @@ struct LinearInfo { leaves: Vec, } -impl Index for LinearIndex +impl<'a, L> Index<'a> for LinearIndex where - L: Clone + Comparable, + L: Sync + Clone + Comparable + 'a, + SigStore: From, { type Item = L; + //type SignatureIterator = 
std::slice::Iter<'a, Self::Item>; - fn find( - &self, - search_fn: F, - sig: &Self::Item, - threshold: f64, - ) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - Ok(self - .datasets - .iter() - .flat_map(|node| { - if search_fn(node, sig, threshold) { - Some(node) - } else { - None - } - }) - .collect()) - } - - fn insert(&mut self, node: &L) -> Result<(), Error> { - self.datasets.push(node.clone()); + fn insert(&mut self, node: L) -> Result<(), Error> { + self.datasets.push(node.into()); Ok(()) } @@ -77,15 +60,31 @@ where unimplemented!() } - fn datasets(&self) -> Vec { - self.datasets.to_vec() + fn signatures(&self) -> Vec { + self.datasets + .iter() + .map(|x| x.data.get().unwrap().clone()) + .collect() } + + fn signature_refs(&self) -> Vec<&Self::Item> { + self.datasets + .iter() + .map(|x| x.data.get().unwrap()) + .collect() + } + + /* + fn iter_signatures(&'a self) -> Self::SignatureIterator { + self.datasets.iter() + } + */ } -impl LinearIndex> +impl LinearIndex where L: std::marker::Sync + ToWriter, - Dataset: ReadData, + SigStore: ReadData, { pub fn save_file>( &mut self, @@ -127,12 +126,12 @@ where mem::replace(&mut l.storage, Some(Rc::clone(&storage))); let filename = (*l).save(&l.filename).unwrap(); - let new_node = DatasetInfo { - filename: filename, + + DatasetInfo { + filename, name: l.name.clone(), metadata: l.metadata.clone(), - }; - new_node + } }) .collect(), }; @@ -143,7 +142,7 @@ where Ok(()) } - pub fn from_path>(path: P) -> Result>, Error> { + pub fn from_path>(path: P) -> Result, Error> { let file = File::open(&path)?; let mut reader = BufReader::new(file); @@ -153,12 +152,11 @@ where basepath.push(path); basepath.canonicalize()?; - let linear = - LinearIndex::>::from_reader(&mut reader, &basepath.parent().unwrap())?; + let linear = LinearIndex::::from_reader(&mut reader, &basepath.parent().unwrap())?; Ok(linear) } - pub fn from_reader(rdr: &mut R, path: P) -> Result>, Error> + pub fn from_reader(rdr: &mut 
R, path: P) -> Result, Error> where R: Read, P: AsRef, @@ -177,15 +175,12 @@ where datasets: linear .leaves .into_iter() - .map(|l| { - let new_node = Dataset { - filename: l.filename, - name: l.name, - metadata: l.metadata, - storage: Some(Rc::clone(&storage)), - data: Rc::new(Lazy::new()), - }; - new_node + .map(|l| SigStore { + filename: l.filename, + name: l.name, + metadata: l.metadata, + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), }) .collect(), }) diff --git a/src/index/mod.rs b/src/index/mod.rs index adb07267ef..9f5ecdbe85 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -11,10 +11,10 @@ pub mod storage; pub mod search; +use std::ops::Deref; use std::path::Path; use std::rc::Rc; -use cfg_if::cfg_if; use failure::Error; use lazy_init::Lazy; use serde_derive::{Deserialize, Serialize}; @@ -25,21 +25,28 @@ use crate::index::search::{search_minhashes, search_minhashes_containment}; use crate::index::storage::{ReadData, ReadDataError, Storage}; use crate::signature::Signature; use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::ukhs::{FlatUKHS, UKHSTrait}; use crate::sketch::Sketch; -pub type MHBT = SBT, Dataset>; -pub type UKHSTree = SBT, Dataset>; +/* FIXME: bring back after boomphf changes +use crate::sketch::ukhs::{FlatUKHS, UKHSTrait}; +pub type UKHSTree = SBT, Signature>; +*/ + +pub type MHBT = SBT, Signature>; +/* FIXME: bring back after MQF works on macOS and Windows +use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(not(target_arch = "wasm32"))] { use mqf::MQF; - pub type MHMT = SBT, Dataset>; + pub type MHMT = SBT, Signature>; } } +*/ -pub trait Index { - type Item; +pub trait Index<'a> { + type Item: Comparable; + //type SignatureIterator: Iterator; fn find( &self, @@ -48,7 +55,20 @@ pub trait Index { threshold: f64, ) -> Result, Error> where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool; + F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, + { + Ok(self + .signature_refs() + .into_iter() + .flat_map(|node| { + if search_fn(&node, sig, threshold) { + Some(node) + } else { + None + } + }) + .collect()) + } fn search( &self, @@ -65,13 +85,27 @@ pub trait Index { //fn gather(&self, sig: &Self::Item, threshold: f64) -> Result, Error>; - fn insert(&mut self, node: &Self::Item) -> Result<(), Error>; + fn insert(&mut self, node: Self::Item) -> Result<(), Error>; + + fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { + for node in nodes { + self.insert(node)?; + } + + Ok(()) + } fn save>(&self, path: P) -> Result<(), Error>; fn load>(path: P) -> Result<(), Error>; - fn datasets(&self) -> Vec; + fn signatures(&self) -> Vec; + + fn signature_refs(&self) -> Vec<&Self::Item>; + + /* + fn iter_signatures(&self) -> Self::SignatureIterator; + */ } // TODO: split into two traits, Similarity and Containment? 
@@ -101,7 +135,7 @@ pub struct DatasetInfo { } #[derive(TypedBuilder, Default, Clone)] -pub struct Dataset +pub struct SigStore where T: std::marker::Sync, { @@ -114,7 +148,7 @@ where pub(crate) data: Rc>, } -impl Dataset +impl SigStore where T: std::marker::Sync + Default, { @@ -123,20 +157,20 @@ where } } -impl std::fmt::Debug for Dataset +impl std::fmt::Debug for SigStore where T: std::marker::Sync, { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "Dataset [filename: {}, name: {}, metadata: {}]", + "SigStore [filename: {}, name: {}, metadata: {}]", self.filename, self.name, self.metadata ) } } -impl ReadData for Dataset { +impl ReadData for SigStore { fn data(&self) -> Result<&Signature, Error> { if let Some(sig) = self.data.get() { Ok(sig) @@ -160,8 +194,8 @@ impl ReadData for Dataset { } } -impl Dataset { - pub fn count_common(&self, other: &Dataset) -> u64 { +impl SigStore { + pub fn count_common(&self, other: &SigStore) -> u64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -188,21 +222,29 @@ impl Dataset { } } -impl From> for Signature { - fn from(other: Dataset) -> Signature { +impl From> for Signature { + fn from(other: SigStore) -> Signature { other.data.get().unwrap().to_owned() } } -impl From for Dataset { - fn from(other: Signature) -> Dataset { +impl Deref for SigStore { + type Target = Signature; + + fn deref(&self) -> &Signature { + self.data.get().unwrap() + } +} + +impl From for SigStore { + fn from(other: Signature) -> SigStore { let name = other.name(); let filename = other.filename(); let data = Lazy::new(); data.get_or_create(|| other); - Dataset::builder() + SigStore::builder() .name(name) .filename(filename) .data(data) @@ -212,8 +254,8 @@ impl From for Dataset { } } -impl Comparable> for Dataset { - fn similarity(&self, other: &Dataset) -> f64 { +impl Comparable> for SigStore { + fn similarity(&self, other: &SigStore) -> f64 { let ng: &Signature = 
self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -225,16 +267,18 @@ impl Comparable> for Dataset { } } + /* FIXME: bring back after boomphf changes if let Sketch::UKHS(mh) = &ng.signatures[0] { if let Sketch::UKHS(omh) = &ong.signatures[0] { return 1. - mh.distance(&omh); } } + */ unimplemented!() } - fn containment(&self, other: &Dataset) -> f64 { + fn containment(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -250,3 +294,38 @@ impl Comparable> for Dataset { unimplemented!() } } + +impl Comparable for Signature { + fn similarity(&self, other: &Signature) -> f64 { + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &self.signatures[0] { + if let Sketch::MinHash(omh) = &other.signatures[0] { + return mh.compare(&omh).unwrap(); + } + } + + /* FIXME: bring back after boomphf changes + if let Sketch::UKHS(mh) = &self.signatures[0] { + if let Sketch::UKHS(omh) = &other.signatures[0] { + return 1. - mh.distance(&omh); + } + } + */ + + unimplemented!() + } + + fn containment(&self, other: &Signature) -> f64 { + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? 
+ if let Sketch::MinHash(mh) = &self.signatures[0] { + if let Sketch::MinHash(omh) = &other.signatures[0] { + let common = mh.count_common(&omh).unwrap(); + let size = mh.mins.len(); + return common as f64 / size as f64; + } + } + unimplemented!() + } +} diff --git a/src/index/sbt/mhbt.rs b/src/index/sbt/mhbt.rs index 46f710e521..b26975f6de 100644 --- a/src/index/sbt/mhbt.rs +++ b/src/index/sbt/mhbt.rs @@ -1,10 +1,13 @@ +use std::collections::HashMap; use std::io::Write; +use std::rc::Rc; use failure::Error; +use lazy_init::Lazy; -use crate::index::sbt::{FromFactory, Node, Update, SBT}; +use crate::index::sbt::{Factory, FromFactory, Node, Update, SBT}; use crate::index::storage::{ReadData, ReadDataError, ToWriter}; -use crate::index::{Comparable, Dataset}; +use crate::index::Comparable; use crate::signature::{Signature, SigsTrait}; use crate::sketch::nodegraph::Nodegraph; use crate::sketch::Sketch; @@ -18,9 +21,24 @@ impl ToWriter for Nodegraph { } } -impl FromFactory> for SBT, L> { - fn factory(&self, _name: &str) -> Result, Error> { - unimplemented!() +impl FromFactory> for SBT, L> { + fn factory(&self, name: &str) -> Result, Error> { + match self.factory { + Factory::GraphFactory { args: (k, t, n) } => { + let n = Nodegraph::with_tables(t as usize, n as usize, k as usize); + + let data = Lazy::new(); + data.get_or_create(|| n); + + Ok(Node::builder() + .filename(name) + .name(name) + .metadata(HashMap::default()) + .storage(self.storage()) + .data(Rc::new(data)) + .build()) + } + } } } @@ -30,9 +48,35 @@ impl Update> for Node { } } -impl Update> for Dataset { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); +impl Update> for Signature { + fn update(&self, parent: &mut Node) -> Result<(), Error> { + // TODO: avoid copy here + let mut parent_data = parent.data()?.clone(); + + if let Sketch::MinHash(sig) = &self.signatures[0] { + sig.mins.iter().for_each(|h| { + parent_data.count(*h); + }); + + let min_n_below = parent + .metadata + 
.entry("min_n_below".into()) + .or_insert(u64::max_value()); + + *min_n_below = u64::min(sig.size() as u64, *min_n_below); + if *min_n_below == 0 { + *min_n_below = 1 + } + } else { + //TODO what if it is not a minhash? + unimplemented!() + } + + let data = Lazy::new(); + data.get_or_create(|| parent_data); + parent.data = Rc::new(data); + + Ok(()) } } @@ -50,13 +94,12 @@ impl Comparable> for Node { } } -impl Comparable> for Node { - fn similarity(&self, other: &Dataset) -> f64 { +impl Comparable for Node { + fn similarity(&self, other: &Signature) -> f64 { let ng: &Nodegraph = self.data().unwrap(); - let oth: &Signature = other.data().unwrap(); // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &oth.signatures[0] { + if let Sketch::MinHash(sig) = &other.signatures[0] { if sig.size() == 0 { return 0.0; } @@ -74,12 +117,11 @@ impl Comparable> for Node { } } - fn containment(&self, other: &Dataset) -> f64 { + fn containment(&self, other: &Signature) -> f64 { let ng: &Nodegraph = self.data().unwrap(); - let oth: &Signature = other.data().unwrap(); // TODO: select the right signatures... 
- if let Sketch::MinHash(sig) = &oth.signatures[0] { + if let Sketch::MinHash(sig) = &other.signatures[0] { if sig.size() == 0 { return 0.0; } @@ -108,3 +150,226 @@ impl ReadData for Node { } } } + +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::{BufReader, Seek, SeekFrom}; + use std::path::PathBuf; + use std::rc::Rc; + use tempfile; + + use assert_matches::assert_matches; + use lazy_init::Lazy; + + use super::Factory; + + use crate::index::linear::LinearIndex; + use crate::index::sbt::scaffold; + use crate::index::search::{search_minhashes, search_minhashes_containment}; + use crate::index::storage::ReadData; + use crate::index::{Index, SigStore, MHBT}; + use crate::signature::Signature; + + #[test] + fn save_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let mut sbt = MHBT::from_path(filename).expect("Loading error"); + + let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); + sbt.save_file(tmpfile.path(), None).unwrap(); + + tmpfile.seek(SeekFrom::Start(0)).unwrap(); + } + + #[test] + fn load_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = MHBT::from_path(filename).expect("Loading error"); + + assert_eq!(sbt.d, 2); + //assert_eq!(sbt.storage.backend, "FSStorage"); + //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); + //assert_matches!(&sbt.storage, ::FSStorage(args) => { + // assert_eq!(args, &[1, 100000, 4]); + //}); + assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { + assert_eq!(args, &(1, 100000.0, 4)); + }); + + println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename).unwrap()); + let sigs = + Signature::load_signatures(&mut reader, Some(31), 
Some("DNA".into()), None).unwrap(); + let sig_data = sigs[0].clone(); + + let data = Lazy::new(); + data.get_or_create(|| sig_data); + + let leaf = SigStore::builder() + .data(Rc::new(data)) + .filename("") + .name("") + .metadata("") + .storage(None) + .build(); + + let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); + assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); + for l in &sbt.leaves { + linear.insert(l.1.data().unwrap().clone()).unwrap(); + } + + println!( + "linear leaves {:?} {:?}", + linear.datasets.len(), + linear.datasets + ); + + let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); + assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.5) + .unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.1) + .unwrap(); + assert_eq!(results.len(), 4); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + } + + #[test] + #[ignore] + fn roundtrip_sbt() -> Result<(), Box> { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = MHBT::from_path(filename)?; + + assert_eq!(sbt.d, 2); + //assert_eq!(sbt.storage.backend, "FSStorage"); + //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); + //assert_matches!(&sbt.storage, ::FSStorage(args) => { + // 
assert_eq!(args, &[1, 100000, 4]); + //}); + assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { + assert_eq!(args, &(1, 100000.0, 4)); + }); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename)?); + let sigs = Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None)?; + let sig_data = sigs[0].clone(); + + let leaf: SigStore<_> = sig_data.into(); + + let results = sbt.find(search_minhashes, &leaf, 0.5)?; + assert_eq!(results.len(), 1); + //println!("results: {:?}", results); + //println!("leaf: {:?}", leaf); + + let results = sbt.find(search_minhashes, &leaf, 0.1)?; + assert_eq!(results.len(), 2); + //println!("results: {:?}", results); + //println!("leaf: {:?}", leaf); + + println!("sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); + println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); + + let mut new_sbt: MHBT = MHBT::builder().storage(None).build(); + let datasets = sbt.signatures(); + for l in datasets { + new_sbt.insert(l)?; + } + + for (i, node) in &sbt.nodes { + assert_eq!(node.data().unwrap(), new_sbt.nodes[i].data().unwrap()); + } + + assert_eq!(new_sbt.signature_refs().len(), 7); + println!("new_sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); + println!("new_sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); + + let results = new_sbt.find(search_minhashes, &leaf, 0.5)?; + //println!("results: {:?}", results); + //println!("leaf: {:?}", leaf); + assert_eq!(results.len(), 1); + + let results = new_sbt.find(search_minhashes, &leaf, 0.1)?; + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + assert_eq!(results.len(), 2); + + let results = new_sbt.find(search_minhashes_containment, &leaf, 0.5)?; + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + assert_eq!(results.len(), 2); + + let results = 
new_sbt.find(search_minhashes_containment, &leaf, 0.1)?; + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + assert_eq!(results.len(), 4); + + Ok(()) + } + + #[test] + fn scaffold_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = MHBT::from_path(filename).expect("Loading error"); + + let new_sbt: MHBT = scaffold(sbt.leaves(), sbt.storage()); + + assert_eq!(new_sbt.signatures().len(), 7); + } + + #[test] + fn load_v4() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v4.sbt.json"); + + let _sbt = MHBT::from_path(filename).expect("Loading error"); + } + + #[test] + fn load_v5() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let _sbt = MHBT::from_path(filename).expect("Loading error"); + } +} diff --git a/src/index/sbt/mhmt.rs b/src/index/sbt/mhmt.rs index a53834750d..c2baa7496a 100644 --- a/src/index/sbt/mhmt.rs +++ b/src/index/sbt/mhmt.rs @@ -5,7 +5,7 @@ use mqf::MQF; use crate::index::sbt::{FromFactory, Node, Update, SBT}; use crate::index::storage::{ReadData, ReadDataError, ToWriter}; -use crate::index::{Comparable, Dataset}; +use crate::index::Comparable; use crate::signature::{Signature, SigsTrait}; use crate::sketch::Sketch; @@ -34,7 +34,7 @@ impl ReadData for Node { // TODO: using tempfile for now, but ideally want to avoid that let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - tmpfile.write_all(&mut &raw[..]).unwrap(); + tmpfile.write_all(&raw[..]).unwrap(); MQF::deserialize(tmpfile.path()).unwrap() })) @@ -46,7 +46,7 @@ impl ReadData for Node { } } -impl FromFactory> for SBT, L> { +impl FromFactory> for SBT, L> { fn factory(&self, _name: &str) -> Result, Error> { unimplemented!() } @@ -58,7 +58,7 @@ impl Update> for Node { } } -impl Update> for Dataset { +impl Update> for Signature { fn update(&self, _other: &mut Node) -> 
Result<(), Error> { unimplemented!(); } @@ -66,27 +66,26 @@ impl Update> for Dataset { impl Comparable> for Node { fn similarity(&self, other: &Node) -> f64 { - let ng: &MQF = self.data().unwrap(); - let ong: &MQF = other.data().unwrap(); + let _ng: &MQF = self.data().unwrap(); + let _ong: &MQF = other.data().unwrap(); unimplemented!(); //ng.similarity(&ong) } fn containment(&self, other: &Node) -> f64 { - let ng: &MQF = self.data().unwrap(); - let ong: &MQF = other.data().unwrap(); + let _ng: &MQF = self.data().unwrap(); + let _ong: &MQF = other.data().unwrap(); unimplemented!(); //ng.containment(&ong) } } -impl Comparable> for Node { - fn similarity(&self, other: &Dataset) -> f64 { +impl Comparable for Node { + fn similarity(&self, other: &Signature) -> f64 { let ng: &MQF = self.data().unwrap(); - let oth: &Signature = other.data().unwrap(); // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &oth.signatures[0] { + if let Sketch::MinHash(sig) = &other.signatures[0] { if sig.size() == 0 { return 0.0; } @@ -109,12 +108,11 @@ impl Comparable> for Node { } } - fn containment(&self, other: &Dataset) -> f64 { + fn containment(&self, other: &Signature) -> f64 { let ng: &MQF = self.data().unwrap(); - let oth: &Signature = other.data().unwrap(); // TODO: select the right signatures... 
- if let Sketch::MinHash(sig) = &oth.signatures[0] { + if let Sketch::MinHash(sig) = &other.signatures[0] { if sig.size() == 0 { return 0.0; } @@ -133,3 +131,97 @@ impl Comparable> for Node { } } } + +/* FIXME: bring back after MQF works on macOS and Windows +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::{BufReader, Seek, SeekFrom}; + use std::path::PathBuf; + use std::rc::Rc; + use tempfile; + + use assert_matches::assert_matches; + use lazy_init::Lazy; + + use super::{scaffold, Factory}; + + use crate::index::linear::LinearIndex; + use crate::index::search::{search_minhashes, search_minhashes_containment}; + use crate::index::storage::ReadData; + use crate::index::{Index, SigStore, MHBT}; + use crate::signature::Signature; + + #[cfg(not(target_arch = "wasm32"))] + #[test] + fn load_mhmt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5_mhmt.sbt.json"); + + let mut sbt = crate::index::MHMT::from_path(filename).expect("Loading error"); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename).unwrap()); + let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let sig_data = sigs[0].clone(); + + let data = Lazy::new(); + data.get_or_create(|| sig_data); + + let leaf = SigStore::builder() + .data(Rc::new(data)) + .filename("") + .name("") + .metadata("") + .storage(None) + .build(); + + let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); + //assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); + for l in &sbt.leaves { + 
linear.insert(l.1.data().unwrap().clone()).unwrap(); + } + + println!( + "linear leaves {:?} {:?}", + linear.datasets.len(), + linear.datasets + ); + + let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); + assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.5) + .unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.1) + .unwrap(); + assert_eq!(results.len(), 4); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + } + */ +} diff --git a/src/index/sbt/mod.rs b/src/index/sbt/mod.rs index 139b6913c1..8262262a98 100644 --- a/src/index/sbt/mod.rs +++ b/src/index/sbt/mod.rs @@ -1,8 +1,13 @@ pub mod mhbt; + +/* FIXME: bring back after boomphf changes pub mod ukhs; +*/ +/* FIXME: bring back after MQF works on macOS and Windows #[cfg(not(target_arch = "wasm32"))] pub mod mhmt; +*/ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -17,11 +22,12 @@ use std::rc::Rc; use failure::Error; use lazy_init::Lazy; +use log::info; use serde_derive::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; -use crate::index::{Comparable, Dataset, DatasetInfo, Index}; +use crate::index::{Comparable, DatasetInfo, Index, SigStore}; use crate::signature::Signature; pub trait Update { @@ -33,7 +39,10 @@ pub trait FromFactory { } #[derive(TypedBuilder)] -pub struct SBT { +pub struct SBT +where + L: Sync, +{ #[builder(default = 2)] d: u32, @@ -47,7 +56,7 @@ pub struct SBT { nodes: HashMap, #[builder(default_code = 
"HashMap::default()")] - leaves: HashMap, + leaves: HashMap>, } const fn parent(pos: u64, d: u64) -> u64 { @@ -60,7 +69,7 @@ const fn child(parent: u64, pos: u64, d: u64) -> u64 { impl SBT where - L: std::clone::Clone + Default, + L: std::clone::Clone + Default + Sync, N: Default, { #[inline(always)] @@ -86,15 +95,32 @@ where self.storage.clone() } + /* + fn fill_up(&mut self) -> Result<(), Error> { + let mut visited = HashSet::new(); + let mut queue: Vec<_> = self.leaves.keys().collect(); + + while !queue.is_empty() { + let pos = queue.pop().unwrap(); + + if !visited.contains(&pos) { + visited.insert(pos); + } + } + + Ok(()) + } + */ + // combine } -impl SBT, Dataset> +impl SBT, T> where - T: std::marker::Sync + ToWriter, + T: std::marker::Sync + ToWriter + Clone, U: std::marker::Sync + ToWriter, Node: ReadData, - Dataset: ReadData, + SigStore: ReadData, { fn parse_v4(rdr: &mut R) -> Result where @@ -112,7 +138,7 @@ where Ok(SBTInfo::V5(sinfo)) } - pub fn from_reader(rdr: &mut R, path: P) -> Result, Dataset>, Error> + pub fn from_reader(rdr: &mut R, path: P) -> Result, T>, Error> where R: Read, P: AsRef, @@ -165,7 +191,7 @@ where .leaves .into_iter() .map(|(n, l)| { - let new_node = Dataset { + let new_node = SigStore { filename: l.filename, name: l.name, metadata: l.metadata, @@ -192,7 +218,7 @@ where }; Some((*n, new_node)) } - NodeInfoV4::Dataset(_) => None, + NodeInfoV4::Leaf(_) => None, }) .collect(); @@ -201,8 +227,8 @@ where .into_iter() .filter_map(|(n, x)| match x { NodeInfoV4::Node(_) => None, - NodeInfoV4::Dataset(l) => { - let new_node = Dataset { + NodeInfoV4::Leaf(l) => { + let new_node = SigStore { filename: l.filename, name: l.name, metadata: l.metadata, @@ -227,7 +253,7 @@ where }) } - pub fn from_path>(path: P) -> Result, Dataset>, Error> { + pub fn from_path>(path: P) -> Result, T>, Error> { let file = File::open(&path)?; let mut reader = BufReader::new(file); @@ -238,8 +264,7 @@ where // TODO: canonicalize doesn't work on wasm32-wasi 
//basepath.canonicalize()?; - let sbt = - SBT::, Dataset>::from_reader(&mut reader, &basepath.parent().unwrap())?; + let sbt = SBT::, T>::from_reader(&mut reader, &basepath.parent().unwrap())?; Ok(sbt) } @@ -286,7 +311,7 @@ where let filename = (*l).save(&l.filename).unwrap(); let new_node = NodeInfo { - filename: filename, + filename, name: l.name.clone(), metadata: l.metadata.clone(), }; @@ -303,9 +328,10 @@ where // set storage to new one mem::replace(&mut l.storage, Some(Rc::clone(&storage))); + // TODO: this should be l.md5sum(), not l.filename let filename = (*l).save(&l.filename).unwrap(); let new_node = DatasetInfo { - filename: filename, + filename, name: l.name.clone(), metadata: l.metadata.clone(), }; @@ -319,13 +345,18 @@ where Ok(()) } + + pub fn leaves(&self) -> Vec> { + self.leaves.values().cloned().collect() + } } -impl Index for SBT +impl<'a, N, L> Index<'a> for SBT where N: Comparable + Comparable + Update + Debug + Default, - L: Comparable + Update + Clone + Debug + Default, + L: Comparable + Update + Clone + Debug + Default + Sync, SBT: FromFactory, + SigStore: From + ReadData, { type Item = L; @@ -339,6 +370,7 @@ where while !queue.is_empty() { let pos = queue.pop().unwrap(); + if !visited.contains(&pos) { visited.insert(pos); @@ -349,8 +381,9 @@ where } } } else if let Some(leaf) = self.leaves.get(&pos) { - if search_fn(leaf, sig, threshold) { - matches.push(leaf); + let data = leaf.data().expect("Error reading data"); + if search_fn(data, sig, threshold) { + matches.push(data); } } } @@ -359,11 +392,11 @@ where Ok(matches) } - fn insert(&mut self, dataset: &L) -> Result<(), Error> { + fn insert(&mut self, dataset: L) -> Result<(), Error> { if self.leaves.is_empty() { // in this case the tree is empty, // just add the dataset to the first available leaf - self.leaves.entry(0).or_insert(dataset.clone()); + self.leaves.entry(0).or_insert_with(|| dataset.into()); return Ok(()); } @@ -373,6 +406,7 @@ where // TODO: find position by similarity 
search let pos = self.leaves.keys().max().unwrap() + 1; let parent_pos = self.parent(pos).unwrap(); + let final_pos; if let Entry::Occupied(pnode) = self.leaves.entry(parent_pos) { // Case 1: parent is a Leaf @@ -384,7 +418,7 @@ where // for each children update the parent node // TODO: write the update method - leaf.update(&mut new_node)?; + leaf.data.get().unwrap().update(&mut new_node)?; dataset.update(&mut new_node)?; // node and parent are children of new internal node @@ -393,7 +427,8 @@ where let c2_pos = c_pos.next().unwrap(); self.leaves.entry(c1_pos).or_insert(leaf); - self.leaves.entry(c2_pos).or_insert(dataset.clone()); + self.leaves.entry(c2_pos).or_insert_with(|| dataset.into()); + final_pos = c2_pos; // add the new internal node to self.nodes[parent_pos) // TODO check if it is really empty? @@ -409,26 +444,31 @@ where // (if there isn't an empty spot, it was already covered by case 1) Entry::Occupied(mut pnode) => { dataset.update(&mut pnode.get_mut())?; - self.leaves.entry(pos).or_insert(dataset.clone()); + self.leaves.entry(pos).or_insert_with(|| dataset.into()); + final_pos = pos; } // Case 3: parent is None/empty // this can happen with d != 2, need to create parent node Entry::Vacant(pnode) => { - self.leaves.entry(c_pos).or_insert(dataset.clone()); dataset.update(&mut new_node)?; + self.leaves.entry(c_pos).or_insert_with(|| dataset.into()); + final_pos = c_pos; pnode.insert(new_node); } } } + let entry = &self.leaves[&final_pos]; + let data = entry.data.get().unwrap(); + let mut parent_pos = parent_pos; while let Some(ppos) = self.parent(parent_pos) { if let Entry::Occupied(mut pnode) = self.nodes.entry(parent_pos) { //TODO: use children for this node to update, instead of dragging // dataset up to the root? It would be more generic, but this // works for minhash, draff signatures and nodegraphs... 
- dataset.update(&mut pnode.get_mut())?; + data.update(&mut pnode.get_mut())?; } parent_pos = ppos; } @@ -436,17 +476,37 @@ where Ok(()) } + /* + fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { + self = scaffold(nodes, self.storage()); + Ok(()) + } + */ + fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() + unimplemented!(); } fn load>(_path: P) -> Result<(), Error> { unimplemented!() } - fn datasets(&self) -> Vec { - self.leaves.values().cloned().collect() + fn signatures(&self) -> Vec { + self.leaves + .values() + .map(|x| x.data().unwrap().clone()) + .collect() + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + self.leaves.values().map(|x| x.data().unwrap()).collect() } + + /* + fn iter_signatures(&'a self) -> Self::SignatureIterator { + self.leaves.values() + } + */ } /* @@ -497,7 +557,17 @@ where } } -impl Dataset +impl PartialEq for Node +where + T: Sync + PartialEq, + Node: ReadData, +{ + fn eq(&self, other: &Node) -> bool { + self.data().unwrap() == other.data().unwrap() + } +} + +impl SigStore where T: Sync + ToWriter, { @@ -544,7 +614,7 @@ struct NodeInfo { #[serde(untagged)] enum NodeInfoV4 { Node(NodeInfo), - Dataset(DatasetInfo), + Leaf(DatasetInfo), } #[derive(Serialize, Deserialize)] @@ -603,7 +673,7 @@ type HashIntersection = HashSet>; enum BinaryTree { Empty, Internal(Box>), - Dataset(Box>>), + Leaf(Box>>), } struct TreeNode { @@ -613,20 +683,20 @@ struct TreeNode { } pub fn scaffold( - mut datasets: Vec>, + mut datasets: Vec>, storage: Option>, -) -> SBT, Dataset> +) -> SBT, Signature> where N: std::marker::Sync + std::clone::Clone + std::default::Default, { - let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); + let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); let mut next_round = Vec::new(); // generate two bottom levels: // - datasets // - first level of internal nodes - eprintln!("Start processing leaves"); + info!("Start processing leaves"); while !datasets.is_empty() { let 
next_leaf = datasets.pop().unwrap(); @@ -655,7 +725,7 @@ where .cloned() .collect(); - let simleaf_tree = BinaryTree::Dataset(Box::new(TreeNode { + let simleaf_tree = BinaryTree::Leaf(Box::new(TreeNode { element: similar_leaf, left: BinaryTree::Empty, right: BinaryTree::Empty, @@ -663,7 +733,7 @@ where (simleaf_tree, in_common) }; - let leaf_tree = BinaryTree::Dataset(Box::new(TreeNode { + let leaf_tree = BinaryTree::Leaf(Box::new(TreeNode { element: next_leaf, left: BinaryTree::Empty, right: BinaryTree::Empty, @@ -678,15 +748,15 @@ where next_round.push(tree); if next_round.len() % 100 == 0 { - eprintln!("Processed {} leaves", next_round.len() * 2); + info!("Processed {} leaves", next_round.len() * 2); } } - eprintln!("Finished processing leaves"); + info!("Finished processing leaves"); // while we don't get to the root, generate intermediary levels while next_round.len() != 1 { next_round = BinaryTree::process_internal_level(next_round); - eprintln!("Finished processing round {}", next_round.len()); + info!("Finished processing round {}", next_round.len()); } // Convert from binary tree to nodes/leaves @@ -700,7 +770,7 @@ where visited.insert(pos); match cnode { - BinaryTree::Dataset(leaf) => { + BinaryTree::Leaf(leaf) => { leaves.insert(pos, leaf.element); } BinaryTree::Internal(mut node) => { @@ -761,7 +831,7 @@ impl BinaryTree { BinaryTree::Empty => { std::mem::replace(&mut el1.element, HashIntersection::default()) } - _ => panic!("Should not see a Dataset at this level"), + _ => panic!("Should not see a Leaf at this level"), } } else { HashIntersection::default() @@ -784,217 +854,14 @@ impl BinaryTree { } } -#[cfg(test)] -mod test { - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - use std::rc::Rc; - use tempfile; - - use assert_matches::assert_matches; - use lazy_init::Lazy; - - use super::{scaffold, Factory}; - - use crate::index::linear::LinearIndex; - use crate::index::search::{search_minhashes, 
search_minhashes_containment}; - use crate::index::{Dataset, Index, MHBT}; - use crate::signature::Signature; - - #[test] - fn save_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5.sbt.json"); - - let mut sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - sbt.save_file(tmpfile.path(), None).unwrap(); - - tmpfile.seek(SeekFrom::Start(0)).unwrap(); - } - - #[cfg(not(target_arch = "wasm32"))] - #[test] - fn load_mhmt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5_mhmt.sbt.json"); - - let mut sbt = crate::index::MHMT::from_path(filename).expect("Loading error"); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); - let sig_data = sigs[0].clone(); - - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = Dataset::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - //assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1).unwrap(); - } - - println!( - "linear leaves {:?} {:?}", - linear.datasets.len(), - linear.datasets - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - 
println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - - #[test] - fn load_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); - let sig_data = sigs[0].clone(); - - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = Dataset::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - 
assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1).unwrap(); - } - - println!( - "linear leaves {:?} {:?}", - linear.datasets.len(), - linear.datasets - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - - #[test] - fn scaffold_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let new_sbt: MHBT = scaffold(sbt.datasets(), sbt.storage()); - - assert_eq!(new_sbt.datasets().len(), 7); - } - - #[test] - fn load_v4() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v4.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } - - #[test] - fn load_v5() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); +/* +impl From> for SBT, Signature> +where + U: Sync + Default + Clone, +{ + fn from(other: LinearIndex) -> Self { + let storage = other.storage(); + scaffold(other.datasets, storage) } } +*/ diff 
--git a/src/index/sbt/ukhs.rs b/src/index/sbt/ukhs.rs index 2fac9c7291..30d3573cc4 100644 --- a/src/index/sbt/ukhs.rs +++ b/src/index/sbt/ukhs.rs @@ -7,12 +7,12 @@ use lazy_init::Lazy; use crate::index::sbt::{FromFactory, Node, Update, SBT}; use crate::index::storage::{ReadData, ReadDataError}; -use crate::index::{Comparable, Dataset}; +use crate::index::Comparable; use crate::signature::Signature; use crate::sketch::ukhs::{FlatUKHS, UKHSTrait}; use crate::sketch::Sketch; -impl FromFactory> for SBT, L> { +impl FromFactory> for SBT, L> { fn factory(&self, name: &str) -> Result, Error> { let data = Lazy::new(); // TODO: don't hardcode this! @@ -34,15 +34,13 @@ impl Update> for Node { } } -impl Update> for Dataset { +impl Update> for Signature { fn update(&self, other: &mut Node) -> Result<(), Error> { - let data = &self.data()?; - - let sigs = if data.signatures.len() > 1 { + let sigs = if self.signatures.len() > 1 { // TODO: select the right signatures... unimplemented!() } else { - &data.signatures[0] + &self.signatures[0] }; if let Sketch::UKHS(sig) = sigs { @@ -73,14 +71,12 @@ impl Comparable> for Node { } } -impl Comparable> for Node { - fn similarity(&self, other: &Dataset) -> f64 { - let odata = other.data().unwrap(); - - if odata.signatures.len() > 1 { +impl Comparable for Node { + fn similarity(&self, other: &Signature) -> f64 { + if other.signatures.len() > 1 { // TODO: select the right signatures... unimplemented!() - } else if let Sketch::UKHS(o_sig) = &odata.signatures[0] { + } else if let Sketch::UKHS(o_sig) = &other.signatures[0] { // This is doing a variation of Weighted Jaccard. 
// The internal nodes are built with max(l_i, r_i) for each // left and right children, so if we do a WJ similarity directly @@ -108,7 +104,7 @@ impl Comparable> for Node { } } - fn containment(&self, _other: &Dataset) -> f64 { + fn containment(&self, _other: &Signature) -> f64 { unimplemented!(); } } diff --git a/src/index/storage.rs b/src/index/storage.rs index 184a6805ca..b1e83aaa07 100644 --- a/src/index/storage.rs +++ b/src/index/storage.rs @@ -44,7 +44,7 @@ impl From<&StorageArgs> for FSStorage { fullpath.push(path); FSStorage { - fullpath: fullpath, + fullpath, subdir: path.clone(), } } @@ -105,7 +105,7 @@ impl Storage for FSStorage { let file = File::create(&fpath)?; let mut buf_writer = BufWriter::new(file); - buf_writer.write(content)?; + buf_writer.write_all(content)?; Ok(path.into()) } diff --git a/src/lib.rs b/src/lib.rs index 2410941b4c..6add4804eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,9 +18,6 @@ pub mod errors; -#[macro_use] -pub mod utils; - pub mod index; pub mod signature; diff --git a/src/signature.rs b/src/signature.rs index dd83d8045d..07823a43c3 100644 --- a/src/signature.rs +++ b/src/signature.rs @@ -82,6 +82,7 @@ pub struct Signature { #[builder(default)] pub filename: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub name: Option, #[serde(default = "default_license")] @@ -154,7 +155,7 @@ impl Signature { pub fn load_signatures( buf: &mut R, - ksize: usize, + ksize: Option, moltype: Option<&str>, _scaled: Option, ) -> Result, Error> @@ -181,33 +182,41 @@ impl Signature { .filter(|sig| { match sig { Sketch::MinHash(mh) => { - if ksize == 0 || ksize == mh.ksize() as usize { - match moltype { - Some(x) => { - if (x.to_lowercase() == "dna" && !mh.is_protein()) - || (x.to_lowercase() == "protein" && mh.is_protein()) - { - return true; - } + if let Some(k) = ksize { + if k != mh.ksize() as usize { + return false; + } + }; + + match moltype { + Some(x) => { + if (x.to_lowercase() == "dna" && !mh.is_protein()) + || 
(x.to_lowercase() == "protein" && mh.is_protein()) + { + return true; } - None => return true, // TODO: match previous behavior - }; + } + None => return true, // TODO: match previous behavior }; } Sketch::UKHS(hs) => { - if ksize == 0 || ksize == hs.ksize() as usize { - match moltype { - Some(x) => { - if x.to_lowercase() == "dna" { - return true; - } else { - // TODO: draff only supports dna for now - unimplemented!() - } + if let Some(k) = ksize { + if k != hs.ksize() as usize { + return false; + } + }; + + match moltype { + Some(x) => { + if x.to_lowercase() == "dna" { + return true; + } else { + // TODO: draff only supports dna for now + unimplemented!() } - None => unimplemented!(), - }; - } + } + None => unimplemented!(), + }; } }; false @@ -231,7 +240,7 @@ impl ToWriter for Signature { where W: io::Write, { - match serde_json::to_writer(writer, &self) { + match serde_json::to_writer(writer, &vec![&self]) { Ok(_) => Ok(()), Err(_) => Err(SourmashError::SerdeError.into()), } @@ -267,6 +276,8 @@ impl PartialEq for Signature { if let Sketch::MinHash(other_mh) = &other.signatures[0] { return metadata && (mh == other_mh); } + } else { + unimplemented!() } metadata } @@ -286,7 +297,8 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let sigs = + Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); let _sig_data = sigs[0].clone(); // TODO: check sig_data } diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs index e9e1f3163f..427ccb977d 100644 --- a/src/sketch/minhash.rs +++ b/src/sketch/minhash.rs @@ -17,13 +17,22 @@ use crate::signature::SigsTrait; #[cfg(all(target_arch = "wasm32", target_vendor = "unknown"))] use wasm_bindgen::prelude::*; +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, PartialEq)] +#[repr(u32)] +pub 
enum HashFunctions { + murmur64_DNA = 1, + murmur64_protein = 2, + murmur64_dayhoff = 3, + murmur64_hp = 4, +} + #[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)] #[derive(Debug, Clone, PartialEq)] pub struct KmerMinHash { num: u32, ksize: u32, - is_protein: bool, - dayhoff: bool, + pub(crate) hash_function: HashFunctions, seed: u64, max_hash: u64, pub(crate) mins: Vec, @@ -35,8 +44,7 @@ impl Default for KmerMinHash { KmerMinHash { num: 1000, ksize: 21, - is_protein: false, - dayhoff: false, + hash_function: HashFunctions::murmur64_DNA, seed: 42, max_hash: 0, mins: Vec::with_capacity(1000), @@ -69,10 +77,12 @@ impl Serialize for KmerMinHash { partial.serialize_field( "molecule", - match &self.is_protein { + match &self.is_protein() { true => { - if self.dayhoff { + if self.dayhoff() { "dayhoff" + } else if self.hp() { + "hp" } else { "protein" } @@ -105,7 +115,12 @@ impl<'de> Deserialize<'de> for KmerMinHash { let tmpsig = TempSig::deserialize(deserializer)?; let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; - let molecule = tmpsig.molecule.to_lowercase(); + let hash_function = match tmpsig.molecule.to_lowercase().as_ref() { + "protein" => HashFunctions::murmur64_protein, + "dayhoff" => HashFunctions::murmur64_dayhoff, + "dna" => HashFunctions::murmur64_DNA, + _ => unimplemented!(), // TODO: throw error here + }; Ok(KmerMinHash { num, @@ -114,13 +129,7 @@ impl<'de> Deserialize<'de> for KmerMinHash { max_hash: tmpsig.max_hash, mins: tmpsig.mins, abunds: tmpsig.abundances, - is_protein: match molecule.as_ref() { - "protein" => true, - "dayhoff" => true, - "dna" => false, - _ => unimplemented!(), - }, - dayhoff: molecule == "dayhoff", + hash_function, }) } } @@ -129,8 +138,7 @@ impl KmerMinHash { pub fn new( num: u32, ksize: u32, - is_protein: bool, - dayhoff: bool, + hash_function: HashFunctions, seed: u64, max_hash: u64, track_abundance: bool, @@ -153,8 +161,7 @@ impl KmerMinHash { KmerMinHash { num, ksize, - is_protein, - 
dayhoff, + hash_function, seed, max_hash, mins, @@ -167,7 +174,7 @@ impl KmerMinHash { } pub fn is_protein(&self) -> bool { - self.is_protein + self.hash_function == HashFunctions::murmur64_protein } pub fn seed(&self) -> u64 { @@ -183,8 +190,7 @@ impl KmerMinHash { md5_ctx.consume(self.ksize().to_string()); self.mins .iter() - .map(|x| md5_ctx.consume(x.to_string())) - .count(); + .for_each(|x| md5_ctx.consume(x.to_string())); format!("{:x}", md5_ctx.compute()) } @@ -393,10 +399,7 @@ impl KmerMinHash { pub fn count_common(&self, other: &KmerMinHash) -> Result { self.check_compatible(other)?; - let iter = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; + let iter = Intersection::new(self.mins.iter(), other.mins.iter()); Ok(iter.count() as u64) } @@ -407,8 +410,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.num, self.ksize, - self.is_protein, - self.dayhoff, + self.hash_function, self.seed, self.max_hash, self.abunds.is_some(), @@ -417,18 +419,12 @@ impl KmerMinHash { combined_mh.merge(&self)?; combined_mh.merge(&other)?; - let it1 = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); // TODO: there is probably a way to avoid this Vec here, // and pass the it1 as left in it2. 
let i1: Vec = it1.cloned().collect(); - let it2 = Intersection { - left: i1.iter().peekable(), - right: combined_mh.mins.iter().peekable(), - }; + let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); let common: Vec = it2.cloned().collect(); Ok((common, combined_mh.mins.len() as u64)) @@ -440,8 +436,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.num, self.ksize, - self.is_protein, - self.dayhoff, + self.hash_function, self.seed, self.max_hash, self.abunds.is_some(), @@ -450,18 +445,12 @@ impl KmerMinHash { combined_mh.merge(&self)?; combined_mh.merge(&other)?; - let it1 = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); // TODO: there is probably a way to avoid this Vec here, // and pass the it1 as left in it2. let i1: Vec = it1.cloned().collect(); - let it2 = Intersection { - left: i1.iter().peekable(), - right: combined_mh.mins.iter().peekable(), - }; + let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); Ok((it2.count() as u64, combined_mh.mins.len() as u64)) } @@ -469,14 +458,28 @@ impl KmerMinHash { pub fn compare(&self, other: &KmerMinHash) -> Result { self.check_compatible(other)?; if let Ok((common, size)) = self.intersection_size(other) { - return Ok(common as f64 / u64::max(1, size) as f64); + Ok(common as f64 / u64::max(1, size) as f64) } else { - return Ok(0.0); + Ok(0.0) } } + pub fn containment_ignore_maxhash(&self, other: &KmerMinHash) -> Result { + let it = Intersection::new(self.mins.iter(), other.mins.iter()); + + Ok(it.count() as f64 / self.size() as f64) + } + pub fn dayhoff(&self) -> bool { - self.dayhoff + self.hash_function == HashFunctions::murmur64_dayhoff + } + + pub fn hp(&self) -> bool { + self.hash_function == HashFunctions::murmur64_hp + } + + pub fn hash_function(&self) -> HashFunctions { + self.hash_function } pub fn mins(&self) -> Vec { @@ -501,10 +504,8 @@ impl SigsTrait 
for KmerMinHash { if self.ksize != other.ksize { return Err(SourmashError::MismatchKSizes.into()); } - if self.is_protein != other.is_protein { - return Err(SourmashError::MismatchDNAProt.into()); - } - if self.dayhoff != other.dayhoff { + if self.hash_function != other.hash_function { + // TODO: fix this error return Err(SourmashError::MismatchDNAProt.into()); } if self.max_hash != other.max_hash { @@ -522,7 +523,7 @@ impl SigsTrait for KmerMinHash { .map(|&x| (x as char).to_ascii_uppercase() as u8) .collect(); if sequence.len() >= (self.ksize as usize) { - if !self.is_protein { + if !self.is_protein() { // dna for kmer in sequence.windows(self.ksize as usize) { if _checkdna(kmer) { @@ -551,20 +552,17 @@ impl SigsTrait for KmerMinHash { .skip(i) .take(sequence.len() - i) .collect(); - let aa = to_aa(&substr, self.dayhoff)?; + let aa = to_aa(&substr, self.dayhoff(), self.hp())?; - aa.windows(aa_ksize as usize) - .map(|n| self.add_word(n)) - .count(); + aa.windows(aa_ksize as usize).for_each(|n| self.add_word(n)); let rc_substr: Vec = rc.iter().cloned().skip(i).take(rc.len() - i).collect(); - let aa_rc = to_aa(&rc_substr, self.dayhoff)?; + let aa_rc = to_aa(&rc_substr, self.dayhoff(), self.hp())?; aa_rc .windows(aa_ksize as usize) - .map(|n| self.add_word(n)) - .count(); + .for_each(|n| self.add_word(n)); } } } @@ -573,8 +571,17 @@ impl SigsTrait for KmerMinHash { } struct Intersection> { - left: Peekable, - right: Peekable, + iter: Peekable, + other: Peekable, +} + +impl> Intersection { + pub fn new(left: I, right: I) -> Self { + Intersection { + iter: left.peekable(), + other: right.peekable(), + } + } } impl> Iterator for Intersection { @@ -582,21 +589,21 @@ impl> Iterator for Intersection { fn next(&mut self) -> Option { loop { - let res = match (self.left.peek(), self.right.peek()) { + let res = match (self.iter.peek(), self.other.peek()) { (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), _ => return None, }; match res { Ordering::Less => { 
- self.left.next(); + self.iter.next(); } Ordering::Greater => { - self.right.next(); + self.other.next(); } Ordering::Equal => { - self.right.next(); - return self.left.next(); + self.other.next(); + return self.iter.next(); } } } @@ -685,7 +692,7 @@ lazy_static! { // G ("GGT", b'G'), ("GGC", b'G'), ("GGA", b'G'), ("GGG", b'G'), ("GGN", b'G'), - ].into_iter().cloned().collect() + ].iter().cloned().collect() }; } @@ -730,7 +737,33 @@ lazy_static! { // e (b'F', b'f'), (b'W', b'f'), (b'Y', b'f'), - ].into_iter().cloned().collect() + ].iter().cloned().collect() + }; +} + +// HP Hydrophobic/hydrophilic mapping +// From: Phillips, R., Kondev, J., Theriot, J. (2008). +// Physical Biology of the Cell. New York: Garland Science, Taylor & Francis Group. ISBN: 978-0815341635 + +// +// | Amino acid | HP +// |---------------------------------------|---------| +// | A, F, G, I, L, M, P, V, W, Y | h | +// | N, C, S, T, D, E, R, H, K, Q | p | +lazy_static! { + static ref HPTABLE: HashMap = { + [ + // h + (b'A', b'h'), (b'F', b'h'), (b'G', b'h'), (b'I', b'h'), (b'L', b'h'), + (b'M', b'h'), (b'P', b'h'), (b'V', b'h'), (b'W', b'h'), (b'Y', b'h'), + + // p + (b'N', b'p'), (b'C', b'p'), (b'S', b'p'), (b'T', b'p'), (b'D', b'p'), + (b'E', b'p'), (b'R', b'p'), (b'H', b'p'), (b'K', b'p'), (b'Q', b'p'), + ] + .iter() + .cloned() + .collect() }; } @@ -770,8 +803,15 @@ pub(crate) fn aa_to_dayhoff(aa: u8) -> char { } } +pub(crate) fn aa_to_hp(aa: u8) -> char { + match HPTABLE.get(&aa) { + Some(letter) => *letter as char, + None => 'X', + } +} + #[inline] -fn to_aa(seq: &[u8], dayhoff: bool) -> Result, Error> { +fn to_aa(seq: &[u8], dayhoff: bool, hp: bool) -> Result, Error> { let mut converted: Vec = Vec::with_capacity(seq.len() / 3); for chunk in seq.chunks(3) { @@ -782,6 +822,8 @@ fn to_aa(seq: &[u8], dayhoff: bool) -> Result, Error> { let residue = translate_codon(chunk)?; if dayhoff { converted.push(aa_to_dayhoff(residue) as u8); + } else if hp { + converted.push(aa_to_hp(residue) as u8); 
} else { converted.push(residue); } diff --git a/src/sketch/mod.rs b/src/sketch/mod.rs index 9da8dca613..f02d0f7e1c 100644 --- a/src/sketch/mod.rs +++ b/src/sketch/mod.rs @@ -1,5 +1,6 @@ pub mod minhash; pub mod nodegraph; + pub mod ukhs; use serde_derive::{Deserialize, Serialize}; @@ -11,5 +12,5 @@ use crate::sketch::ukhs::FlatUKHS; #[serde(untagged)] pub enum Sketch { MinHash(KmerMinHash), - UKHS(FlatUKHS), + UKHS(FlatUKHS), // FIXME } diff --git a/src/sketch/nodegraph.rs b/src/sketch/nodegraph.rs index 9d8b52b452..4ae4725c8e 100644 --- a/src/sketch/nodegraph.rs +++ b/src/sketch/nodegraph.rs @@ -7,9 +7,10 @@ use failure::Error; use fixedbitset::FixedBitSet; use primal; +use crate::sketch::minhash::KmerMinHash; use crate::HashIntoType; -#[derive(Debug, Default, Clone, PartialEq)] +#[derive(Debug, Default, Clone)] pub struct Nodegraph { pub(crate) bs: Vec, ksize: usize, @@ -17,6 +18,15 @@ pub struct Nodegraph { unique_kmers: usize, } +// TODO: only checking for the bitset for now, +// since unique_kmers is not saved in a khmer nodegraph +// and occupied_bins also has issues... +impl PartialEq for Nodegraph { + fn eq(&self, other: &Nodegraph) -> bool { + self.bs == other.bs + } +} + impl Nodegraph { pub fn new(tablesizes: &[usize], ksize: usize) -> Nodegraph { let mut bs = Vec::with_capacity(tablesizes.len()); @@ -33,13 +43,24 @@ impl Nodegraph { } pub fn with_tables(tablesize: usize, n_tables: usize, ksize: usize) -> Nodegraph { - // TODO: cache the Sieve somewhere for repeated calls? 
- let tablesizes: Vec = primal::Primes::all() - .filter(|p| *p >= tablesize) - .take(n_tables) - .collect(); + let mut tablesizes = Vec::with_capacity(n_tables); + + let mut i = (tablesize - 1) as u64; + if i % 2 == 0 { + i += 1 + } + + while tablesizes.len() != n_tables { + if primal::is_prime(i) { + tablesizes.push(i as usize); + } + if i == 1 { + break; + } + i -= 2; + } - Nodegraph::new(&tablesizes, ksize) + Nodegraph::new(tablesizes.as_slice(), ksize) } pub fn count(&mut self, hash: HashIntoType) -> bool { @@ -85,23 +106,51 @@ impl Nodegraph { let mut new_bins = 0; for (bs, bs_other) in self.bs.iter_mut().zip(&other.bs) { - bs_other - .ones() - .map(|x| { - if !bs.put(x) { - new_bins += 1; - } - }) - .count(); + bs_other.ones().for_each(|x| { + if !bs.put(x) { + new_bins += 1; + } + }); } // TODO: occupied bins seems to be broken in khmer? I don't get the same // values... - //self.occupied_bins += new_bins; + self.occupied_bins += new_bins; + } + + pub fn expected_collisions(&self) -> f64 { + let min_size = self.bs.iter().map(|x| x.len()).min().unwrap(); + let n_ht = self.bs.len(); + let occupancy = self.occupied_bins; + + let fp_one = occupancy / min_size; + f64::powf(fp_one as f64, n_ht as f64) + } + + pub fn tablesize(&self) -> usize { + self.bs.iter().map(|x| x.len()).sum() + } + + pub fn noccupied(&self) -> usize { + self.occupied_bins + } + + pub fn matches(&self, mh: &KmerMinHash) -> usize { + mh.mins.iter().filter(|x| self.get(**x) == 1).count() + } + + pub fn ntables(&self) -> usize { + self.bs.len() + } + + pub fn ksize(&self) -> usize { + self.ksize } // save pub fn save>(&self, path: P) -> Result<(), Error> { - self.save_to_writer(&mut File::open(path)?)?; + // TODO: if it ends with gz, open a compressed file + // might use get_output here? 
+ self.save_to_writer(&mut File::create(path)?)?; Ok(()) } @@ -143,6 +192,9 @@ impl Nodegraph { where R: io::Read, { + // TODO: see https://github.com/brainstorm/bio-index-formats for an + // example of using nom to parse binary data. + // Use it here instead of byteorder let signature = rdr.read_u32::()?; assert_eq!(signature, 0x4f58_4c49); @@ -238,11 +290,14 @@ impl Nodegraph { #[cfg(test)] mod test { use super::*; + use cfg_if::cfg_if; use std::io::{BufReader, BufWriter}; use std::path::PathBuf; + cfg_if! { + if #[cfg(not(target_arch = "wasm32"))] { use proptest::num::u64; - use proptest::{proptest, proptest_helper}; + use proptest::{proptest}; proptest! { #[test] @@ -252,6 +307,8 @@ mod test { assert_eq!(ng.get(hash), 1); } } + } + } #[test] fn count_and_get_nodegraph() { diff --git a/src/sketch/ukhs.rs b/src/sketch/ukhs.rs index 47d2b4e543..03114631e0 100644 --- a/src/sketch/ukhs.rs +++ b/src/sketch/ukhs.rs @@ -1,3 +1,41 @@ +use failure::Error; +use serde_derive::{Deserialize, Serialize}; + +use crate::signature::SigsTrait; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FlatUKHS {} + +impl FlatUKHS { + pub fn md5sum(&self) -> String { + unimplemented!() + } +} + +impl SigsTrait for FlatUKHS { + fn size(&self) -> usize { + unimplemented!() + } + + fn to_vec(&self) -> Vec { + unimplemented!() + } + + fn ksize(&self) -> usize { + unimplemented!() + } + + fn check_compatible(&self, _other: &Self) -> Result<(), Error> { + unimplemented!() + } + + fn add_sequence(&mut self, _seq: &[u8], _force: bool) -> Result<(), Error> { + unimplemented!() + } +} + +/* FIXME bring back after succint-rs changes + use std::f64::consts::PI; use std::fs::File; use std::hash::BuildHasherDefault; @@ -5,18 +43,13 @@ use std::io::{BufReader, BufWriter, Read, Write}; use std::mem; use std::path::Path; -use failure::Error; use itertools::Itertools; use pdatastructs::hyperloglog::HyperLogLog; -use serde::de::{Deserialize, Deserializer}; -use serde::ser::{Serialize, 
SerializeStruct, Serializer}; -use serde_derive::Deserialize; use ukhs; use crate::errors::SourmashError; use crate::index::sbt::NoHashHasher; use crate::index::storage::ToWriter; -use crate::signature::SigsTrait; use crate::sketch::nodegraph::Nodegraph; #[derive(Clone)] @@ -38,8 +71,7 @@ impl UKHS { md5_ctx.consume(self.ukhs.k().to_string()); self.buckets .iter() - .map(|x| md5_ctx.consume(x.to_string())) - .count(); + .for_each(|x| md5_ctx.consume(x.to_string())); format!("{:x}", md5_ctx.compute()) } } @@ -179,24 +211,22 @@ impl UKHSTrait for UKHS { // this is the cosine distance as defined by scipy //1. - d - /* // This is the weighted Jaccard distance // TODO: don't iterate twice... - let mins: u64 = self - .buckets - .iter() - .zip(other.buckets.iter()) - .map(|(a, b)| u64::min(*a, *b)) - .sum(); - let maxs: u64 = self - .buckets - .iter() - .zip(other.buckets.iter()) - .map(|(a, b)| u64::max(*a, *b)) - .sum(); - - 1. - (mins as f64 / maxs as f64) - */ + //let mins: u64 = self + // .buckets + // .iter() + // .zip(other.buckets.iter()) + // .map(|(a, b)| u64::min(*a, *b)) + // .sum(); + //let maxs: u64 = self + // .buckets + // .iter() + // .zip(other.buckets.iter()) + // .map(|(a, b)| u64::max(*a, *b)) + // .sum(); + // + //1. - (mins as f64 / maxs as f64) } fn to_writer(&self, writer: &mut W) -> Result<(), Error> @@ -232,13 +262,13 @@ impl SigsTrait for UKHS { // TODO: is seq.len() > W? 
let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect(); - /* This one update every unikmer bucket with w_hash - it.into_iter() - .map(|(_, k_hash)| { - self.buckets[self.ukhs.query_bucket(k_hash).unwrap()] += 1; - }) - .count(); - */ + // This one update every unikmer bucket with w_hash + //it.into_iter() + // .map(|(_, k_hash)| { + // self.buckets[self.ukhs.query_bucket(k_hash).unwrap()] += 1; + // }) + // .count(); + // // Only update the bucket for the minimum unikmer found for (_, group) in &it.into_iter().group_by(|(w, _)| *w) { @@ -313,13 +343,12 @@ impl SigsTrait for UKHS { fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error> { let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect(); - /* This one update every unikmer bucket with w_hash - it.into_iter() - .map(|(w_hash, k_hash)| { - self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].count(w_hash); - }) - .count(); - */ + // This one update every unikmer bucket with w_hash + //it.into_iter() + // .map(|(w_hash, k_hash)| { + // self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].count(w_hash); + // }) + // .count(); // Only update the bucket for the minimum unikmer found for (w_hash, group) in &it.into_iter().group_by(|(w, _)| *w) { @@ -412,13 +441,12 @@ impl SigsTrait for UKHS { fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error> { let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect(); - /* This one update every unikmer bucket with w_hash - it.into_iter() - .map(|(w_hash, k_hash)| { - self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].add(&w_hash); - }) - .count(); - */ + // This one update every unikmer bucket with w_hash + //it.into_iter() + // .map(|(w_hash, k_hash)| { + // self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].add(&w_hash); + // }) + // .count(); // Only update the bucket for the minimum unikmer found for (w_hash, group) in &it.into_iter().group_by(|(w, _)| *w) { @@ -516,40 +544,37 @@ where 
// Removed this for now, because calling .into() in these doesn't // transfer all the important information... -/* -impl From for Dataset { - fn from(other: FlatUKHS) -> Dataset { - let data = Lazy::new(); - data.get_or_create(|| other.into()); - - Dataset::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build() - } -} - -impl From for Signature { - fn from(other: FlatUKHS) -> Signature { - Signature::builder() - .hash_function("nthash") // TODO: spec! - .class("draff_signature") // TODO: spec! - .name(Some("draff_file".into())) // TODO: spec! - .signatures(vec![Sketch::UKHS(other)]) - .build() - } -} -*/ +//impl From for Dataset { +// fn from(other: FlatUKHS) -> Dataset { +// let data = Lazy::new(); +// data.get_or_create(|| other.into()); +// +// Dataset::builder() +// .data(Rc::new(data)) +// .filename("") +// .name("") +// .metadata("") +// .storage(None) +// .build() +// } +//} +// +//impl From for Signature { +// fn from(other: FlatUKHS) -> Signature { +// Signature::builder() +// .hash_function("nthash") // TODO: spec! +// .class("draff_signature") // TODO: spec! +// .name(Some("draff_file".into())) // TODO: spec! +// .signatures(vec![Sketch::UKHS(other)]) +// .build() +// } +//} #[cfg(test)] mod test { use std::path::PathBuf; - use bio::io::fasta::Reader; - use ocf::get_input; + use needletail::parse_sequence_path; use super::{FlatUKHS, MemberUKHS, UKHSTrait}; use crate::signature::SigsTrait; @@ -561,13 +586,14 @@ mod test { let mut ukhs = MemberUKHS::new(9, 21).unwrap(); - let (input, _) = get_input(filename.to_str().unwrap()).unwrap(); - let reader = Reader::new(input); - - for record in reader.records() { - let record = record.unwrap(); - ukhs.add_sequence(record.seq(), false).unwrap(); - } + parse_sequence_path( + filename, + |_| {}, + |record| { + ukhs.add_sequence(&record.seq, false).unwrap(); + }, + ) + .expect("error parsing"); // TODO: find test case... 
//assert_eq!(ukhs.to_vec(), [1, 2, 3]); @@ -580,13 +606,14 @@ mod test { let mut ukhs = FlatUKHS::new(9, 21).unwrap(); - let (input, _) = get_input(filename.to_str().unwrap()).unwrap(); - let reader = Reader::new(input); - - for record in reader.records() { - let record = record.unwrap(); - ukhs.add_sequence(record.seq(), false).unwrap(); - } + parse_sequence_path( + filename, + |_| {}, + |record| { + ukhs.add_sequence(&record.seq, false).unwrap(); + }, + ) + .expect("error parsing"); let mut buffer = Vec::new(); ukhs.to_writer(&mut buffer).unwrap(); @@ -602,3 +629,4 @@ mod test { } } } +*/ diff --git a/src/wasm.rs b/src/wasm.rs index 7d7000dbf7..93318bad50 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -3,7 +3,7 @@ use wasm_bindgen::prelude::*; use serde_json; use crate::signature::SigsTrait; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{HashFunctions, KmerMinHash}; #[wasm_bindgen] impl KmerMinHash { @@ -13,6 +13,7 @@ impl KmerMinHash { ksize: u32, is_protein: bool, dayhoff: bool, + hp: bool, seed: u32, scaled: u32, track_abundance: bool, @@ -25,11 +26,22 @@ impl KmerMinHash { u64::max_value() / scaled as u64 }; + // TODO: at most one of (prot, dayhoff, hp) should be true + + let hash_function = if dayhoff { + HashFunctions::murmur64_dayhoff + } else if hp { + HashFunctions::murmur64_hp + } else if is_protein { + HashFunctions::murmur64_protein + } else { + HashFunctions::murmur64_DNA + }; + KmerMinHash::new( num, ksize, - is_protein, - dayhoff, + hash_function, seed as u64, max_hash, track_abundance, @@ -38,7 +50,8 @@ impl KmerMinHash { #[wasm_bindgen] pub fn add_sequence_js(&mut self, buf: &str) { - self.add_sequence(buf.as_bytes(), true); + self.add_sequence(buf.as_bytes(), true) + .expect("Error adding sequence"); } #[wasm_bindgen] diff --git a/tests/minhash.rs b/tests/minhash.rs index 86a4b5909d..af0853e69f 100644 --- a/tests/minhash.rs +++ b/tests/minhash.rs @@ -1,9 +1,9 @@ use sourmash::signature::SigsTrait; -use 
sourmash::sketch::minhash::KmerMinHash; +use sourmash::sketch::minhash::{HashFunctions, KmerMinHash}; #[test] fn throws_error() { - let mut mh = KmerMinHash::new(1, 4, false, false, 42, 0, false); + let mut mh = KmerMinHash::new(1, 4, HashFunctions::murmur64_DNA, 42, 0, false); match mh.add_sequence(b"ATGR", false) { Ok(_) => assert!(false, "R is not a valid DNA character"), @@ -13,8 +13,8 @@ fn throws_error() { #[test] fn merge() { - let mut a = KmerMinHash::new(20, 10, false, false, 42, 0, false); - let mut b = KmerMinHash::new(20, 10, false, false, 42, 0, false); + let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, false); + let mut b = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, false); a.add_sequence(b"TGCCGCCCAGCA", false).unwrap(); b.add_sequence(b"TGCCGCCCAGCA", false).unwrap(); @@ -40,8 +40,8 @@ fn merge() { #[test] fn compare() { - let mut a = KmerMinHash::new(20, 10, false, false, 42, 0, false); - let mut b = KmerMinHash::new(20, 10, false, false, 42, 0, false); + let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, false); + let mut b = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, false); a.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false) .unwrap(); @@ -67,12 +67,12 @@ fn compare() { #[test] fn dayhoff() { - let mut a = KmerMinHash::new(10, 6, true, true, 42, 0, false); - let mut b = KmerMinHash::new(10, 6, true, false, 42, 0, false); + let mut a = KmerMinHash::new(10, 6, HashFunctions::murmur64_dayhoff, 42, 0, false); + let mut b = KmerMinHash::new(10, 6, HashFunctions::murmur64_protein, 42, 0, false); a.add_sequence(b"ACTGAC", false).unwrap(); b.add_sequence(b"ACTGAC", false).unwrap(); - assert_eq!(a.size(), 2); + assert_eq!(a.size(), 1); assert_eq!(b.size(), 2); } diff --git a/tests/smrs_cmd.rs b/tests/smrs_cmd.rs new file mode 100644 index 0000000000..c203099158 --- /dev/null +++ b/tests/smrs_cmd.rs @@ -0,0 +1,139 @@ +use std::fs; +use 
std::process::Command; + +use assert_cmd::prelude::*; +use predicates::prelude::*; +use predicates::str::contains; +use tempfile::TempDir; + +#[test] +fn search() -> Result<(), Box> { + let mut cmd = Command::cargo_bin("smrs")?; + + cmd.arg("search") + .arg("tests/test-data/demo/SRR2060939_1.sig") + .arg("tests/test-data/v5.sbt.json") + .assert() + .success() + .stdout(contains("SRR2060939_1.fastq.gz")) + .stdout(contains("SRR2060939_2.fastq.gz")) + .stdout(contains("SRR2255622_1.fastq.gz")); + + Ok(()) +} + +#[test] +#[ignore] +fn search_only_leaves() -> Result<(), Box> { + let mut cmd = Command::cargo_bin("smrs")?; + + cmd.arg("search") + .arg("tests/test-data/demo/SRR2060939_1.sig") + .arg("tests/test-data/leaves.sbt.json") + .assert() + .success() + .stdout(contains("SRR2060939_1.fastq.gz")) + .stdout(contains("SRR2060939_2.fastq.gz")) + .stdout(contains("SRR2255622_1.fastq.gz")); + + Ok(()) +} + +#[test] +#[ignore] +#[cfg(unix)] +fn compute_index_and_search() -> Result<(), Box> { + let tmp_dir = TempDir::new()?; + fs::copy("tests/test-data/short.fa", tmp_dir.path().join("short.fa"))?; + fs::copy( + "tests/test-data/short2.fa", + tmp_dir.path().join("short2.fa"), + )?; + + assert!(tmp_dir.path().join("short.fa").exists()); + assert!(tmp_dir.path().join("short2.fa").exists()); + + let mut cmd = Command::new("sourmash"); + cmd.arg("compute") + .args(&["short.fa", "short2.fa"]) + .current_dir(&tmp_dir) + .assert() + .success(); + + assert!(tmp_dir.path().join("short.fa.sig").exists()); + assert!(tmp_dir.path().join("short2.fa.sig").exists()); + + let mut cmd = Command::new("sourmash"); + //let mut cmd = Command::cargo_bin("smrs")?; + cmd.arg("index") + .args(&["-k", "31"]) + //.args(&["-o", "zzz.sbt.json"]) + .arg("zzz.sbt.json") + .args(&["short.fa.sig", "short2.fa.sig"]) + .current_dir(&tmp_dir) + .assert() + .success(); + + assert!(tmp_dir.path().join("zzz.sbt.json").exists()); + + let cmds = vec![Command::new("sourmash"), Command::cargo_bin("smrs")?]; + + for 
mut cmd in cmds { + cmd.arg("search") + .args(&["-k", "31"]) + .arg("short.fa.sig") + .arg("zzz.sbt.json") + .current_dir(&tmp_dir) + .assert() + .success() + .stdout(contains("short.fa")) + .stdout(contains("short2.fa")); + } + + Ok(()) +} + +#[test] +#[cfg(unix)] +fn index_and_search() -> Result<(), Box> { + let tmp_dir = TempDir::new()?; + fs::copy( + "tests/test-data/demo/SRR2060939_1.sig", + tmp_dir.path().join("1.sig"), + )?; + fs::copy( + "tests/test-data/demo/SRR2060939_2.sig", + tmp_dir.path().join("2.sig"), + )?; + + assert!(tmp_dir.path().join("1.sig").exists()); + assert!(tmp_dir.path().join("2.sig").exists()); + + let mut cmd = Command::cargo_bin("smrs")?; + cmd.arg("index") + .args(&["-k", "31"]) + .args(&["-o", "zzz.sbt.json"]) + .args(&["1.sig", "2.sig"]) + .current_dir(&tmp_dir) + .assert() + .success(); + + assert!(tmp_dir.path().join("zzz.sbt.json").exists()); + + let cmds = vec![Command::new("sourmash"), Command::cargo_bin("smrs")?]; + + for mut cmd in cmds { + cmd.arg("search") + .args(&["-k", "31"]) + .arg("1.sig") + .arg("zzz.sbt.json") + .current_dir(&tmp_dir) + .assert() + .success() + .stdout(contains("2 matches:")) + .stdout(contains("SRR2060939_1.fastq.gz")) + .stdout(contains("SRR2060939_2.fastq.gz")); + } + + Ok(()) +} diff --git a/tox.ini b/tox.ini index cca92bcccd..a54618c780 100644 --- a/tox.ini +++ b/tox.ini @@ -4,14 +4,14 @@ envlist=py27,py35,py36,py37 [testenv] passenv = CI TRAVIS TRAVIS_* whitelist_externals= - make + make +extras = + test + doc + 10x + storage deps= - codecov - ipfshttpclient - redis - bam2fasta + codecov commands= - pip install -r requirements.txt - pip install -e .[test] - make coverage - codecov --gcov-glob third-party + make coverage + codecov --gcov-glob third-party From 3febb3fa69d5adffcc2cf51ca221753f57d580d2 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 17 Dec 2019 05:09:40 +0000 Subject: [PATCH 05/10] Replacing C++ with Rust (#424) - Update build system and CI - Add Rust install instructions to 
docs - Remove dependency on Cython (replaced with cffi and milksnake) - Move _minhash.pyx to _minhash.py, and remove Cython bits - Add convenience functions and classes to work with Rust layer - Remove third-party/ directory --- .coveragerc | 1 - .gitignore | 10 +- .travis.yml | 35 +- MANIFEST.in | 10 +- Makefile | 12 +- README.md | 19 +- doc/developer.md | 15 +- netlify.toml | 12 + setup.py | 82 ++-- sourmash/__init__.py | 16 +- sourmash/_compat.py | 24 ++ sourmash/_minhash.pxd | 70 --- sourmash/_minhash.py | 527 ++++++++++++++++++++++ sourmash/_minhash.pyx | 512 ---------------------- sourmash/exceptions.py | 44 ++ sourmash/kmer_min_hash.hh | 648 ---------------------------- sourmash/sbtmh.py | 5 +- sourmash/sig/__main__.py | 5 +- sourmash/sourmash_args.py | 5 +- sourmash/utils.py | 78 ++++ src/bin/smrs.rs | 8 + src/ffi/minhash.rs | 2 +- src/ffi/signature.rs | 17 +- src/index/bigsi.rs | 10 +- src/index/sbt/mhbt.rs | 19 +- src/signature.rs | 19 +- src/sketch/minhash.rs | 29 +- tests/minhash.rs | 2 +- tests/smrs_cmd.rs | 1 - tests/test__minhash.py | 97 ++++- tests/test_rustobj.py | 18 + tests/test_sourmash.py | 2 +- third-party/.gitignore | 6 - third-party/smhasher/MurmurHash3.cc | 340 --------------- third-party/smhasher/MurmurHash3.h | 37 -- 35 files changed, 982 insertions(+), 1755 deletions(-) create mode 100644 netlify.toml create mode 100644 sourmash/_compat.py delete mode 100644 sourmash/_minhash.pxd create mode 100644 sourmash/_minhash.py delete mode 100644 sourmash/_minhash.pyx create mode 100644 sourmash/exceptions.py delete mode 100644 sourmash/kmer_min_hash.hh create mode 100644 sourmash/utils.py create mode 100644 tests/test_rustobj.py delete mode 100644 third-party/.gitignore delete mode 100644 third-party/smhasher/MurmurHash3.cc delete mode 100644 third-party/smhasher/MurmurHash3.h diff --git a/.coveragerc b/.coveragerc index d284e1014c..09b80d8275 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,6 +6,5 @@ omit = doc/conf.py setup.py tests/* - 
third-party/smhasher/MurmurHash3.cc .tox/* benchmarks/* diff --git a/.gitignore b/.gitignore index 0aa0e7a6f1..0b797c7bf5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,14 +13,9 @@ dist build sourmash.egg-info .ipynb_checkpoints -_minhash.so .cache *.so .coverage -sourmash_lib/_minhash.cpp -sourmash/_minhash.cpp -.asv/ -.eggs/ .pytest_cache .python-version sourmash/version.py @@ -30,6 +25,9 @@ sourmash/_lowlevel*.py .env Pipfile Pipfile.lock -ocf/target/ target/ Cargo.lock +.eggs +.asv +pkg/ +wasm-pack.log diff --git a/.travis.yml b/.travis.yml index 937b4b6c0b..7483db938e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,16 +7,18 @@ cache: - "$HOME/.cache/pip" - "$HOME/.cargo" - "target" - - ".tox" branches: only: - master - "/^v.*$/" -script: tox +script: tox -vv -install: pip install tox-travis +install: + - source .travis/install_cargo.sh + +before_script: pip install tox-travis jobs: allow_failures: @@ -28,12 +30,14 @@ jobs: - &test stage: test - python: 2.7 + python: 3.6 - <<: *test os: osx + osx_image: xcode10.1 + python: 3.7 language: generic env: - - TOXENV=py36 + - TOXENV=py37 - <<: *test python: 3.7 name: integration (ipfs/redis) @@ -45,7 +49,7 @@ jobs: - redis-server - docker - <<: *test - python: 3.6 + python: 2.7 - <<: *test python: 3.5 @@ -55,12 +59,16 @@ jobs: services: - docker env: - - PIP=pip + - CIBW_BUILD='cp37-*' - CIBW_SKIP='*-manylinux_i686' - install: skip + - CIBW_BEFORE_BUILD='source .travis/install_cargo.sh' + - CIBW_ENVIRONMENT='PATH="$HOME/.cargo/bin:$PATH"' + - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11' + before_script: skip script: - - sudo $PIP install cibuildwheel==1.0.0 - - cibuildwheel --output-dir wheelhouse + - python -m pip install -U pip setuptools + - python -m pip install cibuildwheel==1.1.0 + - python -m cibuildwheel --output-dir wheelhouse deploy: provider: releases api_key: @@ -73,12 +81,7 @@ jobs: - <<: *wheel os: osx osx_image: xcode10.1 - language: generic - before_script: - - sudo $PIP install -U pip 
setuptools - env: - - PIP=pip2 - - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11' + language: shell stages: - check diff --git a/MANIFEST.in b/MANIFEST.in index 69df9b2280..1d206dc896 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,16 @@ include LICENSE Makefile Dockerfile LICENSE Makefile README.md requirements.txt include index.ipynb +include sourmash VERSION recursive-include sourmash_lib * recursive-include sourmash * -recursive-include third-party *.cc *.h -exclude tests/* +recursive-include src *.rs +recursive-include benches *.rs +include Cargo.toml +include include/sourmash.h +prune .eggs +global-exclude *.rlib global-exclude *.orig global-exclude *.pyc global-exclude *.so prune tests/test-data/ +global-exclude *.git/ diff --git a/Makefile b/Makefile index 8bb87d29dd..559ad81cbc 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,13 @@ PYTHON ?= python -all: - $(PYTHON) setup.py build_ext -i +all: build .PHONY: +build: + $(PYTHON) setup.py build_ext -i + cargo build + clean: $(PYTHON) setup.py clean --all rm -f sourmash/*.so @@ -19,6 +22,7 @@ dist: FORCE test: all pip install -e '.[test]' $(PYTHON) -m pytest + cargo test doc: .PHONY cd doc && make html @@ -29,12 +33,12 @@ include/sourmash.h: src/lib.rs src/ffi/minhash.rs src/ffi/signature.rs src/ffi/n rustup override set stable coverage: all - $(PYTHON) setup.py clean --all - SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i + $(PYTHON) setup.py build_ext -i $(PYTHON) -m pytest --cov=. 
--cov-report term-missing benchmark: asv continuous master `git rev-parse HEAD` + cargo bench check: cargo build diff --git a/README.md b/README.md index e2d6619cb3..b1297dade1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,19 @@ + + # sourmash [![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/) [![Build Status](https://travis-ci.com/dib-lab/sourmash.svg?branch=master)](https://travis-ci.com/dib-lab/sourmash) +PyPI [![codecov](https://codecov.io/gh/dib-lab/sourmash/branch/master/graph/badge.svg)](https://codecov.io/gh/dib-lab/sourmash) [![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027) +License: 3-Clause BSD + +🦀 +[![](http://meritbadge.herokuapp.com/sourmash)](https://crates.io/crates/sourmash) +[![Rust API Documentation on docs.rs](https://docs.rs/sourmash/badge.svg)](https://docs.rs/sourmash) + +--- Compute MinHash signatures for nucleotide (DNA/RNA) and protein sequences. @@ -13,7 +23,7 @@ Usage: sourmash compare *.sig -o distances sourmash plot distances -Sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`):. +sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`):. ---- @@ -48,9 +58,10 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t ### Requirements sourmash runs under both Python 2.7.x and Python 3.5+. The base -requirements are screed and ijson, together with a C++ development -environment and the CPython development headers and libraries (for the -C++ extension). +requirements are screed and ijson, together with a Rust environment (for the +extension code). 
We suggest using `rustup` to install the Rust environment: + + curl https://sh.rustup.rs -sSf | sh The comparison code (`sourmash compare`) uses numpy, and the plotting code uses matplotlib and scipy, but most of the code is usable without diff --git a/doc/developer.md b/doc/developer.md index cd4ab13847..79f2411e9f 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -7,7 +7,13 @@ You can get the latest development master branch with: ``` git clone https://github.com/dib-lab/sourmash.git ``` -To install all of the necessary dependencies, do: +sourmash runs under both Python 2.7.x and Python 3.5+. The base +requirements are screed and ijson, together with a Rust environment (for the +extension code). We suggest using `rustup` to install the Rust environment: + + curl https://sh.rustup.rs -sSf | sh + +To install all of the necessary Python dependencies, do: ``` pip install -r requirements.txt ``` @@ -25,13 +31,6 @@ pip install -e . We use [Travis][0] for continuous integration. -Code coverage calculation is enabled (on Linux only) by running -`make coverage`. This recompiles the C++ extension without -optimization and with coverage configured. See `setup.py` for -more information on this; the environment variable -`SOURMASH_COVERAGE` controls whether the C++ extension is -compiled with code coverage analysis enabled. - Code coverage can be viewed interactively at [codecov.io][1]. [0]:https://travis-ci.org/dib-lab/sourmash diff --git a/netlify.toml b/netlify.toml new file mode 100644 index 0000000000..994c94d665 --- /dev/null +++ b/netlify.toml @@ -0,0 +1,12 @@ +# Configuration for pull request documentation previews via Netlify + +[build] +publish = "_build/html" +base = "doc" +command = ''' + cd .. 
&& \ + curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + source $HOME/.cargo/env && \ + pip install -e .[doc] && \ + make doc +''' diff --git a/setup.py b/setup.py index 1f1720821d..2850339eda 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,32 @@ from __future__ import print_function -import sys -from setuptools import setup, find_packages -from setuptools import Extension import os +from setuptools import setup, find_packages +import sys + + +DEBUG_BUILD = os.environ.get("SOURMASH_DEBUG") == "1" + + +def build_native(spec): + cmd = ["cargo", "build", "--lib"] + + target = "debug" + if not DEBUG_BUILD: + cmd.append("--release") + target = "release" + + build = spec.add_external_build(cmd=cmd, path=".") + + rtld_flags = ["NOW"] + if sys.platform == "darwin": + rtld_flags.append("NODELETE") + spec.add_cffi_module( + module_path="sourmash._lowlevel", + dylib=lambda: build.find_dylib("sourmash", in_path="target/%s" % target), + header_filename=lambda: build.find_header("sourmash.h", in_path="include"), + rtld_flags=rtld_flags, + ) -EXTRA_COMPILE_ARGS = ['-std=c++11', '-pedantic'] -EXTRA_LINK_ARGS=[] CLASSIFIERS = [ "Environment :: Console", @@ -15,7 +36,7 @@ "Natural Language :: English", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", - "Programming Language :: C++", + "Programming Language :: Rust", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", @@ -24,24 +45,10 @@ CLASSIFIERS.append("Development Status :: 5 - Production/Stable") -if sys.platform == 'darwin': # Mac OS X? 
- # force 64bit only builds - EXTRA_COMPILE_ARGS.extend(['-arch', 'x86_64', '-mmacosx-version-min=10.7', - '-stdlib=libc++']) - -else: # ...likely Linux - if os.environ.get('SOURMASH_COVERAGE'): - print('Turning on coverage analysis.') - EXTRA_COMPILE_ARGS.extend(['-g', '--coverage', '-lgcov']) - EXTRA_LINK_ARGS.extend(['--coverage', '-lgcov']) - else: - EXTRA_COMPILE_ARGS.append('-O3') - -with open('README.md', 'r') as readme: +with open("README.md", "r") as readme: LONG_DESCRIPTION = readme.read() -SETUP_METADATA = \ - { +SETUP_METADATA = { "name": "sourmash", "description": "tools for comparing DNA sequences with MinHash sketches", "long_description": LONG_DESCRIPTION, @@ -55,20 +62,18 @@ 'sourmash = sourmash.__main__:main' ] }, - "ext_modules": [Extension("sourmash._minhash", - sources=["sourmash/_minhash.pyx", - "third-party/smhasher/MurmurHash3.cc"], - depends=["sourmash/kmer_min_hash.hh"], - include_dirs=["./sourmash", - "./third-party/smhasher/"], - language="c++", - extra_compile_args=EXTRA_COMPILE_ARGS, - extra_link_args=EXTRA_LINK_ARGS)], "install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', + "cffi", 'matplotlib', 'scipy', "deprecation>=2.0.6"], - "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0", - 'setuptools_scm', 'setuptools_scm_git_archive'], + "setup_requires": [ + "setuptools>=38.6.0", + "milksnake", + "setuptools_scm", + "setuptools_scm_git_archive", + ], "use_scm_version": {"write_to": "sourmash/version.py"}, + "zip_safe": False, + "platforms": "any", "extras_require": { 'test' : ['pytest', 'pytest-cov'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], @@ -76,13 +81,10 @@ "sphinxcontrib-napoleon", "nbsphinx"], '10x': ['bam2fasta==1.0.1'], 'storage': ["ipfshttpclient", "redis"] - }, - "include_package_data": True, - "package_data": { - "sourmash": ['*.pxd'] }, - "classifiers": CLASSIFIERS - } + "include_package_data": True, + "classifiers": CLASSIFIERS, + "milksnake_tasks": [build_native], +} 
setup(**SETUP_METADATA) - diff --git a/sourmash/__init__.py b/sourmash/__init__.py index 2893023d2a..319ccd1e75 100644 --- a/sourmash/__init__.py +++ b/sourmash/__init__.py @@ -7,12 +7,21 @@ import math import os -from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash) +from ._lowlevel import ffi, lib + +ffi.init_once(lib.sourmash_init, "init") + +from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash + DEFAULT_SEED = get_minhash_default_seed() MAX_HASH = get_minhash_max_hash() -from .signature import (load_signatures, load_one_signature, SourmashSignature, - save_signatures) +from .signature import ( + load_signatures, + load_one_signature, + SourmashSignature, + save_signatures, +) from .sbtmh import load_sbt_index, search_sbt_index, create_sbt_index from . import lca from . import sbt @@ -21,6 +30,7 @@ from . import signature from pkg_resources import get_distribution, DistributionNotFound + try: VERSION = get_distribution(__name__).version except DistributionNotFound: # pragma: no cover diff --git a/sourmash/_compat.py b/sourmash/_compat.py new file mode 100644 index 0000000000..86b4e97f98 --- /dev/null +++ b/sourmash/_compat.py @@ -0,0 +1,24 @@ +import sys + + +PY2 = sys.version_info[0] == 2 + +if PY2: + text_type = unicode + int_types = (int, long) + string_types = (str, unicode) + range_type = xrange + itervalues = lambda x: x.itervalues() + NUL = '\x00' + def implements_to_string(cls): + cls.__unicode__ = cls.__str__ + cls.__str__ = lambda x: x.__unicode__().encode('utf-8') + return cls +else: + text_type = str + int_types = (int,) + string_types = (str,) + range_type = range + itervalues = lambda x: x.values() + NUL = 0 + implements_to_string = lambda x: x diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd deleted file mode 100644 index a5aa1f4e04..0000000000 --- a/sourmash/_minhash.pxd +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: UTF-8 -*- -# cython: language_level=3, c_string_type=str, 
c_string_encoding=ascii - -from __future__ import unicode_literals - -from libcpp cimport bool -from libcpp.map cimport map -from libcpp.memory cimport unique_ptr -from libcpp.set cimport set as cppset -from libcpp.string cimport string -from libc.stdint cimport uint32_t, uint64_t -from libcpp.vector cimport vector - - -cdef extern from "kmer_min_hash.hh": - ctypedef uint64_t HashIntoType; - ctypedef vector[HashIntoType] CMinHashType; - - - cdef uint64_t _hash_murmur(const string, uint32_t seed) - cdef uint64_t _hash_murmur(const char *, unsigned int, uint32_t) - - cdef cppclass KmerMinHash: - const uint32_t seed; - const unsigned int num; - const unsigned int ksize; - const bool is_protein; - const bool dayhoff; - const bool hp; - const HashIntoType max_hash; - CMinHashType mins; - - KmerMinHash(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType) - void add_hash(HashIntoType) except +ValueError - void remove_hash(HashIntoType) except +ValueError - void add_word(const string& word) except +ValueError - void add_word(const char * word) except +ValueError - void add_sequence(const string&, bool) except +ValueError - void merge(const KmerMinHash&) except +ValueError - string aa_to_dayhoff(string aa) except +ValueError - string aa_to_hp(string aa) except +ValueError - string translate_codon(string codon) except +ValueError - unsigned int count_common(const KmerMinHash&) except +ValueError - unsigned long size() - - - cdef cppclass KmerMinAbundance(KmerMinHash): - CMinHashType abunds; - - KmerMinAbundance(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType) - void add_hash(HashIntoType) except +ValueError - void remove_hash(HashIntoType) except +ValueError - void add_word(string word) except +ValueError - void add_word(const char * word) except +ValueError - void add_sequence(const string&, bool) except +ValueError - void merge(const KmerMinAbundance&) except +ValueError - void merge(const KmerMinHash&) except +ValueError - string 
aa_to_dayhoff(string aa) except +ValueError - string aa_to_hp(string aa) except +ValueError - string translate_codon(string codon) except +ValueError - unsigned int count_common(const KmerMinAbundance&) except +ValueError - unsigned long size() - - -cdef class MinHash(object): - cdef unique_ptr[KmerMinHash] _this - cdef bool _track_abundance - - cpdef get_mins(self, bool with_abundance=*) - cpdef set_abundances(self, dict) diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py new file mode 100644 index 0000000000..2557fd218b --- /dev/null +++ b/sourmash/_minhash.py @@ -0,0 +1,527 @@ +# -*- coding: UTF-8 -*- +from __future__ import unicode_literals, division + +import math +import copy + +from ._compat import string_types, range_type +from ._lowlevel import ffi, lib +from .utils import RustObject, rustcall, decode_str +from .exceptions import SourmashError + +# default MurmurHash seed +MINHASH_DEFAULT_SEED = 42 + + +def get_minhash_default_seed(): + return MINHASH_DEFAULT_SEED + + +# we use the 64-bit hash space of MurmurHash only +# this is 2 ** 64 - 1 in hexadecimal +MINHASH_MAX_HASH = 0xFFFFFFFFFFFFFFFF + + +def get_minhash_max_hash(): + return MINHASH_MAX_HASH + + +def get_max_hash_for_scaled(scaled): + if scaled == 0: + return 0 + elif scaled == 1: + return get_minhash_max_hash() + + return int(round(get_minhash_max_hash() / scaled, 0)) + + +def get_scaled_for_max_hash(max_hash): + if max_hash == 0: + return 0 + return int(round(get_minhash_max_hash() / max_hash, 0)) + + +def to_bytes(s): + # Allow for strings, bytes or int + # Single item of byte string = int + + if isinstance(s, bytes): + return s + + if not isinstance(s, string_types + (bytes, int)): + raise TypeError("Requires a string-like sequence") + + if isinstance(s, string_types): + s = s.encode("utf-8") + elif isinstance(s, int): + s = bytes([s]) + + return s + + +def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): + "hash_murmur(string, [,seed])\n\n" + "Compute a hash for a string, optionally using 
a seed (an integer). " + "The current default seed is returned by hash_seed()." + + return lib.hash_murmur(to_bytes(kmer), seed) + + +def dotproduct(a, b, normalize=True): + """ + Compute the dot product of two dictionaries {k: v} where v is + abundance. + """ + + if normalize: + norm_a = math.sqrt(sum([x * x for x in a.values()])) + norm_b = math.sqrt(sum([x * x for x in b.values()])) + + if norm_a == 0.0 or norm_b == 0.0: + return 0.0 + else: + norm_a = 1.0 + norm_b = 1.0 + + prod = 0.0 + for k, abundance in a.items(): + prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b) + + return prod + + +class MinHash(RustObject): + def __init__( + self, + n, + ksize, + is_protein=False, + dayhoff=False, + hp=False, + track_abundance=False, + seed=MINHASH_DEFAULT_SEED, + max_hash=0, + mins=None, + scaled=0, + ): + if max_hash and scaled: + raise ValueError("cannot set both max_hash and scaled") + elif scaled: + max_hash = get_max_hash_for_scaled(scaled) + + if max_hash and n: + raise ValueError("cannot set both n and max_hash") + + if not n and not (max_hash or scaled): + raise ValueError("cannot omit both n and scaled") + + if dayhoff or hp: + is_protein = False + + self._objptr = lib.kmerminhash_new( + n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance + ) + self.__dealloc_func__ = lib.kmerminhash_free + + if mins: + if track_abundance: + self.set_abundances(mins) + else: + self.add_many(mins) + + def __copy__(self): + a = MinHash( + self.num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=self.track_abundance, + seed=self.seed, + max_hash=self.max_hash, + ) + a.merge(self) + return a + + def __getstate__(self): # enable pickling + return ( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.get_mins(with_abundance=self.track_abundance), + None, + self.track_abundance, + self.max_hash, + self.seed, + ) + + def __setstate__(self, tup): + (n, ksize, is_protein, 
dayhoff, hp, mins, _, track_abundance, max_hash, seed) = tup + + self.__del__() + self._objptr = lib.kmerminhash_new( + n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance + ) + if track_abundance: + self.set_abundances(mins) + else: + self.add_many(mins) + + def __reduce__(self): + return ( + MinHash, + ( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + self.max_hash, + self.get_mins(with_abundance=self.track_abundance), + 0, + ), + ) + + def __eq__(self, other): + return self.__getstate__() == other.__getstate__() + + def copy_and_clear(self): + a = MinHash( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + self.max_hash, + ) + return a + + def add_sequence(self, sequence, force=False): + self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) + + def add(self, kmer): + "Add kmer into sketch." + self.add_sequence(kmer) + + def add_many(self, hashes): + "Add many hashes in at once." + if isinstance(hashes, MinHash): + self._methodcall(lib.kmerminhash_add_from, hashes._objptr) + else: + for hash in hashes: + self._methodcall(lib.kmerminhash_add_hash, hash) + + def remove_many(self, hashes): + "Add many hashes in at once." + self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) + + def update(self, other): + "Update this estimator from all the hashes from the other." 
+ self.add_many(other) + + def __len__(self): + return self._methodcall(lib.kmerminhash_get_mins_size) + + def get_mins(self, with_abundance=False): + size = self._methodcall(lib.kmerminhash_get_mins_size) + mins_ptr = self._methodcall(lib.kmerminhash_get_mins) + + if with_abundance and self.track_abundance: + abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds) + return dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + else: + return ffi.unpack(mins_ptr, size) + + def get_hashes(self): + return self.get_mins() + + def subtract_mins(self, other): + a = set(self.get_mins()) + b = set(other.get_mins()) + return a - b + + @property + def seed(self): + return self._methodcall(lib.kmerminhash_seed) + + @property + def num(self): + return self._methodcall(lib.kmerminhash_num) + + @property + def scaled(self): + if self.max_hash: + return get_scaled_for_max_hash(self.max_hash) + return 0 + + @property + def is_dna(self): + return not (self.is_protein or self.dayhoff or self.hp) + + @property + def is_protein(self): + return self._methodcall(lib.kmerminhash_is_protein) + + @property + def dayhoff(self): + return self._methodcall(lib.kmerminhash_dayhoff) + + @property + def hp(self): + return self._methodcall(lib.kmerminhash_hp) + + @property + def ksize(self): + return self._methodcall(lib.kmerminhash_ksize) + + @property + def max_hash(self): + return self._methodcall(lib.kmerminhash_max_hash) + + @property + def track_abundance(self): + return self._methodcall(lib.kmerminhash_track_abundance) + + @track_abundance.setter + def track_abundance(self, b): + if self.track_abundance == b: + return + + if b is False: + self._methodcall(lib.kmerminhash_disable_abundance) + elif len(self) > 0: + raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") + else: + self._methodcall(lib.kmerminhash_enable_abundance) + + def add_hash(self, h): + return self._methodcall(lib.kmerminhash_add_hash, h) + + def translate_codon(self, codon): + 
try: + return rustcall(lib.sourmash_translate_codon, + to_bytes(codon)).decode('utf-8') + except SourmashError as e: + raise ValueError(e.message) + + def count_common(self, other): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + return self._methodcall(lib.kmerminhash_count_common, other._get_objptr()) + + def downsample_n(self, new_num): + if self.num and self.num < new_num: + raise ValueError("new sample n is higher than current sample n") + + a = MinHash( + new_num, self.ksize, self.is_protein, self.dayhoff, self.hp, self.track_abundance, self.seed, 0 + ) + if self.track_abundance: + a.set_abundances(self.get_mins(with_abundance=True)) + else: + a.add_many(self) + + return a + + def downsample_max_hash(self, *others): + max_hashes = [x.max_hash for x in others] + new_max_hash = min(self.max_hash, *max_hashes) + new_scaled = get_scaled_for_max_hash(new_max_hash) + + return self.downsample_scaled(new_scaled) + + def downsample_scaled(self, new_num): + if self.num: + raise ValueError("num != 0 - cannot downsample a standard MinHash") + + max_hash = self.max_hash + if max_hash is None: + raise ValueError("no max_hash available - cannot downsample") + + old_scaled = get_scaled_for_max_hash(self.max_hash) + if old_scaled > new_num: + raise ValueError( + "new scaled {} is lower than current sample scaled {}".format( + new_num, old_scaled + ) + ) + + new_max_hash = get_max_hash_for_scaled(new_num) + + a = MinHash( + 0, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + new_max_hash, + ) + if self.track_abundance: + a.set_abundances(self.get_mins(with_abundance=True)) + else: + a.add_many(self) + + return a + + def intersection(self, other, in_common=False): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + + if self.num != other.num: + err = "must have same num: {} != {}".format(self.num, other.num) + raise TypeError(err) + + if in_common: + # TODO: copy from 
buffer to Python land instead, + # this way involves more moving data around. + combined_mh = self.copy_and_clear() + combined_mh.merge(self) + combined_mh.merge(other) + + size = len(combined_mh) + common = set(self.get_mins()) + common.intersection_update(other.get_mins()) + common.intersection_update(combined_mh.get_mins()) + else: + size = self._methodcall(lib.kmerminhash_intersection, other._get_objptr()) + common = set() + + return common, max(size, 1) + + def compare(self, other): + if self.num != other.num: + err = "must have same num: {} != {}".format(self.num, other.num) + raise TypeError(err) + return self._methodcall(lib.kmerminhash_compare, other._get_objptr()) + + def jaccard(self, other): + return self.compare(other) + + def similarity(self, other, ignore_abundance=False): + """Calculate similarity of two sketches. + + If the sketches are not abundance weighted, or ignore_abundance=True, + compute Jaccard similarity. + + If the sketches are abundance weighted, calculate a distance metric + based on the cosine similarity. + + Note, because the term frequencies (tf-idf weights) cannot be negative, + the angle will never be < 0deg or > 90deg. + + See https://en.wikipedia.org/wiki/Cosine_similarity + """ + + # if either signature is flat, calculate Jaccard only. + if not (self.track_abundance and other.track_abundance) or ignore_abundance: + return self.jaccard(other) + else: + # can we merge? if not, raise exception. + aa = copy.copy(self) + aa.merge(other) + + a = self.get_mins(with_abundance=True) + b = other.get_mins(with_abundance=True) + + prod = dotproduct(a, b) + prod = min(1.0, prod) + + distance = 2 * math.acos(prod) / math.pi + return 1.0 - distance + + def contained_by(self, other): + """\ + Calculate how much of self is contained by other. 
+ """ + if not len(self): + return 0.0 + + return self.count_common(other) / len(self) + + def containment_ignore_maxhash(self, other): + if len(self) == 0: + return 0.0 + + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + + return self._methodcall(lib.kmerminhash_containment_ignore_maxhash, other._get_objptr()) + + def __iadd__(self, other): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + self._methodcall(lib.kmerminhash_merge, other._get_objptr()) + return self + + merge = __iadd__ + + def set_abundances(self, values): + if self.track_abundance: + added = 0 + + for k, v in sorted(values.items()): + if not self.max_hash or k <= self.max_hash: + self._methodcall(lib.kmerminhash_mins_push, k) + self._methodcall(lib.kmerminhash_abunds_push, v) + added += 1 + if self.num > 0 and added >= self.num: + break + else: + raise RuntimeError( + "Use track_abundance=True when constructing " + "the MinHash to use set_abundances." + ) + + def add_protein(self, sequence): + ksize = self.ksize // 3 + if len(sequence) < ksize: + return + + aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) + if self.is_protein: + for aa_kmer in aa_kmers: + self._methodcall( + lib.kmerminhash_add_word, to_bytes(aa_kmer) + ) + elif self.dayhoff: + for aa_kmer in aa_kmers: + dayhoff_kmer = '' + for aa in aa_kmer: + data = rustcall(lib.sourmash_aa_to_dayhoff, to_bytes(aa)) + dayhoff_letter = data.decode('utf-8') + dayhoff_kmer += dayhoff_letter + self._methodcall( + lib.kmerminhash_add_word, to_bytes(dayhoff_kmer) + ) + elif self.hp: + for aa_kmer in aa_kmers: + hp_kmer = '' + for aa in aa_kmer: + data = rustcall(lib.sourmash_aa_to_hp, to_bytes(aa)) + hp_letter = data.decode('utf-8') + hp_kmer += hp_letter + self._methodcall( + lib.kmerminhash_add_word, to_bytes(hp_kmer) + ) + else: + raise ValueError("Invalid protein type") + + def is_molecule_type(self, molecule): + if self.is_protein and molecule == 'protein': + 
return True + elif self.dayhoff and molecule == 'dayhoff': + return True + elif self.hp and molecule == 'hp': + return True + elif molecule.upper() == "DNA" and self.is_dna: + return True + + return False diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx deleted file mode 100644 index 66d6a357ee..0000000000 --- a/sourmash/_minhash.pyx +++ /dev/null @@ -1,512 +0,0 @@ -# -*- coding: UTF-8 -*- -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii - -from __future__ import unicode_literals - -from cython.operator cimport dereference as deref, address - -from libcpp cimport bool -from libc.stdint cimport uint32_t - -from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur -import math -import copy - - -# default MurmurHash seed -cdef uint32_t MINHASH_DEFAULT_SEED = 42 - - -def get_minhash_default_seed(): - return MINHASH_DEFAULT_SEED - - -# we use the 64-bit hash space of MurmurHash only -cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1 - - -def get_minhash_max_hash(): - return MINHASH_MAX_HASH - - -def get_max_hash_for_scaled(scaled): - if scaled == 0: - return 0 - elif scaled == 1: - return get_minhash_max_hash() - - return int(round(get_minhash_max_hash() / scaled, 0)) - - -def get_scaled_for_max_hash(max_hash): - if max_hash == 0: - return 0 - return int(round(get_minhash_max_hash() / max_hash, 0)) - - -cdef bytes to_bytes(s): - # Allow for strings, bytes or int - # Single item of byte string = int - if not isinstance(s, (basestring, bytes, int)): - raise TypeError("Requires a string-like sequence") - - if isinstance(s, unicode): - s = s.encode('utf-8') - if isinstance(s, int): - s = bytes([s]) - return s - - -def hash_murmur(kmer, uint32_t seed=MINHASH_DEFAULT_SEED): - "hash_murmur(string, [,seed])\n\n" - "Compute a hash for a string, optionally using a seed (an integer). " - "The current default seed is returned by hash_seed()." 
- - return _hash_murmur(to_bytes(kmer), seed) - - -def dotproduct(a, b, normalize=True): - """ - Compute the dot product of two dictionaries {k: v} where v is - abundance. - """ - - if normalize: - norm_a = math.sqrt(sum([ x*x for x in a.values() ])) - norm_b = math.sqrt(sum([ x*x for x in b.values() ])) - - if norm_a == 0.0 or norm_b == 0.0: - return 0.0 - else: - norm_a = 1.0 - norm_b = 1.0 - - prod = 0. - for k, abundance in a.items(): - prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b) - - return prod - - -cdef class MinHash(object): - - def __init__(self, unsigned int n, unsigned int ksize, - bool is_protein=False, - bool dayhoff=False, - bool hp=False, - bool track_abundance=False, - uint32_t seed=MINHASH_DEFAULT_SEED, - HashIntoType max_hash=0, - mins=None, HashIntoType scaled=0): - self._track_abundance = track_abundance - - if max_hash and scaled: - raise ValueError('cannot set both max_hash and scaled') - elif scaled: - max_hash = get_max_hash_for_scaled(scaled) - - if max_hash and n: - raise ValueError('cannot set both n and max_hash') - - if not n and not (max_hash or scaled): - raise ValueError("cannot omit both n and scaled") - - cdef KmerMinHash *mh = NULL - if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - else: - mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - - self._this.reset(mh) - - if mins: - if track_abundance: - self.set_abundances(mins) - else: - self.add_many(mins) - - - def __copy__(self): - a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, deref(self._this).max_hash) - a.merge(self) - return a - - def __getstate__(self): # enable pickling - with_abundance = False - if self.track_abundance: - with_abundance = True - - return (deref(self._this).num, - deref(self._this).ksize, - deref(self._this).is_protein, - 
deref(self._this).dayhoff, - deref(self._this).hp, - self.get_mins(with_abundance=with_abundance), - None, self.track_abundance, deref(self._this).max_hash, - deref(self._this).seed) - - def __setstate__(self, tup): - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) =\ - tup - - self._track_abundance = track_abundance - - cdef KmerMinHash *mh = NULL - if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - self._this.reset(mh) - self.set_abundances(mins) - else: - mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - self._this.reset(mh) - self.add_many(mins) - - def __reduce__(self): - return (MinHash, - (deref(self._this).num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, - deref(self._this).max_hash, - self.get_mins(with_abundance=self.track_abundance), - 0)) - - def __richcmp__(self, other, op): - if op == 2: - return self.__getstate__() == other.__getstate__() - raise Exception("undefined comparison") - - def copy_and_clear(self): - a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, self.track_abundance, - deref(self._this).seed, deref(self._this).max_hash) - return a - - def add_sequence(self, sequence, bool force=False): - deref(self._this).add_sequence(to_bytes(sequence), force) - - def add(self, kmer): - "Add kmer into sketch." - self.add_sequence(kmer) - - def add_many(self, hashes): - "Add many hashes in at once." - for hash in hashes: - self.add_hash(hash) - - def remove_many(self, hashes): - "Remove many hashes at once." - for hash in hashes: - deref(self._this).remove_hash(hash) - - def update(self, other): - "Update this estimator from all the hashes from the other." 
- self.add_many(other.get_mins()) - - def __len__(self): - return deref(self._this).mins.size() - - cpdef get_mins(self, bool with_abundance=False): - cdef KmerMinAbundance *mh = address(deref(self._this)) - if with_abundance and self.track_abundance: - return dict(zip(mh.mins, mh.abunds)) - else: - return deref(self._this).mins - - def get_hashes(self): - return self.get_mins() - - def subtract_mins(self, other): - a = set(self.get_mins()) - b = set(other.get_mins()) - return a - b - - @property - def seed(self): - return deref(self._this).seed - - @property - def num(self): - return deref(self._this).num - - @property - def scaled(self): - if self.max_hash: - return get_scaled_for_max_hash(self.max_hash) - return 0 - - @property - def is_protein(self): - return deref(self._this).is_protein - - @property - def dayhoff(self): - return deref(self._this).dayhoff - - @property - def hp(self): - return deref(self._this).hp - - @property - def ksize(self): - return deref(self._this).ksize - - @property - def max_hash(self): - return deref(self._this).max_hash - - @property - def track_abundance(self): - return self._track_abundance - - @track_abundance.setter - def track_abundance(self, v): - cdef KmerMinHash *mh = NULL - - if v == self._track_abundance: - return - - if v is True and len(self) != 0: - raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") - - if v: - mh = new KmerMinAbundance(self.num, self.ksize, self.is_protein, - self.dayhoff, self.hp, self.seed, self.max_hash) - self._this.reset(mh) - - # At this point, if we are changing from track_abundance=True to False, - # keep the underlying Abundance MH (to avoid copying data to a new one). 
- - self._track_abundance = v - - def add_hash(self, uint64_t h): - deref(self._this).add_hash(h) - - def translate_codon(self, codon): - return deref(self._this).translate_codon(to_bytes(codon)) - - def count_common(self, MinHash other): - return deref(self._this).count_common(deref(other._this)) - - def downsample_n(self, new_num): - if self.num and self.num < new_num: - raise ValueError('new sample n is higher than current sample n') - - a = MinHash(new_num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, 0) - if self.track_abundance: - a.set_abundances(self.get_mins(with_abundance=True)) - else: - a.add_many(self.get_mins()) - - return a - - def downsample_max_hash(self, *others): - max_hashes = [ x.max_hash for x in others ] - new_max_hash = min(self.max_hash, *max_hashes) - new_scaled = get_scaled_for_max_hash(new_max_hash) - - return self.downsample_scaled(new_scaled) - - def downsample_scaled(self, new_num): - if self.num: - raise ValueError('num != 0 - cannot downsample a standard MinHash') - - max_hash = self.max_hash - if max_hash is None: - raise ValueError('no max_hash available - cannot downsample') - - old_scaled = get_scaled_for_max_hash(self.max_hash) - if old_scaled > new_num: - raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled)) - - new_max_hash = get_max_hash_for_scaled(new_num) - - a = MinHash(0, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, new_max_hash) - if self.track_abundance: - a.set_abundances(self.get_mins(with_abundance=True)) - else: - a.add_many(self.get_mins()) - - return a - - def intersection(self, MinHash other): - if self.num != other.num: - err = 'must have same num: {} != {}'.format(self.num, - other.num) - raise TypeError(err) - else: - num = self.num - - if 
self.track_abundance and other.track_abundance: - combined_mh = new KmerMinAbundance(num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - deref(self._this).seed, - deref(self._this).max_hash) - - else: - combined_mh = new KmerMinHash(num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - deref(self._this).seed, - deref(self._this).max_hash) - - combined_mh.merge(deref(self._this)) - combined_mh.merge(deref(other._this)) - - common = set(self.get_mins()) - common.intersection_update(other.get_mins()) - common.intersection_update(combined_mh.mins) - - size = max(combined_mh.size(), 1) - del combined_mh - - return common, size - - def compare(self, MinHash other): - common, size = self.intersection(other) - n = len(common) - return n / size - - def jaccard(self, MinHash other): - return self.compare(other) - - def similarity(self, other, ignore_abundance=False): - """\ - Calculate similarity of two sketches. - - If the sketches are not abundance weighted, or ignore_abundance=True, - compute Jaccard similarity. - - If the sketches are abundance weighted, calculate a distance metric - based on the cosine similarity. - - Note, because the term frequencies (tf-idf weights) cannot be negative, - the angle will never be < 0deg or > 90deg. - - See https://en.wikipedia.org/wiki/Cosine_similarity - """ - - # if either signature is flat, calculate Jaccard only. - if not (self.track_abundance and other.track_abundance) or \ - ignore_abundance: - return self.jaccard(other) - else: - # can we merge? if not, raise exception. 
- aa = copy.copy(self) - aa.merge(other) - - a = self.get_mins(with_abundance=True) - b = other.get_mins(with_abundance=True) - - prod = dotproduct(a, b) - prod = min(1.0, prod) - - distance = 2*math.acos(prod) / math.pi - return 1.0 - distance - - def contained_by(self, other): - """\ - Calculate how much of self is contained by other. - """ - if not len(self): - return 0.0 - return self.count_common(other) / len(self.get_mins()) - - def containment_ignore_maxhash(self, MinHash other): - a = set(self.get_mins()) - if not a: - return 0.0 - - b = set(other.get_mins()) - - overlap = a.intersection(b) - return float(len(overlap)) / float(len(a)) - - def __iadd__(self, MinHash other): - cdef KmerMinAbundance *mh = address(deref(self._this)) - cdef KmerMinAbundance *other_mh = address(deref(other._this)) - - if self.track_abundance and other.track_abundance: - deref(mh).merge(deref(other_mh)) - else: - deref(self._this).merge(deref(other._this)) - - return self - merge = __iadd__ - - cpdef set_abundances(self, dict values): - if self.track_abundance: - added = 0 - - for k, v in sorted(values.items()): - if not self.max_hash or k <= self.max_hash: - deref(self._this).mins.push_back(k) - (address(deref(self._this))).abunds.push_back(v) - added += 1 - if self.num > 0 and added >= self.num: - break - else: - raise RuntimeError("Use track_abundance=True when constructing " - "the MinHash to use set_abundances.") - - def add_protein(self, sequence): - cdef uint32_t ksize = deref(self._this).ksize // 3 - if len(sequence) < ksize: - return - - if not deref(self._this).is_protein: - raise ValueError("cannot add amino acid sequence to DNA MinHash!") - - aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) - if not self.dayhoff and not self.hp: - for aa_kmer in aa_kmers: - deref(self._this).add_word(to_bytes(aa_kmer)) - elif self.dayhoff: - for aa_kmer in aa_kmers: - dayhoff_kmer = '' - for aa in aa_kmer: - dayhoff_letter = 
deref(self._this).aa_to_dayhoff(to_bytes(aa)) - dayhoff_kmer += dayhoff_letter - # dayhoff_kmer = ''.join( for aa in aa_kmer) - deref(self._this).add_word(to_bytes(dayhoff_kmer)) - else: - for aa_kmer in aa_kmers: - hp_kmer = '' - for aa in aa_kmer: - hp_letter = deref(self._this).aa_to_hp(to_bytes(aa)) - hp_kmer += hp_letter - # hp_kmer = ''.join( for aa in aa_kmer) - deref(self._this).add_word(to_bytes(hp_kmer)) - - def is_molecule_type(self, molecule): - if molecule.upper() == 'DNA' and not self.is_protein: - return True - elif self.is_protein and molecule == 'protein' and not any((self.dayhoff, self.hp)): - return True - elif self.dayhoff and molecule == 'dayhoff': - return True - elif self.hp and molecule == 'hp': - return True - - return False diff --git a/sourmash/exceptions.py b/sourmash/exceptions.py new file mode 100644 index 0000000000..8254a21762 --- /dev/null +++ b/sourmash/exceptions.py @@ -0,0 +1,44 @@ +from ._compat import implements_to_string +from ._lowlevel import lib + + +__all__ = ['SourmashError'] +exceptions_by_code = {} + + +@implements_to_string +class SourmashError(Exception): + code = None + + def __init__(self, msg): + Exception.__init__(self) + self.message = msg + self.rust_info = None + + def __str__(self): + rv = self.message + if self.rust_info is not None: + return u'%s\n\n%s' % (rv, self.rust_info) + return rv + + +def _make_exceptions(): + for attr in dir(lib): + if not attr.startswith('SOURMASH_ERROR_CODE_'): + continue + + class Exc(SourmashError): + pass + + code = getattr(lib, attr) + if code < 100 or code > 10000: + Exc.__name__ = attr[20:].title().replace('_', '') + Exc.code = getattr(lib, attr) + globals()[Exc.__name__] = Exc + Exc.code = code + exceptions_by_code[code] = Exc + __all__.append(Exc.__name__) + else: + exceptions_by_code[code] = ValueError + +_make_exceptions() diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh deleted file mode 100644 index 47decc8105..0000000000 --- 
a/sourmash/kmer_min_hash.hh +++ /dev/null @@ -1,648 +0,0 @@ -#ifndef KMER_MIN_HASH_HH -#define KMER_MIN_HASH_HH - -#include -#include -#include -#include -#include -#include -#include - -#include "../third-party/smhasher/MurmurHash3.h" - -#define tbl \ - " "\ - /*ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz */\ - " TVGH FCD M KN YSAABW R TVGH FCD M KN YSAABW R" - -inline uint64_t _hash_murmur(const std::string& kmer, - const uint32_t seed) { - uint64_t out[2]; - out[0] = 0; out[1] = 0; - MurmurHash3_x64_128((void *)kmer.c_str(), kmer.size(), seed, &out); - return out[0]; -} - -/** - * @Synopsis Unsafe hash overload. Takes a const char * - * and assumes it has length ksize. - * - * @Param kmer The k-mer. - * @Param ksize Length of the k-mer. - * @Param seed Hashing seed. - * - * @Returns The hash value. - */ -inline uint64_t _hash_murmur(const char * kmer, - unsigned int ksize, - const uint32_t seed) { - uint64_t out[2]; - out[0] = 0; out[1] = 0; - MurmurHash3_x64_128((void *)kmer, ksize, seed, &out); - return out[0]; -} - -typedef uint64_t HashIntoType; - -typedef std::vector CMinHashType; - -class minhash_exception : public std::exception -{ -public: - explicit minhash_exception(const std::string& msg = "Generic minhash exception") - : _msg(msg) { } - - virtual ~minhash_exception() throw() { } - virtual const char* what() const throw () - { - return _msg.c_str(); - } - -protected: - const std::string _msg; -}; - -// Looks like a iterator but all it does is counts push_backs -struct Counter { - struct value_type { - template value_type(const T &) {} - }; - void push_back(const value_type &) { ++count; } - size_t count = 0; -}; - - -class KmerMinHash -{ -public: - const unsigned int num; - const unsigned int ksize; - const bool is_protein; - const bool dayhoff; - const bool hp; - const uint32_t seed; - const HashIntoType max_hash; - CMinHashType mins; - - KmerMinHash(unsigned int n, unsigned int k, bool prot, bool dyhoff, bool hp, uint32_t s, - HashIntoType 
mx) - : num(n), ksize(k), is_protein(prot), dayhoff(dyhoff), hp(hp), seed(s), max_hash(mx) { - if (n > 0) { - mins.reserve(num + 1); - } - // only reserve a finite amount of space for unbounded MinHashes - else { - mins.reserve(1000); - } - }; - - void check_compatible(const KmerMinHash& other) { - if (ksize != other.ksize) { - throw minhash_exception("different ksizes cannot be compared"); - } - if (is_protein != other.is_protein) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (dayhoff != other.dayhoff) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (hp != other.hp) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (max_hash != other.max_hash) { - throw minhash_exception("mismatch in max_hash; comparison fail"); - } - if (seed != other.seed) { - throw minhash_exception("mismatch in seed; comparison fail"); - } - } - - virtual void add_hash(const HashIntoType h) { - if ((max_hash and h <= max_hash) or not max_hash) { - if (mins.size() == 0) { - mins.push_back(h); - return; - } - else if (h <= max_hash or mins.back() > h or mins.size() < num) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - - // must still be growing, we know the list won't get too long - if (pos == mins.cend()) { - mins.push_back(h); - } - // inserting somewhere in the middle, if this value isn't already - // in mins store it and shrink list if needed - else if (*pos != h) { - mins.insert(pos, h); - if (num and mins.size() > num) { - mins.pop_back(); - } - } - } - } - } - - virtual void remove_hash(const HashIntoType h) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - if (pos != mins.cend() and *pos == h) { - mins.erase(pos); - } - } - - void add_word(const std::string& word) { - const HashIntoType hash = _hash_murmur(word, seed); - add_hash(hash); - } - - /** - * @Synopsis Unsafe overload: calls _hash_murmur assuming - * word is length ksize. 
- * - * @Param word k-mer to add. - */ - void add_word(const char * word, unsigned int size) { - const HashIntoType hash = _hash_murmur(word, size, seed); - add_hash(hash); - } - - void _invalid_kmer(const std::string& kmer) { - std::string msg = "invalid DNA character in input k-mer: "; - msg += kmer; - throw minhash_exception(msg); - } - - void add_sequence(std::string& seq, bool force=false) { - - if (seq.length() < ksize) { - return; - } - - std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper); - - if (!is_protein) { - auto rc = _revcomp(seq); - for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) { - auto fw_kmer = seq.c_str() + i; - auto rc_kmer = rc.c_str() + rc.length() - ksize - i; - - if (! _checkdna(fw_kmer, ksize)) { - if (force) { - continue; - } else { - _invalid_kmer(fw_kmer); - } - } - - if (std::lexicographical_compare(fw_kmer, - fw_kmer + ksize, - rc_kmer, - rc_kmer + ksize)) { - add_word(fw_kmer, ksize); - } else { - add_word(rc_kmer, ksize); - } - } - } else { // protein - std::string rc = _revcomp(seq); - for (unsigned int i = 0; i < 3; i++) { - std::string aa = _dna_to_aa(seq.substr(i, seq.length() - i)); - unsigned int aa_ksize = int(ksize / 3); - std::string kmer; - - for (unsigned int j = 0; j < aa.length() - aa_ksize + 1; j++) { - kmer = aa.substr(j, aa_ksize); - add_word(kmer); - } - - aa = _dna_to_aa(rc.substr(i, rc.length() - i)); - aa_ksize = int(ksize / 3); - - for (unsigned int j = 0; j < aa.length() - aa_ksize + 1; j++) { - kmer = aa.substr(j, aa_ksize); - add_word(kmer); - } - } - } - } - - std::string translate_codon(std::string& codon) { - std::string residue; - - if (codon.length() >= 2 && codon.length() <= 3){ - // If codon is length 2, pad with an N for ambiguous codon amino acids - if (codon.length() == 2) { - codon += "N"; - } - auto translated = _codon_table.find(codon); - - if (translated != _codon_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - 
residue = translated -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - residue = "X"; - } - } else if (codon.length() == 1){ - // Then we only have one nucleotides and the amino acid is unknown - residue = "X"; - } else { - std::string msg = "Codon is invalid length: "; - msg += codon; - throw minhash_exception(msg); - } - return residue; - } - - std::string _dna_to_aa(const std::string& dna) { - std::string aa; - std::string codon; - std::string residue; - unsigned int dna_size = (dna.size() / 3) * 3; // floor it - for (unsigned int j = 0; j < dna_size; j += 3) { - - codon = dna.substr(j, 3); - - residue = translate_codon(codon); - - // Use dayhoff encoding of amino acids - if (dayhoff) { - std::string new_letter = aa_to_dayhoff(residue); - aa += new_letter; - // Use hp encoding of amino acids - } else if (hp) { - std::string new_letter = aa_to_hp(residue); - aa += new_letter; - } - else { - aa += residue; - } - - } - return aa; - } - - /** - * @Synopsis Check that a single char is a DNA base. - * - * @Param c The character. - * - * @Returns True if valid, false otherwise. - */ - bool _checkdna(const char c) const { - switch(c) { - case 'A': - case 'C': - case 'G': - case 'T': - break; - default: - return false; - } - return true; - } - - /** - * @Synopsis Safe DNA sanity check for a sequence of - * arbitrary length. - * - * @Param seq The sequence. - * - * @Returns True if valid, false otherwise. - */ - bool _checkdna(const std::string& seq) const { - - for (size_t i=0; i < seq.length(); ++i) { - if (!_checkdna(seq[i])) { - return false; - } - } - return true; - } - - /** - * @Synopsis Unsafe k-mer DNA sanity check: doesn't check length. - * - * @Param kmer k-mer to check. - * - * @Returns true if sane; false otherwise. 
- */ - bool _checkdna(const char * kmer, unsigned int length) const { - - for (size_t i=0; i < length; ++i) { - if (!_checkdna(*(kmer + i))) { - return false; - } - } - return true; - } - - std::string _revcomp(const std::string& kmer) const { - std::string out = kmer; - - auto from = out.begin(); - auto to = out.end(); - - char c; - for (to--; from <= to; from++, to--) { - c = tbl[(int)*from]; - *from = tbl[(int)*to]; - *to = c; - } - - return out; - } - - std::string aa_to_dayhoff(const std::string& aa) const { - // Convert an amino acid letter to dayhoff encoding - std::string new_letter; - - auto dayhoff_encoded = _dayhoff_table.find(aa); - if (dayhoff_encoded != _dayhoff_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - new_letter = dayhoff_encoded -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - new_letter = "X"; - } - return new_letter; - } - - std::string aa_to_hp(const std::string& aa) const { - // Convert an amino acid letter to hp encoding - std::string new_letter; - - auto hp_encoded = _hp_table.find(aa); - if (hp_encoded != _hp_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - new_letter = hp_encoded -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - new_letter = "X"; - } - return new_letter; - } - - virtual void merge(const KmerMinHash& other) { - check_compatible(other); - - CMinHashType merged; - merged.reserve(other.mins.size() + mins.size()); - std::set_union(other.mins.begin(), other.mins.end(), - mins.begin(), mins.end(), - std::back_inserter(merged)); - if (merged.size() < num or !num) { - mins = merged; - } - else { - mins = CMinHashType(std::begin(merged), std::begin(merged) + num); - } - } - - virtual unsigned int count_common(const KmerMinHash& other) { - check_compatible(other); - - Counter counter; - std::set_intersection(mins.begin(), mins.end(), - 
other.mins.begin(), other.mins.end(), - std::back_inserter(counter)); - return counter.count; - } - - virtual size_t size() { - return mins.size(); - } - - virtual ~KmerMinHash() throw() { } - -private: - std::map _codon_table = { - {"TTT", "F"}, {"TTC", "F"}, - {"TTA", "L"}, {"TTG", "L"}, - - {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, {"TCN", "S"}, - - {"TAT", "Y"}, {"TAC", "Y"}, - {"TAA", "*"}, {"TAG", "*"}, - - {"TGT", "C"}, {"TGC", "C"}, - {"TGA", "*"}, - {"TGG", "W"}, - - {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, {"CTN", "L"}, - - {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, {"CCN", "P"}, - - {"CAT", "H"}, {"CAC", "H"}, - {"CAA", "Q"}, {"CAG", "Q"}, - - {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, {"CGN", "R"}, - - {"ATT", "I"}, {"ATC", "I"}, {"ATA", "I"}, - {"ATG", "M"}, - - {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, {"ACN", "T"}, - - {"AAT", "N"}, {"AAC", "N"}, - {"AAA", "K"}, {"AAG", "K"}, - - {"AGT", "S"}, {"AGC", "S"}, - {"AGA", "R"}, {"AGG", "R"}, - - {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, {"GTN", "V"}, - - {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, {"GCN", "A"}, - - {"GAT", "D"}, {"GAC", "D"}, - {"GAA", "E"}, {"GAG", "E"}, - - {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}, {"GGN", "G"} - }; - - -// Dayhoff table from -// Peris, P., López, D., & Campos, M. (2008). -// IgTM: An algorithm to predict transmembrane domains and topology in -// proteins. BMC Bioinformatics, 9(1), 1029–11. -// http://doi.org/10.1186/1471-2105-9-367 -// -// Original source: -// Dayhoff M. O., Schwartz R. M., Orcutt B. C. (1978). -// A model of evolutionary change in proteins, -// in Atlas of Protein Sequence and Structure, -// ed Dayhoff M. O., editor. -// (Washington, DC: National Biomedical Research Foundation; ), 345–352. 
-// -// | Amino acid | Property | Dayhoff | -// |---------------|-----------------------|---------| -// | C | Sulfur polymerization | a | -// | A, G, P, S, T | Small | b | -// | D, E, N, Q | Acid and amide | c | -// | H, K, R | Basic | d | -// | I, L, M, V | Hydrophobic | e | -// | F, W, Y | Aromatic | f | - std::map _dayhoff_table = { - {"C", "a"}, - - {"A", "b"}, {"G", "b"}, {"P", "b"}, {"S", "b"}, {"T", "b"}, - - {"D", "c"}, {"E", "c"}, {"N", "c"}, {"Q", "c"}, - - {"H", "d"}, {"K", "d"}, {"R", "d"}, - - {"I", "e"}, {"L", "e"}, {"M", "e"}, {"V", "e"}, - - {"F", "f"}, {"W", "f"}, {"Y", "f"} - - }; - -// HP Hydrophobic/hydrophilic mapping -// From: Phillips, R., Kondev, J., Theriot, J. (2008). -// Physical Biology of the Cell. New York: Garland Science, Taylor & Francis Group. ISBN: 978-0815341635 - -// -// | Amino acid | HP -// |---------------------------------------|---------| -// | A, F, G, I, L, M, P, V, W, Y | h | -// | N, C, S, T, D, E, R, H, K, Q | p | - std::map _hp_table = { - {"A", "h"}, {"F", "h"}, {"G", "h"}, {"I", "h"}, - {"L", "h"}, {"M", "h"}, {"P", "h"}, {"V", "h"}, - {"W", "h"}, {"Y", "h"}, {"N", "p"}, {"C", "p"}, - {"S", "p"}, {"T", "p"}, {"D", "p"}, {"E", "p"}, - {"R", "p"}, {"H", "p"}, {"K", "p"}, {"Q", "p"} - }; - -}; - -class KmerMinAbundance: public KmerMinHash { - public: - CMinHashType abunds; - - KmerMinAbundance(unsigned int n, unsigned int k, bool prot, bool dayhoff, - bool hp, uint32_t seed, HashIntoType mx) : - KmerMinHash(n, k, prot, dayhoff, hp, seed, mx) { }; - - virtual void add_hash(HashIntoType h) { - if ((max_hash and h <= max_hash) or not max_hash) { - // empty? add it, if within range / no range specified. - if (mins.size() == 0) { - mins.push_back(h); - abunds.push_back(1); - return; - } else if (h <= max_hash or mins.back() > h or mins.size() < num) { - // "good" hash - within range, smaller than current entry, or - // still space. 
- auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - - // at end -- must still be growing, we know the list won't get too - // long - if (pos == mins.cend()) { - mins.push_back(h); - abunds.push_back(1); - } else if (*pos != h) { - // didn't find hash already in mins, so - // inserting somewhere in the middle; shrink list if needed. - - // calculate distance for use w/abunds *before* insert, as - // 'mins.insert' may invalidate 'pos'. - size_t dist = std::distance(begin(mins), pos); - mins.insert(pos, h); - abunds.insert(begin(abunds) + dist, 1); - - // now too big? if so, continue. - if (mins.size() > num and not max_hash) { - mins.pop_back(); - abunds.pop_back(); - } - } else { // *pos == h - hash value already there, increment count. - auto p = std::distance(begin(mins), pos); - abunds[p] += 1; - } - } - } - } - - virtual void remove_hash(const HashIntoType h) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - if (pos != mins.cend() and *pos == h) { - mins.erase(pos); - size_t dist = std::distance(begin(mins), pos); - abunds.erase(begin(abunds) + dist); - } - } - - virtual void merge(const KmerMinAbundance& other) { - check_compatible(other); - - CMinHashType merged_mins; - CMinHashType merged_abunds; - size_t max_size = other.mins.size() + mins.size(); - - merged_mins.reserve(max_size); - merged_abunds.reserve(max_size); - - auto it1_m = mins.begin(); - auto it2_m = other.mins.begin(); - auto out_m = std::back_inserter(merged_mins); - - auto it1_a = abunds.begin(); - auto it2_a = other.abunds.begin(); - auto out_a = std::back_inserter(merged_abunds); - - for (; it1_m != mins.end(); ++out_m, ++out_a) { - if (it2_m == other.mins.end()) { - /* we reached the end of other.mins, - so just copy the remainder of mins to the output */ - std::copy(it1_m, mins.end(), out_m); - std::copy(it1_a, abunds.end(), out_a); - break; - } - if (*it2_m < *it1_m) { - /* other.mins is smaller than mins, - so copy it to output and advance 
other.mins iterators */ - *out_m = *it2_m; - *out_a = *it2_a; - ++it2_m; - ++it2_a; - } else if (*it2_m == *it1_m) { - /* same value in both mins, so sums the abundances - on the output and advances all iterators */ - *out_m = *it1_m; - *out_a = *it1_a + *it2_a; - ++it1_m; ++it1_a; - ++it2_m; ++it2_a; - } else { - /* mins is smaller than other.mins, - so copy it to output and advance the mins iterators */ - *out_m = *it1_m; - *out_a = *it1_a; - ++it1_m; - ++it1_a; - } - } - /* we reached the end of mins/abunds, - so just copy the remainder of other to the output - (other might already be at the end, in this case nothing happens) */ - std::copy(it2_m, other.mins.end(), out_m); - std::copy(it2_a, other.abunds.end(), out_a); - - if (merged_mins.size() < num || !num) { - mins = merged_mins; - abunds = merged_abunds; - } else { - mins = CMinHashType(std::begin(merged_mins), std::begin(merged_mins) + num); - abunds = CMinHashType(std::begin(merged_abunds), std::begin(merged_abunds) + num); - } - } - - virtual size_t size() { - return mins.size(); - } - -}; - -#endif // KMER_MIN_HASH_HH diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 2289e0cb94..0f7b46ae9c 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -54,10 +54,11 @@ def save(self, path): return self.storage.save(path, buf.getvalue()) def update(self, parent): - for v in self.data.minhash.get_mins(): + mh = self.data.minhash + for v in mh.get_mins(): parent.data.count(v) min_n_below = parent.metadata.get('min_n_below', sys.maxsize) - min_n_below = min(len(self.data.minhash), min_n_below) + min_n_below = min(len(mh), min_n_below) if min_n_below == 0: min_n_below = 1 diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index b0d29a8efe..10c9765107 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -268,10 +268,13 @@ def merge(args): mh.track_abundance = False try: + sigobj_mh = sigobj.minhash if not args.flatten: _check_abundance_compatibility(first_sig, sigobj) + else: 
+ sigobj_mh.track_abundance = False - mh.merge(sigobj.minhash) + mh.merge(sigobj_mh) except: error("ERROR when merging signature '{}' ({}) from file {}", sigobj.name(), sigobj.md5sum()[:8], sigfile) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 542559a9a5..e496173737 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -215,11 +215,12 @@ def __iter__(self): self.ksizes.add(query_ksize) self.moltypes.add(query_moltype) - yield filename, query, query_moltype, query_ksize - if len(self.ksizes) > 1 or len(self.moltypes) > 1: raise ValueError('multiple k-mer sizes/molecule types present') + for query in sl: + yield filename, query, query_moltype, query_ksize + def traverse_find_sigs(dirnames, yield_all_files=False): for dirname in dirnames: diff --git a/sourmash/utils.py b/sourmash/utils.py new file mode 100644 index 0000000000..4fb2835fd8 --- /dev/null +++ b/sourmash/utils.py @@ -0,0 +1,78 @@ +import weakref + +from ._lowlevel import ffi, lib +from .exceptions import exceptions_by_code, SourmashError + +attached_refs = weakref.WeakKeyDictionary() + + +class RustObject(object): + __dealloc_func__ = None + _objptr = None + _shared = False + + def __init__(self): + raise TypeError("Cannot instanciate %r objects" % self.__class__.__name__) + + @classmethod + def _from_objptr(cls, ptr, shared=False): + rv = object.__new__(cls) + rv._objptr = ptr + rv._shared = shared + return rv + + def _methodcall(self, func, *args): + return rustcall(func, self._get_objptr(), *args) + + def _get_objptr(self): + if not self._objptr: + raise RuntimeError("Object is closed") + return self._objptr + + def __del__(self): + if self._objptr is None or self._shared: + return + f = self.__class__.__dealloc_func__ + if f is not None: + rustcall(f, self._objptr) + self._objptr = None + + +def decode_str(s, free=False): + """Decodes a SourmashStr""" + try: + if s.len == 0: + return u"" + return ffi.unpack(s.data, s.len).decode("utf-8", "replace") + 
finally: + if free: + lib.sourmash_str_free(ffi.addressof(s)) + + +def encode_str(s): + """Encodes a SourmashStr""" + rv = ffi.new("SourmashStr *") + if isinstance(s, text_type): + s = s.encode("utf-8") + rv.data = ffi.from_buffer(s) + rv.len = len(s) + # we have to hold a weak reference here to ensure our string does not + # get collected before the string is used. + attached_refs[rv] = s + return rv + + +def rustcall(func, *args): + """Calls rust method and does some error handling.""" + lib.sourmash_err_clear() + rv = func(*args) + err = lib.sourmash_err_get_last_code() + if not err: + return rv + msg = lib.sourmash_err_get_last_message() + cls = exceptions_by_code.get(err, SourmashError) + exc = cls(decode_str(msg)) + backtrace = decode_str(lib.sourmash_err_get_backtrace()) + if backtrace: + exc.rust_info = backtrace + raise exc diff --git a/src/bin/smrs.rs b/src/bin/smrs.rs index b58f2c44ee..b3fee5264d 100644 --- a/src/bin/smrs.rs +++ b/src/bin/smrs.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::fs::File; use std::io; use std::path::Path; @@ -24,6 +25,7 @@ use sourmash::index::search::{ use sourmash::index::storage::{FSStorage, Storage}; use sourmash::index::{Comparable, Index, MHBT}; use sourmash::signature::{Signature, SigsTrait}; +use sourmash::sketch::minhash::HashFunctions; use sourmash::sketch::Sketch; pub fn index( @@ -126,6 +128,12 @@ fn load_query_signature( moltype: Option<&str>, scaled: Option, ) -> Result, Error> { + let moltype: Option = if let Some(mol) = moltype { + Some(mol.try_into()?) 
+ } else { + None + }; + let mut reader = io::BufReader::new(File::open(query)?); let sigs = Signature::load_signatures(&mut reader, ksize, moltype, scaled)?; diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs index 2f84fab499..d9ec05397b 100644 --- a/src/ffi/minhash.rs +++ b/src/ffi/minhash.rs @@ -297,7 +297,7 @@ unsafe fn kmerminhash_enable_abundance(ptr: *mut KmerMinHash) -> Result<()> { &mut *ptr }; - if mh.mins.is_empty() { + if !mh.mins.is_empty() { return Err(SourmashError::NonEmptyMinHash { message: "track_abundance=True".into()}.into()); } diff --git a/src/ffi/signature.rs b/src/ffi/signature.rs index 56e147ebe3..2d6984347b 100644 --- a/src/ffi/signature.rs +++ b/src/ffi/signature.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::ffi::CStr; use std::io; use std::os::raw::c_char; @@ -8,7 +9,7 @@ use serde_json; use crate::ffi::utils::SourmashStr; use crate::signature::Signature; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{HashFunctions, KmerMinHash}; use crate::sketch::Sketch; // Signature methods @@ -233,12 +234,11 @@ unsafe fn signatures_load_path(ptr: *const c_char, CStr::from_ptr(ptr) }; - let moltype = { - if select_moltype.is_null() { + let moltype: Option = if select_moltype.is_null() { None } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } + let mol = CStr::from_ptr(select_moltype).to_str()?; + Some(mol.try_into()?) }; // TODO: implement ignore_md5sum @@ -274,12 +274,11 @@ unsafe fn signatures_load_buffer(ptr: *const c_char, slice::from_raw_parts(ptr as *mut u8, insize) }; - let moltype = { - if select_moltype.is_null() { + let moltype: Option = if select_moltype.is_null() { None } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } + let mol = CStr::from_ptr(select_moltype).to_str()?; + Some(mol.try_into()?) 
}; let k = match ksize { diff --git a/src/index/bigsi.rs b/src/index/bigsi.rs index 6fac529a05..141211f584 100644 --- a/src/index/bigsi.rs +++ b/src/index/bigsi.rs @@ -155,6 +155,7 @@ impl<'a> Index<'a> for BIGSI { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::BufReader; use std::path::PathBuf; @@ -179,8 +180,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let sig_data = sigs[0].clone(); let leaf: SigStore<_> = sig_data.into(); diff --git a/src/index/sbt/mhbt.rs b/src/index/sbt/mhbt.rs index b26975f6de..565468a278 100644 --- a/src/index/sbt/mhbt.rs +++ b/src/index/sbt/mhbt.rs @@ -153,14 +153,15 @@ impl ReadData for Node { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; use std::rc::Rc; - use tempfile; use assert_matches::assert_matches; use lazy_init::Lazy; + use tempfile; use super::Factory; @@ -207,8 +208,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let sig_data = sigs[0].clone(); let data = Lazy::new(); @@ -290,7 +296,12 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename)?); - let sigs = Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None)?; + let sigs = 
Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + )?; let sig_data = sigs[0].clone(); let leaf: SigStore<_> = sig_data.into(); diff --git a/src/signature.rs b/src/signature.rs index 07823a43c3..914c962f3e 100644 --- a/src/signature.rs +++ b/src/signature.rs @@ -14,6 +14,7 @@ use typed_builder::TypedBuilder; use crate::errors::SourmashError; use crate::index::storage::ToWriter; +use crate::sketch::minhash::HashFunctions; use crate::sketch::Sketch; pub trait SigsTrait { @@ -156,7 +157,7 @@ impl Signature { pub fn load_signatures( buf: &mut R, ksize: Option, - moltype: Option<&str>, + moltype: Option, _scaled: Option, ) -> Result, Error> where @@ -190,9 +191,7 @@ impl Signature { match moltype { Some(x) => { - if (x.to_lowercase() == "dna" && !mh.is_protein()) - || (x.to_lowercase() == "protein" && mh.is_protein()) - { + if mh.hash_function() == x { return true; } } @@ -208,7 +207,7 @@ impl Signature { match moltype { Some(x) => { - if x.to_lowercase() == "dna" { + if x == HashFunctions::murmur64_DNA { return true; } else { // TODO: draff only supports dna for now @@ -285,6 +284,7 @@ impl PartialEq for Signature { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::BufReader; use std::path::PathBuf; @@ -297,8 +297,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let _sig_data = sigs[0].clone(); // TODO: check sig_data } diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs index 427ccb977d..088d2a4fd3 100644 --- a/src/sketch/minhash.rs +++ b/src/sketch/minhash.rs @@ -1,14 +1,14 @@ -use serde::de::{Deserialize, Deserializer}; -use 
serde::ser::{Serialize, SerializeStruct, Serializer}; -use serde_derive::Deserialize; - use std::cmp::Ordering; use std::collections::HashMap; +use std::convert::TryFrom; use std::iter::{Iterator, Peekable}; use std::str; use failure::Error; use lazy_static::lazy_static; +use serde::de::{Deserialize, Deserializer}; +use serde::ser::{Serialize, SerializeStruct, Serializer}; +use serde_derive::Deserialize; use crate::_hash_murmur; use crate::errors::SourmashError; @@ -27,6 +27,20 @@ pub enum HashFunctions { murmur64_hp = 4, } +impl TryFrom<&str> for HashFunctions { + type Error = Error; + + fn try_from(moltype: &str) -> Result { + match moltype.to_lowercase().as_ref() { + "dna" => Ok(HashFunctions::murmur64_DNA), + "dayhoff" => Ok(HashFunctions::murmur64_dayhoff), + "hp" => Ok(HashFunctions::murmur64_hp), + "protein" => Ok(HashFunctions::murmur64_protein), + _ => unimplemented!(), + } + } +} + #[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)] #[derive(Debug, Clone, PartialEq)] pub struct KmerMinHash { @@ -177,6 +191,10 @@ impl KmerMinHash { self.hash_function == HashFunctions::murmur64_protein } + fn is_dna(&self) -> bool { + self.hash_function == HashFunctions::murmur64_DNA + } + pub fn seed(&self) -> u64 { self.seed } @@ -523,8 +541,7 @@ impl SigsTrait for KmerMinHash { .map(|&x| (x as char).to_ascii_uppercase() as u8) .collect(); if sequence.len() >= (self.ksize as usize) { - if !self.is_protein() { - // dna + if self.is_dna() { for kmer in sequence.windows(self.ksize as usize) { if _checkdna(kmer) { let rc = revcomp(kmer); diff --git a/tests/minhash.rs b/tests/minhash.rs index af0853e69f..8b38d5b8ac 100644 --- a/tests/minhash.rs +++ b/tests/minhash.rs @@ -73,6 +73,6 @@ fn dayhoff() { a.add_sequence(b"ACTGAC", false).unwrap(); b.add_sequence(b"ACTGAC", false).unwrap(); - assert_eq!(a.size(), 1); + assert_eq!(a.size(), 2); assert_eq!(b.size(), 2); } diff --git a/tests/smrs_cmd.rs b/tests/smrs_cmd.rs index c203099158..54ff329286 
100644 --- a/tests/smrs_cmd.rs +++ b/tests/smrs_cmd.rs @@ -2,7 +2,6 @@ use std::fs; use std::process::Command; use assert_cmd::prelude::*; -use predicates::prelude::*; use predicates::str::contains; use tempfile::TempDir; diff --git a/tests/test__minhash.py b/tests/test__minhash.py index e994fcfde2..1c18a57bed 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -42,12 +42,17 @@ import pytest import sourmash -from sourmash._minhash import (MinHash, hash_murmur, dotproduct, - get_scaled_for_max_hash, - get_max_hash_for_scaled) -from . import sourmash_tst_utils as utils +from sourmash._minhash import ( + MinHash, + hash_murmur, + dotproduct, + get_scaled_for_max_hash, + get_max_hash_for_scaled, +) from sourmash import signature +from . import sourmash_tst_utils as utils + # add: # * get default params from Python # * keyword args for minhash constructor @@ -69,6 +74,7 @@ def test_basic_dna(track_abundance): print(a, b) assert a == b assert len(b) == 1 + assert a[0] == b[0] == 12415348535738636339 def test_div_zero(track_abundance): @@ -95,12 +101,12 @@ def test_bytes_dna(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) mh.add_sequence('ATGC') mh.add_sequence(b'ATGC') - mh.add_sequence(u'ATGC') + mh.add_sequence('ATGC') a = mh.get_mins() mh.add_sequence('GCAT') # this will not get added; hash > ATGC mh.add_sequence(b'GCAT') # this will not get added; hash > ATGC - mh.add_sequence(u'GCAT') # this will not get added; hash > ATGC + mh.add_sequence('GCAT') # this will not get added; hash > ATGC b = mh.get_mins() print(a, b) @@ -112,7 +118,7 @@ def test_bytes_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) mh.add_protein('AGYYG') - mh.add_protein(u'AGYYG') + mh.add_protein('AGYYG') mh.add_protein(b'AGYYG') assert len(mh.get_mins()) == 4 @@ -254,6 +260,13 @@ def test_max_hash_conversion(): assert new_scaled == 
SCALED +def test_max_hash_and_scaled_zero(): + max_hash = get_max_hash_for_scaled(0) + new_scaled = get_scaled_for_max_hash(0) + assert max_hash == new_scaled + assert max_hash == 0 + + def test_max_hash_and_scaled_error(track_abundance): # test behavior when supplying both max_hash and scaled with pytest.raises(ValueError): @@ -369,6 +382,28 @@ def test_compare_1(track_abundance): assert b.compare(b) == 1.0 +def test_intersection_errors(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + b = MinHash(20, 10, track_abundance=track_abundance) + c = MinHash(30, 10, track_abundance=track_abundance) + + a.add_sequence("TGCCGCCCAGCA") + b.add_sequence("TGCCGCCCAGCA") + + common = set(a.get_mins()) + combined_size = 3 + + intersection, size = a.intersection(b, in_common=False) + assert intersection == set() + assert combined_size == size + + with pytest.raises(TypeError): + a.intersection(set()) + + with pytest.raises(TypeError): + a.intersection(c) + + def test_intersection_1(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) @@ -379,38 +414,38 @@ def test_intersection_1(track_abundance): common = set(a.get_mins()) combined_size = 3 - intersection, size = a.intersection(b) + intersection, size = a.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == common assert combined_size == size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == common assert combined_size == size # add same sequence again b.add_sequence('TGCCGCCCAGCA') - intersection, size = a.intersection(b) + intersection, size = 
a.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == common assert combined_size == size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == common assert combined_size == size @@ -420,18 +455,18 @@ def test_intersection_1(track_abundance): new_in_common = set(a.get_mins()).intersection(set(b.get_mins())) new_combined_size = 8 - intersection, size = a.intersection(b) + intersection, size = a.intersection(b, in_common=True) assert intersection == new_in_common assert size == new_combined_size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == new_in_common assert size == new_combined_size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == set(a.get_mins()) - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == set(b.get_mins()) @@ -510,6 +545,20 @@ def test_mh_count_common_diff_ksize(track_abundance): a.count_common(b) +def test_mh_count_common_notmh(track_abundance): + a = MinHash(20, 5, track_abundance=track_abundance) + b = set() + + with pytest.raises(TypeError): + a.count_common(b) + + +def test_mh_downsample_n_error(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + with pytest.raises(ValueError): + a.downsample_n(30) + + def test_mh_asymmetric(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) for i in range(0, 40, 2): @@ -531,6 +580,12 @@ def test_mh_asymmetric(track_abundance): assert b.compare(a) == 0.5 +def 
test_mh_merge_typeerror(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + with pytest.raises(TypeError): + a.merge(set()) + + def test_mh_merge(track_abundance): # test merging two identically configured minhashes a = MinHash(20, 10, track_abundance=track_abundance) @@ -596,7 +651,7 @@ def test_mh_merge_check_length(track_abundance): b.add_hash(i) c = a.merge(b) - assert(len(c.get_mins()) == 20) + assert len(c.get_mins()) == 20 def test_mh_merge_check_length2(track_abundance): @@ -612,7 +667,7 @@ def test_mh_merge_check_length2(track_abundance): b.add_hash(4) c = a.merge(b) - assert(len(c.get_mins()) == 3) + assert len(c.get_mins()) == 3 def test_mh_asymmetric_merge(track_abundance): diff --git a/tests/test_rustobj.py b/tests/test_rustobj.py new file mode 100644 index 0000000000..4be7a0e1ee --- /dev/null +++ b/tests/test_rustobj.py @@ -0,0 +1,18 @@ +import pytest + +from sourmash.utils import RustObject +from sourmash._minhash import to_bytes + + +def test_rustobj_init(): + with pytest.raises(TypeError): + RustObject() + + +def test_to_bytes(): + with pytest.raises(TypeError): + to_bytes([9882]) + + assert to_bytes(98) == bytes([98]) + assert to_bytes("abc") == b"abc" + assert to_bytes(b"abc") == b"abc" diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index e9c4a5ecc5..c9073d4e4e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -67,7 +67,7 @@ def test_do_serial_compare(c): cmp_outfile = c.output('cmp') assert os.path.exists(cmp_outfile) - cmp_out = numpy.load(cmp_outfile.encode('utf-8')) + cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: diff --git a/third-party/.gitignore b/third-party/.gitignore deleted file mode 100644 index 2ced0cd228..0000000000 --- a/third-party/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -zlib/example -zlib/example64 -zlib/examplesh -zlib/minigzip -zlib/minigzip64 -zlib/minigzipsh diff --git a/third-party/smhasher/MurmurHash3.cc b/third-party/smhasher/MurmurHash3.cc deleted 
file mode 100644 index 758a230d5d..0000000000 --- a/third-party/smhasher/MurmurHash3.cc +++ /dev/null @@ -1,340 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. - -#include "MurmurHash3.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4)) -/* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
Don't inline for RHEL 5 gcc which is 4.1*/ -#define FORCE_INLINE inline __attribute__((always_inline)) -#else -#define FORCE_INLINE -#endif - -inline uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -inline uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) -{ - return p[i]; -} - -FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) -{ - return p[i]; -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -FORCE_INLINE uint32_t fmix ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -FORCE_INLINE uint64_t fmix ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // 
tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix(h1); - - *(uint32_t*)out = h1; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - const uint32_t c1 = 0x239b961b; - const uint32_t c2 = 0xab0e9789; - const uint32_t c3 = 0x38b34ae5; - const uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i*4+0); - uint32_t k2 = getblock(blocks,i*4+1); - uint32_t k3 = getblock(blocks,i*4+2); - uint32_t k4 = getblock(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: 
k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - h3 = fmix(h3); - h4 = fmix(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - ((uint32_t*)out)[0] = h1; - ((uint32_t*)out)[1] = h2; - ((uint32_t*)out)[2] = h3; - ((uint32_t*)out)[3] = h4; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - for(int i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock(blocks,i*2+0); - uint64_t k2 = getblock(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= uint64_t(tail[14]) << 48; - case 14: k2 ^= uint64_t(tail[13]) << 40; - case 13: k2 ^= uint64_t(tail[12]) 
<< 32; - case 12: k2 ^= uint64_t(tail[11]) << 24; - case 11: k2 ^= uint64_t(tail[10]) << 16; - case 10: k2 ^= uint64_t(tail[ 9]) << 8; - case 9: k2 ^= uint64_t(tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= uint64_t(tail[ 7]) << 56; - case 7: k1 ^= uint64_t(tail[ 6]) << 48; - case 6: k1 ^= uint64_t(tail[ 5]) << 40; - case 5: k1 ^= uint64_t(tail[ 4]) << 32; - case 4: k1 ^= uint64_t(tail[ 3]) << 24; - case 3: k1 ^= uint64_t(tail[ 2]) << 16; - case 2: k1 ^= uint64_t(tail[ 1]) << 8; - case 1: k1 ^= uint64_t(tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - - h1 += h2; - h2 += h1; - - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; -} - -//----------------------------------------------------------------------------- - diff --git a/third-party/smhasher/MurmurHash3.h b/third-party/smhasher/MurmurHash3.h deleted file mode 100644 index 54e9d3f9e3..0000000000 --- a/third-party/smhasher/MurmurHash3.h +++ /dev/null @@ -1,37 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
- -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -typedef unsigned char uint8_t; -typedef unsigned long uint32_t; -typedef unsigned __int64 uint64_t; - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ From 7fc473696a271d530f9f682a7acc08940557055d Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 19 Dec 2019 20:25:22 +0000 Subject: [PATCH 06/10] update docs and add recommonmark as a test dependency (#805) --- doc/developer.md | 11 ++++++++--- setup.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/developer.md b/doc/developer.md index 79f2411e9f..98086965d5 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -27,11 +27,16 @@ python -m virtualenv dev . dev/bin/activate pip install -e . ``` + +You can run tests by invoking `make test` or `python -m pytest` in the sourmash +directory. + ## Automated tests and code coverage calculation -We use [Travis][0] for continuous integration. +We use [Travis][0] and [GitHub Actions][2] for continuous integration. Code coverage can be viewed interactively at [codecov.io][1]. 
-[0]:https://travis-ci.org/dib-lab/sourmash -[1]:https://codecov.io/gh/dib-lab/sourmash/ +[0]: https://travis-ci.com/dib-lab/sourmash +[1]: https://codecov.io/gh/dib-lab/sourmash/ +[2]: https://github.com/dib-lab/sourmash/actions diff --git a/setup.py b/setup.py index 2850339eda..d6ce0e119b 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,7 @@ def build_native(spec): "zip_safe": False, "platforms": "any", "extras_require": { - 'test' : ['pytest', 'pytest-cov'], + 'test' : ['pytest', 'pytest-cov', 'recommonmark'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], 'doc' : ['sphinx', 'recommonmark', 'alabaster', "sphinxcontrib-napoleon", "nbsphinx"], From e97a1ac24d6539ad78b7669fac8682d1c4723acf Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 19 Dec 2019 22:35:12 +0000 Subject: [PATCH 07/10] Set up MSRV, minimum supported rust version (#806) * set up MSRV, minimum supported rust version * remove default-run from Cargo.toml (so we can build on 1.34) --- .github/workflows/rust.yml | 26 ++++++++++++++++++++++++++ Cargo.toml | 1 - 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 2972ac0b8a..3115b47503 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -229,3 +229,29 @@ jobs: with: command: publish args: --dry-run + + minimum_rust_version: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: 1.34.0 + override: true + + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . 
+ + - name: Run tests + uses: actions-rs/cargo@v1 + with: + command: test + args: --no-fail-fast diff --git a/Cargo.toml b/Cargo.toml index f188cc1ad4..7b0032fcf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,6 @@ keywords = ["minhash", "bioinformatics"] categories = ["science", "algorithms", "data-structures"] license = "BSD-3-Clause" edition = "2018" -default-run = "smrs" autoexamples = false autobins = false From 10360ebc617fcde279c02a575360ae9ea4340a66 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 20 Dec 2019 02:16:43 +0000 Subject: [PATCH 08/10] Fix LCA search error if query has abundance (#804) * fix abundance search on LCAs --- sourmash/lca/lca_utils.py | 9 +++++---- tests/test-data/47.abunds.fa.sig | 1 + tests/test_bugs.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 tests/test-data/47.abunds.fa.sig diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index a72f0f71ee..9a61401f20 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -274,12 +274,13 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' requires 'threshold'") threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) - ignore_abundance = kwargs.get('ignore_abundance', True) - if not ignore_abundance: - raise TypeError("'search' on LCA databases does not use abundance") + ignore_abundance = kwargs.get('ignore_abundance', False) + mh = query.minhash + if ignore_abundance: + mh.track_abundance = False results = [] - for x in self.find_signatures(query.minhash, threshold, do_containment): + for x in self.find_signatures(mh, threshold, do_containment): (score, match, filename) = x results.append((score, match, filename)) diff --git a/tests/test-data/47.abunds.fa.sig b/tests/test-data/47.abunds.fa.sig new file mode 100644 index 0000000000..74a9f495cb --- /dev/null +++ b/tests/test-data/47.abunds.fa.sig @@ -0,0 +1 @@ 
+[{"class":"sourmash_signature","email":"","filename":"47.fa","hash_function":"0.murmur64","license":"CC0","name":"NC_009665.1 Shewanella baltica OS185, complete genome","signatures":[{"ksize":31,"max_hash":18446744073709552,"md5sum":"09a08691ce52952152f0e866a59f6261","mins":[2925290528259,7397951382043,9478766578752,26390034908046,31811219567311,36191627174349,39112643786682,46822418898135,47180432856748,60017138985701,60046869099761,65325381835497,73805228897455,74037001801154,75800414195236,81855770871884,83631867214312,86442965329695,89858161015356,90806331781332,95108107091043,97258972896665,109728134835863,111162670259148,113585458770972,116166720583475,121382935674939,125296899385152,141176320451685,141284968207060,141805235471354,147190179068733,149024066888166,153783847123278,157255282423883,160902593196961,162823771630571,166163367169365,174979625787948,175032069345452,182141449646872,187503667710897,191814288543916,192890223167288,195186364664284,196037984804395,197033160819668,203335269479450,204822233856042,209233297808434,210990374921109,214600505227173,216861451361880,217827490079709,224612774123844,227683744949779,228540468204721,228644441858825,228848037454334,235478348737722,240579984608212,245029062906088,248581735034297,251186192464160,258255664267571,258924003299576,265587486568360,269368370464968,274235329360733,287831833139065,293035680952788,294558365931778,295357672887769,303548064148961,303884611876696,306041902505698,307504482597750,309969810458414,316701230964482,316897730501733,318378982650332,318568418472400,318769251839299,319335385321196,324290895668437,335605928681508,339214912158009,341001360876621,349347535027435,351258437206186,360700437330047,367327558614874,373471575728001,374188784960382,380562917179168,384016070832594,386412107814027,389279696836396,393985777467936,395356088720884,396054053894786,399215565767837,399215750881719,411030707384650,414934253467214,423759820188444,430191392037330,431262729545883,437506450440821,4381
05428895659,438530381378884,439044119332850,444273467310604,449680755457024,450506164772110,457138551847407,457671098462976,461477875054528,473505790593386,481815788294090,487479264340595,489519873805078,494381455384554,495601542123242,500121418104925,502603922576313,506180131137999,506336140549160,516283812540815,518803929727716,536385923460112,536713413896697,537757852470225,538244971589768,540208451183188,540588787405694,542763181011925,549192277958979,550069279000761,553900351455263,554447489693319,559226934389812,561316274253850,569061433009767,578397933313938,578899728742280,582924953100697,583572058199369,589631402773527,595681341596523,615537076898013,626475972546369,632243908286808,639824119626438,645743921515803,648018730756195,654100189449365,668365295793413,670523964406925,671677182717796,671759739945458,676515376899555,677670347980377,684837528099741,687895771489510,693758846688308,694523064126211,697547171219962,698360853391060,698383699159430,699304671955329,703696716274708,706932232475763,708449170262947,726292867622433,726450649964317,727800693698567,728799639190186,734019394597526,735119835330596,737721455578775,738543439712395,741951415758063,748275069435017,750168693442959,763201112060730,763669867104092,763903450865190,767121298622699,767417571203746,770436202573059,771683466150501,772051111454828,772152509572841,787291725467630,798319271383660,802162977380527,806341566938246,813805466325024,815570804752811,816564335333987,817024725405204,817504754626588,821706687072387,826077010431743,828356750400476,831736232379626,843025850509368,843740928711723,845050451776051,852042280696332,857912135260852,871829709114624,873254290207218,875246525542985,889897273652095,894040289596463,897620767964532,904962988643425,905308801557271,906900833647951,909442865612931,912697620927191,913789208155712,916185332282483,917277762192278,917334002968300,919561883055202,920956096920505,929046426661708,930950142910172,933691189676382,934117578798841,936230738064974,9381
88383682602,941861412444067,942726201014166,945032973428091,947084478373286,948779805509636,951217347666850,955636489177710,961017555998937,961314440978493,964218423186297,968212926455014,968926587713112,969379511837489,972618046502811,974637708612999,980196796037373,980565419407507,983225283458250,987541215674501,992940514834332,996549857630112,998926194132937,1014496787753945,1017704359447639,1020480845863237,1024292399670426,1024634573363382,1028460419483054,1033874047074353,1035843403340873,1037163054983442,1039558325527817,1045088944681707,1045785088974313,1048574231977270,1051002783372661,1056506578664023,1057491059487351,1059437143082343,1059853068042602,1060760398971021,1061967838052170,1066520357980609,1069224019506529,1071759691375436,1072369963153950,1080440645655398,1083957482733017,1085596610204486,1086288713384900,1088705827145973,1089204340626863,1090298523330765,1090505634288396,1093123453947031,1093780160574614,1094807962005299,1096801323900100,1103535113750718,1105423537109674,1110277142974534,1113000955148039,1118646614510530,1119281509125641,1119614160374606,1120783033143617,1129279349995602,1130113935525204,1130881986044393,1132820492214112,1144614443668767,1147223276986948,1147680055727668,1158195764117399,1163159397520386,1163303408022562,1164535774717695,1174386415542665,1182168703505980,1186019430315229,1190006012882786,1191391064481088,1200797929442729,1203248128742846,1208460365112124,1225631809302250,1234453656762891,1234698668275227,1237451114108962,1241245219164313,1241546710850109,1241668290204495,1242418821754022,1243711623939695,1244290020173228,1244346278691061,1245003263018464,1250484435790357,1256754510605581,1264563878337445,1269060350975578,1270154727600023,1271923497273997,1273115659423672,1280559509676354,1282472909138162,1285880210646676,1287498565406779,1291218968991828,1292246474868788,1292487278268025,1296553378083571,1300214247397513,1311078551896352,1311449533649890,1313826623773576,1314579091305857,1322276316890973,1325
524051301607,1332430917176015,1332675238905364,1334939013056183,1337600826833551,1339674524726757,1341661245836409,1341942310569850,1344850241954264,1348761416973437,1356904466129199,1369078449955986,1370039456672284,1370854092951821,1372351037556570,1376501003787476,1378471035008080,1383348406006914,1387085462947589,1387385057191781,1388584147493453,1390384276015810,1396964107951550,1401298565016323,1407230822931784,1410486644494794,1410786461048450,1414537954260326,1418743862991832,1420081602859846,1424366051167663,1431140791675340,1438960590550765,1443983103542619,1449677011803774,1457429906997387,1458641089226597,1458716224614631,1459144447544839,1461511802747479,1465076638017898,1465867789405739,1467827467674025,1468663744355213,1476939334625119,1480237325649862,1481088686107013,1483592564337201,1492314148312178,1498457281550692,1499617447616390,1503853002568292,1504262319315651,1505172916296130,1513668920373911,1519648405600154,1520931632741619,1521780633605083,1522237249746592,1522582599941917,1523518586763814,1528071377900249,1529728378502178,1531967467499308,1535306641925593,1535658178776979,1539307118095840,1546368847550532,1548019688923957,1552083355029650,1555637141656241,1555928090783844,1556284449775147,1558324681023092,1560969323307091,1569318833056381,1573222947937990,1584949879718000,1588978430427079,1591204462547614,1598261363578814,1600688746972553,1603093475242546,1605199952752847,1617237167349710,1618568234848372,1619286790649678,1621192910003941,1622309948672121,1628201100274523,1632271494883561,1633815225207084,1634399357702189,1637441524349088,1642637371934077,1643195637784435,1643598557356785,1645422696089427,1645673596073883,1645866259200502,1648716913052297,1652765950688817,1654033476941478,1658743399661231,1659819081077302,1662433005161059,1666913529898081,1679605700468270,1679921198649960,1680089532480362,1683724693448022,1689682641477370,1691706033392643,1696089597402537,1705950022227142,1714068118984789,1717459770518422,171801497938073
4,1719690455811654,1734728075132632,1739172733710985,1745324659468599,1747303538361662,1749145577098552,1750021468273833,1750530525839386,1757623281396842,1758538630442116,1761579455667380,1762621869823670,1766019454242846,1766154871452422,1768074570558590,1769506068128510,1770988073934927,1778201561133905,1778315567513725,1780288814569870,1781805678833298,1786019351090790,1787025898307575,1793358709247570,1801404378718274,1802232213372715,1804215890133513,1808123394894591,1808805306365691,1810435102767883,1813163351446427,1818925318022107,1819091566970620,1821246620845572,1825289420275521,1829618500803507,1831822327838518,1832408978761242,1835694527640110,1837046808494825,1840320929072049,1842060817177608,1844561134226776,1845664541012305,1846495837486874,1846814283210937,1848195902901531,1849342199305473,1852955637970413,1853064829868822,1854709332537365,1856791461736081,1860030910962345,1862153320764207,1862209616890144,1867578456400407,1870278489144074,1871450013370760,1880743049410508,1880811582956504,1885064900552256,1888527800896759,1899912419788159,1909513665427200,1909893462067689,1910324702460153,1920155014152585,1923031184773399,1923724551213831,1928488418125995,1935392806238480,1936981590066389,1937494292258243,1941935226774825,1942786308149620,1943419695090025,1947698435893922,1947827395290642,1948000063884420,1951286173673455,1957196594968485,1961156417600790,1968345824207972,1973565525696890,1976759223622041,1977637922131648,1977923456470816,1979621033784766,1979691191211071,1981752378561978,1987210877457747,1993564537623510,1996672784729607,1999133750243675,2011354377485272,2017517839581062,2021272596821928,2024291985865500,2030916441428059,2030943399237635,2035149501864507,2047630125224977,2052394950437991,2059880114534091,2060560658024761,2064432037950349,2083858695302000,2088434760658037,2092860563281190,2095578868362462,2097280377232511,2099121913442760,2104210209064238,2107076373938295,2108459225069649,2111395821264557,2122246048824157,212350452
3298871,2125171930737142,2127588293738580,2135890156111278,2147601242872786,2147628766136779,2148277682163663,2148629935713334,2148648462894137,2154050039033300,2154825108832254,2157543511093753,2159292319817060,2159391483345580,2163811550162994,2163911364872485,2164107595577716,2166610246026701,2169130162448361,2169401527323023,2182965638264818,2189902950844361,2191089458213993,2194321556975056,2197255584699767,2197550753498976,2197844428920029,2204899458948058,2207484772689862,2214585329667475,2216291576857764,2218560589085471,2222579004644118,2225440067596925,2231678593259696,2239881880935087,2246651203996116,2249382176770011,2251553784168898,2262343143065292,2269891656332884,2277357511613050,2282851679505524,2284008883123690,2294311150128150,2295851772366195,2296330477067902,2297300047218453,2299522719885844,2300003729256754,2305986746818130,2307128673346491,2309328595812376,2314348683023278,2325178911253636,2337363146012963,2338273922165178,2340650536569632,2341149645621931,2349169861378449,2350865952696907,2355373744763135,2359599974602456,2360988166250281,2364165589013103,2365100930739182,2365101583995089,2368070257601382,2379436665071024,2387981834215976,2391376217204289,2391657757985839,2398709852888712,2400026944838468,2402208725828096,2409550806440554,2410681029165949,2412591449989948,2413992919514685,2416809210551017,2420810333651625,2426508439798144,2431689886658063,2434636409776451,2434817960891416,2439360431069834,2442924938559564,2444743697540746,2447704465950372,2451185988965285,2459766139292236,2466530448132713,2466909570912171,2468169126671752,2469990435969385,2472082629869597,2484069800626695,2484707593134371,2486433068244510,2486783619425529,2489988128759413,2497895029394563,2498928723235105,2502877897637973,2507836460937176,2516300104537043,2529686136078992,2531603179656151,2535379300081535,2540748246632844,2542081767873586,2543849372306944,2545009932051689,2547409441873208,2556506799873846,2556532058925046,2570487229611126,2573496573602154,257
3521798941261,2575271828359827,2583040424187016,2583468225494252,2583541506767529,2590282004204866,2594709561160407,2598736648640020,2600008000392449,2601526047213631,2604919231758350,2606628075888049,2607585442845824,2608738783833234,2611080508323464,2611611650962181,2618806127233677,2620375519634887,2623788431218018,2625753537877756,2626910805885551,2633023374568506,2641610347651893,2642503504311045,2651833968467605,2658287974480506,2661391357250546,2662023298318235,2665833107218149,2668521248016496,2671313026437821,2675525837460390,2678336928677512,2680497019271975,2693519224664396,2696615422431379,2697093257227995,2703119946699707,2707831053578465,2708218678481553,2715915477263655,2729224055534831,2731006551655845,2732055421730493,2734700729661142,2738859769218570,2742018183825055,2743081343023861,2743888467937942,2751785297738513,2757079557005164,2758720834995819,2758979243701204,2764250636697336,2766608515295278,2771029262532041,2771167327169082,2774929020087483,2787100655005297,2789215679189400,2794721456334777,2796704110243902,2822847943723684,2823690427545053,2824112773494385,2828383286324950,2832223063283424,2839310794637108,2839525055055156,2840131111024087,2845086895593857,2848124500620503,2850392763711528,2851615637093918,2852640851512226,2858440556030254,2859863407402383,2860695977896818,2868567544019176,2868722569049791,2875394822256464,2875768840498356,2876952463837377,2877071122530509,2881697295591721,2885101058817579,2888998206990875,2889740392149462,2892701240258741,2893195916828713,2893601424052339,2896177891093468,2903470979230250,2904033648694661,2905361464861211,2908444575023598,2911124480176230,2911599257101941,2921971427799899,2927511611173972,2932335002147816,2942563058582163,2946143480195981,2946208695315985,2947035710205399,2948810955001129,2950953756094034,2952664814352903,2956213777269798,2956539890231005,2958358510714643,2959335482526692,2959566715003402,2965352172439193,2965439340704221,2965855909473064,2966360689949309,29739783848002
23,2975093256580654,2975414282596751,2977316941548719,2977663445217111,2983750706789338,2984126693118897,2985749744661602,2993577870679042,2995240248615334,2997216190337734,2997475303842149,2998782348202460,3002144816290295,3004601117971759,3005445693257298,3005543539398257,3006825969228148,3007008399287583,3007436553703536,3016063581505292,3024440815482041,3029401793589254,3035917551240430,3037029199949908,3037243067032125,3049610843618123,3052077662817141,3055458832740035,3061176921317878,3072032250423585,3074112617890076,3076040588704705,3079164924470365,3083176156972821,3086123334924126,3089319907683113,3098540992604022,3101693309309556,3103506635288743,3110882600220192,3112903104807973,3120960716235347,3125993645599853,3126462642335525,3127239755462313,3127515727740291,3132674664095953,3136116654223153,3138943638252170,3140104823595207,3153220014750330,3158074449437715,3160234728942373,3164386809673569,3173783571944417,3175607440806275,3180308189083804,3184825950572980,3190439500089370,3191603569657769,3192369183577062,3193298901760522,3193346797759720,3194861056078031,3196931220104868,3201112500492023,3202793460581380,3219171897424954,3219575037594274,3224535741992415,3225652520990690,3226382984631204,3230186294385431,3243947082373306,3244627180010006,3248922800662151,3253156487699363,3255370232763973,3257337304537355,3265267353090335,3272928147712512,3273345746404244,3275046616104436,3275876017733599,3276867668269439,3278668472321042,3278714300330291,3280780868469494,3280794856819360,3285404600033524,3289047034753180,3294550813104021,3298089165637310,3299027911208090,3306709449273253,3309878495036042,3312036573217165,3320827905894255,3323347352904912,3332103278994362,3333652145199727,3335087116356546,3336332604419491,3346073604734971,3346508186700073,3362515004859132,3365198599599379,3365586794581106,3366181769304978,3367829027870594,3369243531861603,3377357612999215,3378502887959344,3379556656256325,3380377839647911,3380683064402177,3381845747007120,33827282
95376857,3383075891087465,3383302464154854,3383677243861212,3384640865212142,3390849838258698,3402608390039987,3404656276789459,3405149696809115,3405393044390619,3408568306290700,3409525642139599,3409848562939689,3415139398115166,3415475306791216,3417433407494643,3418693183078260,3419312829124670,3421667659970361,3425688845571110,3430483417325813,3431010040648861,3433786393292948,3433893111687059,3434270543355054,3439118720682675,3443405929340821,3443814552613298,3444188614792340,3444648483822568,3452335392026500,3454079754241547,3458841677994973,3459090134521778,3463857637926984,3466754010283942,3468005713457978,3468306935523998,3469621823753300,3484417521606056,3484887355924665,3485163641925480,3485345404315595,3485684351025169,3486271024140478,3486481363201290,3490218326835149,3493226173405941,3495557286227599,3501438911492802,3502652434217727,3509064061394091,3509072379429744,3523958917267613,3531208508664527,3532553615695946,3535903464263126,3539565874451621,3539901139312850,3540376200850317,3541349497756661,3544819360646120,3549947024705822,3556230614643794,3561873326260814,3567631654480233,3569439133907194,3569585416963919,3580260758329980,3584444323393668,3586259833614913,3598107344839577,3599693059706844,3600497750427469,3609452627397093,3611915615904413,3613882437854401,3624379228154857,3628580997551890,3632115466215379,3633516585258144,3636013168823101,3644503150656777,3645611623206895,3648590415100172,3650411848640853,3651208352182968,3659962641982274,3663160485730853,3665217012698891,3670531861702815,3678142299238288,3679412885698189,3679723249740163,3688280883691690,3688327457349314,3690688866820810,3705003329123112,3718621708258333,3722526932992524,3723634868396071,3726938239845979,3728715138302811,3734192362762123,3737946633507459,3740458701179796,3742193997053523,3746284516790765,3750533866251628,3752117756365521,3756936323992755,3757543406733882,3762486477132181,3762751701280063,3767814128506980,3773610790058654,3776595480654768,3776811730885528,37
79767620249001,3786173858770873,3794100680281451,3794855359477272,3797500278748845,3797572839534654,3801836630743327,3806047581097738,3809879441266392,3810799390411918,3821170295134013,3821630362687996,3822296640796331,3823175201590864,3825757657900286,3827560739565438,3831533818899493,3835198273307888,3835558444026950,3837899280987896,3841522241190425,3842446209291097,3856495945466312,3856632581492180,3860116438298861,3862694398476978,3863189469668600,3869850399187705,3871935007496414,3872628172162502,3873324719285632,3878833882038024,3880243619746497,3880529063199350,3882311402640088,3887523678289264,3893564413662650,3899793396857493,3908511683767038,3911082399615065,3917275362273600,3921777467979712,3925362829074370,3928994435189027,3929160579967105,3929583967036139,3934142879673460,3941494939757571,3941525993199884,3942048398609850,3949319172964121,3949404714704001,3951447621965404,3968940236457600,3972341462705556,3973437909773411,3977362201597748,3980515012130917,3982398317594569,3983919240708090,3991304962417620,3996232480056804,4001338100305267,4003000776821491,4010634521845832,4011194441900352,4012917626427041,4018474899910568,4020678940249116,4020893102640326,4023604165179706,4026092267698298,4026688740814878,4034351439123543,4036574510586483,4037588403031850,4039922936524250,4039932863104502,4040392164753436,4041139896587433,4044662871102224,4049703973786608,4051543922389363,4052681338148215,4055364557376134,4062261174287869,4065415697051189,4071633959541762,4079083296648701,4086687744224011,4094432968332287,4097518435638924,4102324633120593,4103314051061542,4114248458913135,4122077681924969,4123781140489537,4132460402529320,4134892291048521,4135928574382122,4138840172908252,4139231196428117,4143693188392502,4150745968454974,4151459023776703,4154752706236746,4156105039099431,4157256439982237,4157669782790617,4166107478621219,4167383773226728,4167492383925201,4168181927338698,4175996866082730,4176623816804364,4183402718643845,4188349160298046,4195677986920
473,4198198981311457,4199209071018538,4199346559716278,4199937026193161,4200142600556427,4201374728073667,4210305409366342,4211040452351221,4214771019264212,4225632034684502,4234174233250830,4245383804030219,4245913779845337,4253320341425011,4255227426589464,4257521149254292,4264631484544901,4266389306044662,4267533238822472,4275580175408244,4289359014707288,4295704004397925,4300982165488644,4301841948469269,4302934567016197,4305626904573311,4312242836327385,4316505852417381,4320671705521862,4320948899808113,4329077654347637,4330412836235513,4332389830901236,4332584479772575,4335310014667247,4337120565239620,4340454370272718,4345864924315697,4346218796838410,4352222894063447,4354953199641044,4356160430961353,4357880027267574,4367125746948875,4369068622168572,4371927952461526,4374872954793723,4377274169565988,4382694020890333,4383134213334340,4387047294147332,4388622108830575,4393722386432944,4394935128907327,4405185846773600,4409442890926800,4418995118878418,4419676464130546,4421125787216995,4425233123315500,4430853131113411,4433648774646017,4434408204046953,4435949176623047,4437545167361411,4438317402421127,4438817177523704,4443045313246981,4444824473102486,4445273578631201,4454575698901762,4458728897870062,4461351844352989,4462449521144694,4464228910638153,4467196354294999,4474000580782956,4474046785524256,4474574743148389,4481408122328948,4486507708773899,4495575342843287,4496415696378542,4501876131664304,4508459014863643,4517133177825796,4525521793503217,4528035909846301,4532427908015373,4535830530899372,4538230925800141,4539716842839588,4551316076842289,4552805172103424,4561703129830313,4562273139429756,4562889929649950,4572477816005275,4573883165195550,4577060828696911,4577457918883209,4577958025008691,4580199770736665,4582265498314074,4585370103466467,4595548152987374,4602434211109390,4607450449118254,4620004533537053,4621113398888425,4622580510893583,4630992980149087,4634141648884370,4640301477916105,4641399081470667,4641458089041250,4641794721319090,4641810
465552112,4645238665720809,4645621983164383,4646458337623997,4647556566493222,4651299169613798,4663702386772812,4665559789434328,4669236331860436,4671025914890237,4671062938394354,4676205735481526,4681240613933899,4683821965014649,4689342516749982,4690119446223188,4694128337468791,4696124657031960,4696193088102148,4696638008353613,4698423835133356,4699216184918082,4704195358103927,4708213524509388,4721521787903217,4723274946162868,4729163773640834,4730341942998122,4750145321126258,4751422453008817,4755912323473330,4756833761182793,4758803188341003,4762437486337017,4762834014218571,4778822146835476,4784620939372924,4784854530655115,4785601845773156,4786683724318639,4788364915970531,4791650559342688,4792800308786051,4793495469956659,4794337453617434,4797047238512497,4800291638880957,4804645672015140,4806827593856676,4811729290308862,4813583810073804,4817878202402319,4819210711953623,4819297775674748,4819311564829320,4822240770685261,4823251614359045,4825955485244615,4828732317464211,4833444690765931,4836894122787451,4838221388703602,4848960069162027,4856643780511233,4859353321888294,4859467776000605,4861078197128753,4868100857196342,4869271395674487,4875972042816124,4879638248944748,4881531428270387,4894597037736842,4900275904853327,4902345078498684,4907673099841830,4912787048821119,4914549573455980,4930034114903088,4938885956719683,4938984906671371,4952413633841153,4957117546097581,4962024566226233,4964966617138828,4966376651170584,4973778510774167,4975540053830624,4978106676024424,4986110732910751,4995563329299788,4996212995257738,4999915977157470,5011891458604349,5012633125949878,5017724733800167,5018157783395788,5018781410893851,5019357482030347,5021445876086138,5030617336717801,5033368364296409,5038003571725954,5045163363224076,5047944681561823,5054098670441464,5057486321357458,5057790328506277,5059239413878415,5059653728314562,5075357793289723,5078143579563766,5082075970958360,5086177235816634,5090517135844571,5095238786157913,5095997925642684,5096759450835327,5
097502836207144,5101214857653244,5102026127818781,5102867437873560,5108455108876502,5109339895416818,5112547786374962,5113517669186741,5113987619419017,5125513112408495,5127124574162351,5127331696245969,5128951178677788,5132612621833970,5143420500944709,5145276514713692,5146830226631178,5148881101936222,5149206177910233,5149437337079666,5149863467137139,5150315306295015,5150426384165948,5152031727525643,5154406455748760,5156461422732999,5157379504662047,5161207050871469,5161248283909416,5164496015188591,5169183503623442,5170033828188437,5172451986737288,5173899903805393,5174341244024506,5174660761082943,5178681082547978,5185496711050665,5199397406461572,5199519828192191,5216652140931560,5222047064350262,5222282340592980,5225960701910860,5230003976759540,5231985318055496,5236341345649495,5238635836185856,5239269458643567,5242274139089145,5247586236105385,5251993594243967,5256157883002967,5260023793294245,5262424254200249,5262515077905251,5265232429826960,5273272937856228,5277050637122870,5284015351506042,5291703312055669,5298034705538719,5302053963299700,5304711911200062,5313230992694743,5314446308863251,5316323217920338,5318153535798629,5319128074583642,5326372158895078,5327554389775897,5328891577748554,5333028765846132,5333952601578012,5334935043856488,5335020091722251,5340420023836909,5345704507186657,5349788779068053,5351104001242138,5358493250319346,5361787265106398,5364846600059577,5367133911213099,5367718245082904,5370226527358712,5372175647130534,5375274967884629,5379968233240165,5380736206240325,5381546956953785,5383368125410553,5385746119086850,5386810090653851,5388894770243354,5390748522455977,5394080989067132,5396868798166091,5397913443318485,5398598214938688,5400291293650351,5403110884625308,5406614253824595,5408167860716448,5408684428107176,5410632742040879,5412454472585029,5415609710834006,5419089032931253,5425776155703968,5426343929555151,5430164168254949,5433304691996431,5435448562589939,5440154526785082,5442550985028476,5443519792343783,544469018308
1885,5450650932615338,5455725933689601,5458279931832152,5465378018390073,5465468293521107,5470523383371739,5470969344336539,5475821882378207,5477177690474931,5482250465546522,5484161477936466,5489284312856062,5489369658364069,5490569127868792,5491063328575234,5493319803023083,5493870604473986,5494363692102912,5495499148937672,5503103710464614,5503513389639664,5507615794296947,5508503680735494,5517653225284855,5521521718617004,5525271231853414,5525784499514800,5536775761553566,5543069752424250,5544364757593219,5545802799820933,5551558523406896,5554380825386759,5557919753549640,5559553479542169,5560825696398560,5570645491117548,5571611222021582,5575596617449768,5579977210132164,5592415623299140,5593426453232836,5595805533420086,5602700450924399,5603538322319793,5611751716804501,5613862108602356,5614909593083489,5620948070768841,5621064494334548,5624980176214575,5626090376879123,5630820724993122,5631339882742194,5637312677638017,5642756368786070,5646619733332345,5653701086020480,5656741558549497,5658909534123937,5663092617828839,5663224746690969,5663385121189069,5667246937392321,5680063203437155,5685769039380624,5697986357589265,5699319238678867,5710195316669727,5710364792755140,5712513634730123,5714049549411876,5717070125530008,5717707514404603,5718933939076632,5719318350374149,5722219703153811,5737673156168293,5740477961003848,5743439881123553,5746087812806105,5757334188672719,5759061891073762,5759605430401092,5760917932621246,5762846753375421,5768847482599071,5775786807466214,5778465954807704,5779727192598359,5782454578043069,5784207630429531,5787478509200132,5795310516601245,5799653338299710,5812168700056249,5813405909746709,5814659226632216,5815769481430106,5816254259189301,5817139504392687,5818962864882161,5820855454510934,5828565941868585,5832544070159164,5838532420889856,5842761102579491,5846508833190611,5848249740129972,5854937355756548,5856022939536906,5858993021903933,5870645665739004,5876393418760803,5876736261749158,5877001606837482,5879985416123329,588544
4131631387,5891024028896298,5894685367317522,5898490390630341,5899953571983439,5903023130181011,5903484378223350,5903666264650934,5903694388453077,5905923307105350,5908696090042283,5913584686996270,5915859656997442,5915879684950182,5916931928833529,5919430186483428,5924117520799744,5924879546681533,5932662790863585,5935312555770245,5935661430493388,5937817448042430,5947101870703205,5950972051903959,5951041949791953,5951726048170658,5972218700939363,5973250287141740,5973677832178283,5977031849078959,5990178361033769,5990570628261118,6001206595929325,6002276243709531,6006751473613523,6006897308518556,6011214467146689,6016555602340361,6023708485790729,6027671084996298,6028501758004444,6029374529344504,6030811056861793,6032097611334280,6035955206980948,6036393255326821,6043316342842623,6045838883225433,6052608603431831,6053804686685524,6053958863313339,6055268730727559,6056807484550007,6058584581621317,6059626128614092,6070104051324693,6083108464119662,6085590822787717,6090956327042002,6094860440146813,6095038202403332,6096229123772452,6097073203191647,6103891026698555,6104688884599994,6107758160260988,6122246130680759,6124004526649147,6125907283269891,6128284886205816,6132924004245246,6137419131615699,6137499190958511,6139205116246240,6142105213155394,6142688368084569,6142913125046520,6144042650109511,6144095475272536,6148853279972165,6157729055897116,6158322788243107,6160835787414693,6164759784469239,6167018513092476,6167607991565253,6171707826744820,6174144459043731,6177490881631784,6177692002705307,6180599532722384,6180617553423208,6181251707047392,6182491554240294,6190932940940803,6192358086666249,6194877677308546,6198857031919734,6200081358996517,6202082397019416,6205024632916134,6207302580763824,6211423761058583,6215284878845774,6215440151333948,6216154454451807,6218233647565127,6218512470330013,6220604711061324,6227071256308643,6233498546819624,6236798857143890,6243407719027967,6244874563473516,6245142031174040,6246838555534920,6249074081912395,6249758679502616,
6250557402527662,6251868356478317,6256321719332215,6261686455216337,6264775741996340,6270623030455246,6274969421000503,6280006843403268,6281149158892909,6296961749764111,6300447716613718,6304297838290460,6306042178881559,6307115079760207,6312245698368186,6322233760419356,6323173079753451,6324812402453371,6335023550154225,6353191312166053,6354692547077099,6357396963255601,6358389478584958,6359135369542064,6368768790664617,6371090203570186,6372329995436972,6373927709843793,6374693910197545,6379348598089064,6381541745622015,6385800661078082,6388937996347478,6390863793314128,6393248753656632,6407210526643342,6408730838629478,6417566655445436,6423382525818443,6425609470770570,6436083078959781,6436814879261326,6442916426075876,6445879733828998,6454320146259766,6454656458940756,6458237633544796,6459246249643680,6462957099049843,6466065562806689,6471689417982028,6482532068799143,6489722108699832,6490299073029283,6494216993344366,6499734594021057,6503186772282900,6504859169508928,6507830449842499,6510975929277435,6512429809706119,6516226128433302,6523391982496272,6534435207304569,6546595430555691,6546752598032205,6558599467268843,6559912537272461,6565025138801153,6568196183285198,6571097539012216,6578237361036784,6578503095620162,6585680325396501,6587654701564589,6595708386553174,6608001737567202,6616149821789620,6618251453481948,6618435915029628,6618676721046335,6621057137514784,6622760768363465,6623916460399609,6627687708278604,6627783782930560,6630093742634040,6633679848040650,6641201043777485,6643521507427133,6645310761621776,6648932781635955,6662203303202718,6668468822319197,6674364905825227,6678173806211118,6678617623247346,6682614209582006,6684865188797781,6688488067507556,6692574202860978,6699342662745446,6699601051457480,6699624106317412,6704989683409671,6712177752369021,6720191931854899,6721985110519865,6728831533279314,6729387689420763,6740158910360226,6740782983077394,6741215022206694,6742696713154438,6745753170262651,6747208404406959,6747763101536606,67497539019
39374,6752595765235073,6754613815657166,6755291783706242,6759785636657525,6760588036724092,6771281046319400,6778747192742624,6785714707329795,6788002061659117,6788203480253865,6793843162054529,6794299705676629,6801783177540135,6803312883106461,6805314972544938,6818759912474922,6820988811787355,6823062398472026,6826362904176932,6827299283549200,6828195113056916,6828673134192974,6830256824394694,6832757900847449,6839800914177389,6844288668932623,6844336660738179,6845320561422543,6867967645790541,6871074917650344,6871762569059759,6876239502460282,6878773843563416,6885815333874594,6888484602467342,6898143968669196,6900004638975216,6905914351227176,6910834673790415,6911812457892129,6914055104334094,6919747802868531,6923486337221090,6925667026147338,6933453950416148,6938107292471175,6938223955509899,6941024220883212,6942033581849643,6943458456583346,6945905024717815,6958904824641253,6965104736708795,6966317419189633,6967131174911621,6967523404032078,6968626913917078,6969723087551023,6969879544962764,6973244897917836,6976680325243925,6979025352783650,6979370520679168,6980494356161697,6984882368689931,6991429378659097,6993040182965693,6995106444038232,7013532370262403,7016766698233684,7021458253363648,7024829388289761,7037699445900528,7038117440889701,7047751781972594,7068328396622979,7078519684380004,7079136072514786,7079683617185872,7080593167563472,7081754833089042,7090263369022454,7090827224293435,7098844983687608,7103287349340715,7106775558992670,7107670335907433,7112480093601650,7115518871031565,7117602179959460,7117702213846209,7121896123546063,7132927597129919,7134741662059672,7138333125007121,7140267571474431,7142051916052761,7151557370156782,7151630939274842,7152477932457527,7159009930103737,7163383528300707,7163778957994361,7164516813412020,7165245447295611,7179002153816766,7179903040144596,7182233643190269,7182765199375506,7184665213216156,7186047534415508,7191096990307504,7192853768347715,7193706102549630,7194390208322086,7203674420028153,7205356147061987,72054
40304046385,7212278254469221,7213120783173544,7222827358840359,7224545604825492,7225295067327053,7228437063271890,7229216536358274,7229486634256081,7232125512735235,7235403866067319,7239097855225524,7242588962722849,7244564678730935,7244768482327881,7245566857344055,7246915842148466,7248035515958267,7248740619006619,7261781244132035,7262289934411894,7266821489213820,7276326602211605,7277092712154010,7277675196574359,7279500143289769,7285678364340995,7286552397490444,7289711990915466,7291166276647549,7291878183300562,7293014405240845,7293120299138230,7296021783720685,7302877510804560,7305136552146877,7308326077425136,7309526818453637,7312457538294268,7316330078533123,7326575126577204,7328477651553143,7334460645181314,7338010137201066,7344453486207090,7348071649723458,7351715416841733,7354010904401429,7358759012302734,7360258178944323,7363914809349690,7368773239687673,7368990891984284,7369067702745463,7370610419112862,7372069375963896,7372949260070506,7378192097899948,7384234111099756,7386045769570324,7390506088312346,7390710096067356,7398773968230003,7399651877743543,7407156702916619,7407505476310295,7412075044877324,7413103855739244,7416878995770443,7417889495517890,7417992892826458,7418488700207131,7425782918083640,7426542400535424,7427800180129173,7428977237077501,7430512069482015,7435984239692232,7436873913790602,7437474890639138,7443107233720726,7444471858317499,7445843328639391,7447921740402688,7457626872755122,7461417070760230,7462663569101078,7463631301411392,7464791273301861,7465698531247575,7465989092511044,7468955978108988,7470414970130956,7470691742535496,7472016146671178,7475500532623013,7476427212005998,7476626636382557,7490128336689868,7496125413626459,7496331600674721,7496586171914845,7497334675461455,7498030447626019,7498854721985035,7499741802632764,7501028258951772,7507971309059323,7513140878034397,7520995594935128,7522857939238615,7523568386509127,7524079580225271,7531837667095752,7535413758699104,7541987648743096,7547834066144263,7549509942679840
,7558348891266390,7562422547591358,7567376160342247,7570266295972391,7570701292061586,7574807836993530,7575973317107614,7577088488345910,7577833848718658,7578638579275105,7580092247778121,7583079410262922,7583396165831142,7583764379769101,7590281297167973,7594145655959215,7596032551054600,7597146399455249,7598973979549112,7607301189042080,7609709905585175,7610703887034738,7612217782008852,7612590563938043,7614314828758987,7626611098574312,7632486697445093,7635361684630569,7635781595842092,7637503540227229,7652643882172024,7654546417001702,7658477424630193,7658594690166121,7658756342775449,7659221163244052,7662988662313759,7670573068797315,7670620641239355,7672952384552853,7688048940346303,7689154854909634,7692157387739297,7692684970839720,7692715035163949,7692872154514462,7699490741028377,7700977357734436,7702985130021974,7704411344416172,7708512355032444,7710242527125752,7713386097094125,7716490464557575,7719367164738556,7721077927518745,7721609917312402,7724275439905728,7724711616367069,7727393378005013,7727688737658572,7728376014820122,7729310710317073,7734200858227759,7737983217239889,7738701740734266,7739863878221100,7748491852382750,7761354279867729,7764178055458929,7766279931264675,7771590301023685,7774873548808587,7775101063842022,7779960077213875,7786006720140585,7786623092023665,7788620682842041,7789614831618262,7790698030398764,7794053661517357,7797582785345941,7799750768370576,7800931705061909,7806696954752845,7807627836703959,7809275090934984,7810562577473506,7810998696754018,7812875292301653,7813351823453750,7815570269509758,7818433714447086,7818985241100641,7820811550237674,7826881780965015,7841588711354365,7852366441407035,7855477820333867,7858337898821330,7860940256961100,7862242994456228,7868991532284921,7875449163603356,7877526742599223,7884811158802680,7885098411306234,7885848475582109,7888074400901569,7888437547783278,7888500835141269,7899000176906699,7900170751256211,7900621082983922,7905781741383824,7915002755515393,7916680314419175,7929090546
906102,7930649670973791,7931707661135896,7932664861586881,7936886244974173,7944913299381989,7945929831888496,7947631560228941,7950990573236144,7954932398971022,7955707767338343,7957474102755475,7959145831679961,7961836057698682,7965048882911994,7965254194340254,7969662040440070,7980381451760704,7980587376998310,7980735420462798,7985722197062386,7987424295175338,7995443309794354,7997135431578468,8006897017337402,8006976419849250,8011722895901690,8020262137556052,8021771731265660,8022639767891247,8027888930993228,8028030091531706,8031622641498868,8032268170160689,8036544756403250,8042606792528101,8059389868173903,8059450627794768,8065220473196449,8065582408992410,8066856250710691,8071865664728388,8077418220136860,8082463447862295,8082820682426435,8089588938475723,8095523873680122,8097390569526238,8101747032613392,8105559582340911,8106928489710751,8111834837966998,8112104031271206,8114237531654532,8127670090034835,8129591252255075,8130207971385588,8136369282029634,8142129094140152,8142634926688230,8146423080936368,8147298843986804,8152440707511793,8154879940866529,8161574196354384,8170876711488279,8173481093695052,8174511269651377,8178398099602601,8180578302229082,8181586463868245,8192542128082353,8194052295501546,8194969512324171,8201561228078010,8205287009351885,8213078648151372,8214059341854817,8216440810041214,8217581100021235,8234783421929082,8243069565420263,8246362229814725,8246571670186657,8246716772071880,8249298655984594,8250387434460343,8251262500438279,8253292038013840,8255193598573136,8256150707384279,8262954804685181,8264880757955296,8269769863632137,8271331264772547,8279296087284122,8279996819873290,8282488428172923,8284234491989688,8289349890472704,8289498162240419,8302834672979549,8303400292309646,8309633984147819,8318696076968435,8322561813637125,8324890577048813,8325237800434916,8325244982386600,8325938551251775,8326972902154084,8332190045511090,8333273920187213,8333295411468735,8336642144860625,8336794237476945,8340676844945415,8342359556228636,8343
138042400877,8343422431629489,8348222101988490,8348990845348274,8349203373840720,8349929839053237,8353187069342082,8355184882068744,8355216613777188,8357480319128064,8358858858741708,8359032487697274,8359937481115835,8365483157606045,8369163766277845,8369986014094300,8370512453717737,8372233568621885,8372801112194483,8378780774433401,8384716609191558,8390262752827048,8391964943990114,8391968117889949,8392149294258272,8397902100674760,8401772412872195,8408375889672914,8411174366535457,8415585315618209,8416272151484023,8422456048756653,8425746898298470,8430090272302554,8437064789335853,8438071844029923,8440380356686069,8442951603563362,8444597753784738,8451164588916518,8452143133964194,8452834615819362,8457432590283505,8457870432601145,8460876401008247,8462195561662212,8462356309409364,8462779787380437,8463894151036866,8468076223787436,8469301843796323,8471378426717330,8471415745047933,8471563440277386,8472404382970836,8478129640091772,8478656700478130,8492172722223897,8498566054392581,8499229793648454,8506577707409417,8509132075144310,8512769287132428,8513163154527267,8513504634548185,8515344063519634,8520471745574237,8522204087666054,8524148859687867,8526397555774454,8529230741888138,8532559561743484,8532654039076026,8533878116076637,8533995416155386,8534972628296620,8535864863010513,8535971593214147,8536095899242944,8541155088407770,8542730575053594,8543114315775141,8544314610710198,8550168506511268,8552407797606671,8554633315621859,8555174066389539,8557956937889635,8558151645997922,8564419827076679,8573037870741832,8576579691621984,8581367096989732,8582261197731227,8584103263448365,8586129879796676,8588390118640027,8598659143101208,8599129846364328,8601522207106423,8602281558256184,8606118628054373,8609962853072397,8614121798757181,8617181526340432,8624540329478798,8626391389132139,8637210920266035,8642764308475012,8643781560678532,8649648448925106,8654788098657391,8656720607859172,8663480084138334,8668647436071212,8676906171988104,8681093783245916,868165757901230
1,8681913385441783,8684040140251633,8684751768894594,8687233995374161,8689451052146568,8694259890202010,8707543506731647,8712178326749055,8713167337349649,8720400968274939,8721624366660549,8723528441052177,8728082879135077,8729412146383037,8730722008863330,8734134339123262,8744943197261790,8752084776243496,8752229489182462,8756024741015291,8764086372954576,8765499574275666,8771763736993583,8774891003290900,8776098695299089,8781771689282057,8786064399236492,8788645315265601,8789521527155337,8800564096781294,8802869383602903,8804388443116965,8822012246032459,8824420613904824,8833617601200992,8839028706384124,8845997843748149,8848441586602708,8851257701148635,8853468904397509,8861373802714789,8863519477739503,8864760219124956,8867353885504610,8868704959256862,8869726068715730,8873178736038157,8877414428990774,8877639105413482,8888927845730161,8893272613048649,8895434307439131,8895481679956001,8896675958607703,8906632086442921,8909764344727962,8910696373745749,8914511413857104,8917771334029599,8922546229199542,8935766685372472,8941687890115326,8946727890515135,8953024300985451,8953097697609406,8973117667284369,8976689351398218,8979052050754475,8979138566806786,8981576765955511,8984458105294828,8989003157868761,8998231894652099,9003633319081270,9007488636990326,9014303703878842,9017342455231486,9017775639960192,9017891402481729,9021516015590569,9021894799811309,9026119584500749,9032187371857715,9032210357540524,9037381687162545,9038550674961458,9043118071475817,9045075196494312,9048587505711843,9051912387029910,9056929643995395,9060898928960346,9062131144374189,9064531090448266,9067993556866039,9069591997036415,9073037543243418,9073542014929237,9074741782974562,9074896498908993,9074954160428210,9079318768294079,9079888527353695,9086751601932227,9090741520266450,9091979750057327,9094268438464734,9095103196939357,9107447274695798,9108055709626815,9118451715638510,9125362263593744,9126355422536516,9129589011938145,9141233415372106,9145865355932391,9146655579064944,915092410
6375181,9154695178495413,9154906598387783,9156606782616571,9157375841856013,9158676476944194,9162668962659557,9164560080137859,9164808695055580,9165398869818013,9167897025495661,9169656716086699,9170930061820260,9173729001963953,9180092141515547,9180835981759675,9181392291811972,9182988335016139,9191103445945706,9191895644403376,9192998929861767,9211473363037424,9213354462894451,9216455003394783,9218987363983060,9220215058481045,9224473925896720,9230580606053228,9230781676144500,9233097697027556,9240969247386165,9241853489961590,9244558133126668,9247060266595808,9249469378126723,9262046728311117,9263681010079025,9271020411926998,9281037194537819,9285141224306324,9286405859692622,9287536154919442,9292441630319582,9305483767431545,9307808415233512,9310504704434237,9312863640688829,9321928567260274,9335430691435020,9349870750958121,9351366438163042,9352141537262568,9355129827150685,9355940792017453,9356671043680573,9358554525655764,9365839127076720,9369093634111987,9375641216209833,9380627712808545,9391486822118482,9394976106188444,9397328261168293,9403765537033865,9404565557844940,9408921495622508,9418107590281736,9419050885946372,9422324913293330,9423486081818391,9424880153466631,9431002060282822,9431492779408494,9435750849137196,9435999454995538,9436467803114520,9438440967152456,9449047384009255,9449340008831532,9449825490510575,9450124549058915,9450266223912705,9451923415826938,9453754181386129,9455474915077424,9461773699588349,9474415802799799,9475829378737099,9475995871994700,9481386469401705,9484371784536628,9484984344438553,9488440660460028,9494545505111169,9499991472364593,9500280517945270,9506279993556395,9512690141526979,9517376601859417,9535833980442651,9538837225746487,9543648898717657,9547557360432895,9566613425099599,9570293654144253,9571330378157840,9573924489710217,9575165150956720,9575447421008044,9575885834850839,9580743881484339,9583968797309156,9584561427927733,9592438802141140,9595887445842995,9602902468421116,9606089004061910,9612716915528679,961
5883008659902,9618201274077735,9624534057788017,9624758856235873,9627779818273370,9630291603264391,9635050901818979,9638553113056923,9640256664119083,9641362208707358,9645470246174746,9654460153869349,9655820024254396,9657079142409314,9659932051785010,9662025225596157,9667241625101916,9667276386502943,9670575633086786,9672577363633156,9677328620414229,9678605646922309,9680945785313483,9681067738687093,9681267064784409,9684623279089548,9692893983377486,9703534916274044,9704279295656371,9710300281092436,9713887926847430,9722172985687510,9725792147964753,9730102559873491,9731503574801034,9732040155273334,9734113097684625,9745464335501377,9759300027118446,9765786817226776,9774084603677009,9775954892341561,9781535106471005,9782799358625264,9787218473221709,9788457577027091,9790881050593378,9791716416162907,9793594547272040,9797573432707706,9799889278686304,9800030295823990,9800643300496312,9801531586161304,9806868721015896,9810257422545913,9810585298035378,9818968963003545,9827735758375406,9830504103227308,9839088699938530,9840344155855293,9842146939578796,9847076554159479,9849134001046702,9852441706161363,9855206514878035,9858589834827721,9861618221073016,9868175191633702,9870764755744667,9884359216488201,9885568741169094,9888383442614567,9889475178157085,9897712283656982,9905482398851907,9905509887050233,9906275973443018,9914537510086498,9916047888280925,9925919096295581,9927737657495008,9929573922212347,9929764483111561,9936648632787612,9936930189944836,9942546535848415,9943414292728350,9943562503792795,9943864011523125,9944926015830475,9949019217971450,9954313852436822,9956729180121161,9957485611486827,9959245907790498,9959649353025290,9963611887803061,9965461136179694,9971538903925806,9972659184707003,9977300414299533,9978079501560780,9984981814399679,9987074046553413,9988183784680681,9989307766906661,9992489863586669,9994394125866496,9999648366321789,10000031604968879,10002844747369936,10004925495910091,10008184940193864,10008461597296659,10010370056943249,10012185
312140615,10013671898596744,10015492885655362,10023685800384052,10024272312740748,10024518507785908,10032518948661767,10035990167276835,10037675774604732,10038212235930891,10039680011817758,10040986745269183,10043143509796825,10059920635183403,10060828780412828,10063209359600748,10063600315370974,10065868826751899,10069449569085214,10079621948289706,10079780537607131,10083178186730993,10083835921342635,10087308692864238,10097389948237264,10099505447983235,10114762120457964,10122602940701424,10127093259012746,10137337268168058,10147088443674066,10154032807160608,10154307535853042,10155542068394802,10160572363193486,10168672086600643,10171123087736692,10175620188550308,10176491974309391,10180241838937158,10185718177412770,10194722954029343,10196461247193047,10198150880968885,10199671620517651,10199695570402626,10213139479075307,10213463533232206,10216483252367678,10217695934166737,10224107621925080,10224341861439251,10225134693123796,10228698111458386,10232055143615071,10240074124610978,10243488805517527,10246709309375903,10250309444905037,10255002992865388,10263727029912731,10264525977886481,10266632870176478,10266820110531598,10268623122698602,10270163349933746,10271469102327032,10272466256560721,10275743124733814,10276224552844118,10279570843072650,10280390537061924,10281974844357620,10296558578829119,10297624439979852,10300473582041388,10303678734793518,10306671241348195,10308330126619188,10309932015401873,10313330243851850,10322521592618940,10324994061564475,10327639436128444,10327836487278240,10328062775184436,10333141638123601,10334247661646447,10336137590649056,10338920736989409,10338954267321910,10343961504376443,10344652626799903,10345084432193915,10349133099508569,10353863721850444,10359698556732429,10362411781950181,10363323817948865,10363856866900979,10364382029087166,10372218357405989,10378769233943499,10383366177443313,10389930038194229,10391227460972667,10392794184418963,10398456973956573,10400718449268838,10401822723036473,10402496249924470,1040513806
1495149,10414467140029313,10416168489938655,10418627906817111,10419717389280422,10420088526584073,10427907045176189,10429449856647102,10429527060520295,10435742830395507,10437768643316041,10438557857523653,10438647586807590,10440006848965726,10440487445875223,10440675814381911,10440838933113584,10442812714871140,10444872472907940,10444918775580154,10446053077534203,10447329823526441,10447915458986158,10449478084214979,10452337317481683,10455776599349428,10466519528405133,10467269725152768,10469454767792558,10470445288089220,10475593353115645,10479056751944619,10481951551463793,10482250308230673,10483827694271696,10488747371854077,10500385026427457,10500488552967312,10501260356111404,10512322942799654,10523297170953308,10524197477213077,10526510768148749,10536793015345856,10543343012752730,10545121874221728,10548308070444431,10548369311321575,10552704438057457,10563044468102777,10564172672934646,10569936885011978,10570501131760442,10572007743658085,10573848693130061,10576973213646289,10578269597246868,10583112117834044,10584639348411680,10594696458041961,10595951863277111,10608578244765171,10614584966626439,10620592192554759,10620798042888920,10620903315872377,10621082248937070,10628643234912171,10628921785417252,10629062237330307,10633505584587868,10635387052790493,10636120887409717,10646006703435010,10647477117223054,10650028403614625,10651121348848499,10651345084987832,10654625803270603,10658173942822862,10662504984834470,10666212884489150,10674748481093869,10682941113641784,10687323304253849,10688559968805974,10691866174146243,10694995464922729,10695927691488062,10696210215422820,10699195330093884,10706304398299584,10706909043141392,10711714669403874,10718007421668166,10729128988836847,10731904283058389,10739494197398891,10739801112151611,10754362437493971,10756057668895469,10760825923948907,10766441205077631,10772046172473065,10774595446367799,10784226394558794,10788763405851951,10789995657189753,10791861050542069,10802291531494117,10808364573742279,108188980595
92134,10824560278958441,10827594971106799,10829139547484424,10839803231136688,10840559858292177,10842762916923867,10845347885148728,10846960440347385,10847253104607377,10848123266416937,10852276831524456,10854428106437565,10861241406654155,10862767928581833,10885189706501902,10885610118151807,10887994298217623,10889705602128169,10889922295232389,10891169429104356,10894930198422418,10898755272190895,10899505158719250,10908847810397854,10913442039163653,10914509218958331,10917306751752594,10921844952698928,10923207650640427,10926914968716545,10928023009400926,10929092524133184,10932202328627689,10936661551264740,10948651072438721,10951948612195804,10957392874098981,10962850107056207,10965247315180552,10967958009290469,10972909371352109,10973378500551496,10974341394948959,10975117317621830,10984973432275198,10987375778918246,10989202769093484,10990409005986868,10991558669034873,10992487444525627,10993385229040135,10995462961190157,10997692366332970,10998901653014457,11001665093098228,11008808435120331,11014427382218185,11019805073365185,11022642525947194,11023763375301913,11024320739845292,11024882796236184,11026937591363346,11027888013301225,11029825398794377,11038025471456068,11042536781194006,11054833551017746,11056592382175061,11060604597813501,11062402375665545,11065372708374227,11067445870108559,11069851171800453,11078922436292795,11080633936367688,11084732592274012,11086682643753664,11090552698103596,11090658585225857,11093188409391605,11095383304270179,11097675961341227,11099369753166044,11099551546210604,11106506387649741,11107790185802013,11123159825527527,11127177251318180,11129601052279134,11129859357638708,11135583018648230,11140988948695005,11145709417931599,11150199869893669,11151387183049239,11151502397313142,11154110766156613,11155371761778170,11157295858466191,11158629464450723,11159414148224323,11164777782395177,11169430954279703,11175202072568888,11175852637265303,11182614329697619,11184082345585609,11189705813167959,11200002087496543,11206266985990
044,11212811284757912,11221071832453748,11221944462355990,11226519618035312,11228541607320093,11242771850103375,11246366304663746,11247749010895948,11248516680815322,11253108954825869,11254820718151050,11259273102107656,11261264491503353,11265599260848364,11266044641422248,11273050277652437,11280555107389759,11282775695475018,11287600375177114,11287893566206082,11289098740261755,11294033441021320,11294773174907559,11296245208210814,11296470298526751,11305713023756983,11314863622778589,11315258511556799,11318554733251064,11318702654300182,11319884533641300,11327586528497295,11327962158491035,11328275638112363,11330273278512837,11332552015667182,11332697629166595,11339380783297842,11340864658831256,11341499658993269,11345417090314925,11345675657880008,11345676553858105,11347392098829511,11350409656754127,11351342558936179,11356129989349558,11357460555425130,11358086643159577,11364648346737734,11364824056279735,11375384258325108,11383750361347134,11385168267308423,11389085993650118,11390806798887033,11397120253984795,11401700519195899,11407990305495344,11409131295056209,11409539605925674,11412775300232418,11415931058237717,11417029279499404,11418837401792416,11419050457279467,11425460679750797,11425662918976551,11425674126078597,11426226096713572,11428009680185526,11428645865887160,11438749233694215,11438964462131562,11439459859045383,11441352104880750,11449215149184077,11451628260579985,11452907797520228,11454513203644820,11456881578527630,11457346667741182,11459692638425608,11461197052190285,11462322406268747,11463847343590781,11467023328231824,11475621626604237,11478518643790981,11479308399516360,11480479501736039,11481618683495992,11482045627548411,11484846169745779,11495135828909763,11495940916242717,11496597527405109,11501735487559917,11504102260549809,11519358680969077,11520326123589592,11524872224376445,11528838411980010,11529868184218561,11530775986224529,11532404505521434,11540894335577353,11542654016596156,11544013273650036,11549882026188551,1155534598599257
7,11557042562885581,11560241639867312,11564144512113061,11568498682605843,11568678400586201,11571509198989020,11578506049932280,11579799804334255,11580137502987103,11581727638901896,11582679596601249,11583117656203215,11583507699894777,11585101265495681,11591047076552913,11597838643220140,11599758357243289,11602974466808884,11606641527777687,11611075089640896,11611748354430947,11612441214023420,11614076397936832,11617014574952198,11619486235348603,11620240919983985,11630644392646970,11635654976144153,11647860207129349,11653989895623444,11654096880041734,11657305088193972,11658683525550614,11664834793123518,11668903396015121,11672731802674267,11675090294053441,11675693720283985,11677165015385087,11678549904666632,11682179864909511,11684720239124533,11686779979541908,11687293075071490,11688194680736119,11688237614883154,11692359880827672,11693102982874445,11698756388262452,11706304165327067,11708423865084678,11713690607980814,11715198530711638,11715865254416537,11719221642455460,11720785116417048,11720873374212079,11724825796758899,11730215861616220,11730467845423531,11731164734577661,11732952721643258,11734286258484747,11738267971902102,11741745751488248,11741795538258227,11742706162990999,11752793473682764,11753431031818860,11760357200965436,11763018637241008,11764356908364626,11764874035248246,11765704631176088,11770706878359270,11776764748856507,11776942132964186,11777247958775743,11779851101536854,11780837550785665,11784146238668811,11785019194973166,11785179977172715,11786257969749074,11788283962253789,11790027713938832,11794754868481503,11801008269183730,11805477304749933,11805878919584950,11806503727533719,11807259505447243,11807669581624359,11808975572599296,11811220919611763,11811523671191877,11812158072536409,11823032149508223,11832905268834899,11837866263209531,11840295016246440,11840740382069149,11851393982879753,11852586198115184,11853010088136902,11863824190392357,11865542872711201,11867361302652812,11867622238392053,11867650483762221,11868412894656304,
11870483176868986,11898179901887368,11902610012478374,11905938593275593,11909747036535415,11913133672980273,11913579970744326,11915166616058954,11917437865006221,11921035710134156,11923308867485120,11926548936220126,11926805367659349,11927305572643559,11933874408252082,11937995919116415,11939403507435534,11941423373859996,11943720753021897,11944383058348300,11944816331130113,11947591791853532,11949017351281018,11950260926652836,11950767340516423,11952970464557834,11957777053959725,11958250094792465,11958326503192345,11958460182520420,11971876905384523,11975100060815344,11982172621189391,11983457002283111,12003675199241448,12005848174188578,12006522182584979,12008453650739367,12010943352799406,12015639579945326,12017040715051459,12017280196275791,12020813104002531,12021776657102177,12028404727423728,12031148585055714,12035410219539172,12038634615780325,12038984699485340,12044509607056046,12044968098240593,12048218812219413,12048806622801494,12058410162198406,12063913410743326,12064775457651800,12066617975736949,12068867551957576,12071821708133242,12078782315854672,12082781112646069,12085458622840725,12093768254366446,12100943786761249,12103342077182428,12107118790447412,12110485781570692,12116763440852785,12120377787831548,12125020327675301,12128081806219839,12130816680520314,12135336896340352,12142933439996132,12151932314546584,12155534792479002,12157910100394424,12160896977780321,12177544020169884,12179235809448305,12180040689184058,12180082671639635,12186122156905100,12191783234987303,12192518732233721,12194188433406465,12200556590308310,12206961056205598,12213487168281093,12216273376124566,12217037909052392,12226080030988795,12245236639338607,12247196809844638,12248146350462714,12262311222789758,12270437462627588,12270468054721839,12272177606302533,12272459725983251,12278383271194356,12278465206346582,12283300256073245,12288007619457690,12291976925956833,12293363361122131,12309693411990179,12312452990243375,12313776804764473,12315740660505432,12317737462653854,12
318972117511665,12321231790044265,12331039831738345,12331300696945968,12332506101262263,12334476628613713,12334531906613812,12339633893332240,12340540257772691,12345475198663391,12347889475047477,12347909851317459,12360123775491680,12365750013844755,12370916479710467,12371561019916631,12371619531441513,12373603008943996,12376979801050250,12383499878769639,12385506900879288,12386587160860313,12388152182711242,12390527382979238,12399457458248220,12399946676042065,12400047520221171,12403402985696867,12414844391454468,12418785272430115,12419850567303984,12423854238100384,12427118539066615,12438014718231534,12442226783163244,12453429222313446,12454152316786214,12456127570195279,12456974565702299,12462037008975433,12466553139749819,12468960135907940,12474505023309244,12474737975133205,12475509972236592,12482618753698829,12484792688815182,12493121285865448,12493531949000940,12495098079276198,12496980807002515,12502404720988619,12505394079431787,12516465480475702,12520510172302125,12524882028709030,12534442305927340,12537207717978996,12537660041547670,12544032455101983,12547299118310242,12548400145943534,12549910268086697,12559911253268655,12561212829273378,12562621625488566,12567396522160703,12568269490941488,12574953991866005,12576978802157283,12579381981407506,12581297128593136,12582417504397196,12593140979984180,12595025894277088,12595568697579144,12598104253633453,12603845745843686,12606498330358134,12606901469342342,12612529707466049,12615183521565762,12626861709059593,12627315009999764,12632687195472462,12639293891627838,12639418814809649,12640167527828266,12640428361474198,12640818042674893,12640871580590329,12640969010574572,12643494240426710,12652136957477942,12652319841467466,12653336178984584,12669546391130781,12673433235167362,12675636205599614,12678598650222967,12686784777563923,12692097685841819,12694087866558340,12695944838331405,12696892670705054,12701707285742221,12702962657957826,12704004077140774,12708456140034254,12714535709904775,12715273171560431,1271
7959780809396,12726377376546299,12726436436675201,12727033270826653,12727451946669105,12728089463396868,12732375258004085,12733418262925777,12738924440303227,12743533106187231,12753329851400984,12755303856704861,12771267826669947,12771598737920523,12781473711050942,12783187525335089,12784227033504239,12784609326080007,12786352863464886,12801400431375545,12801458013719616,12805335924853732,12807665721211120,12809522469790774,12809632657832569,12811342685032672,12812182344672176,12818427882691906,12819996417836695,12823282271321919,12829072084255416,12830088505582363,12833367241791767,12834860563514747,12841765441041049,12846385534139025,12848207419401226,12855844760930756,12859899907121954,12865701070922173,12868140593650867,12868775358248804,12875124642594600,12876334808558683,12880367229968159,12881424980533677,12882980822794716,12885114452454742,12887418415101226,12897909619055754,12900385538356311,12906788489234465,12908599797018073,12916112665929539,12918769262556623,12926924362852450,12932353138648641,12933527428631618,12934033598192671,12934350686066194,12935326619280462,12940588789639096,12946489524267653,12947522776480207,12955628525207272,12962843219163460,12963051823146860,12965554585695816,12965962908737325,12969038927449240,12974390471676060,12979430248880345,12979490664124573,12981848516554289,12985447175738467,12985493722134421,12986155313057080,12988371356512075,12993874964553291,13006729868315951,13013340987113008,13035980728818146,13036179358537986,13036645177838759,13036712601500114,13039280823665945,13045554401715049,13045600513120125,13052620397276151,13054236155028234,13056240047123862,13072501768493061,13072617923580734,13078916872324125,13080772514935469,13084504306404264,13086192967607976,13098251820355349,13099158956896694,13104310576837531,13104339456644663,13105958370560896,13106875818164772,13111138396854003,13111577005851999,13112638257280506,13114710504214063,13121300725611891,13121909629776355,13122025064383559,13127052312166869,131313
40694012302,13136566433737611,13154101095403837,13154198718450725,13154555158306370,13161267336530233,13164318985726929,13165906763126222,13167979789890447,13168110604938660,13169610591191174,13170738127259843,13174101013456127,13175983907317617,13181966685315968,13184920558769854,13185428376339990,13189380579282347,13192157399149307,13198474033954862,13199022280418750,13199996563430952,13200108351974202,13201807925580353,13204068426067383,13204462037481647,13208555149000976,13211066578415956,13211782471313872,13212061616770885,13212165395243922,13224389449907250,13233545463558187,13238503442128074,13241104178731895,13241168864292142,13241450032861794,13248655970715829,13254044124605936,13254407272590223,13257737390535834,13262295991057322,13265037473491891,13267936932487740,13268996306113479,13270587658638306,13275333794285672,13276579099396774,13277158900551235,13277894633414167,13278754087006669,13280579253065923,13298750975340353,13302754854003883,13314547425279234,13315440480385562,13334201253531043,13341251444506381,13344119991216847,13347417735653001,13350157202852563,13353560065022737,13357601360650186,13359331313196874,13371096247956838,13372563355342461,13381234858826658,13385015528950604,13386085962502095,13392333467703344,13392648285436965,13394339089063032,13394551832655184,13402693157045406,13411243783632080,13411581109556372,13412383321315676,13413384478433494,13414514599248518,13423145336408826,13423985507124709,13428472113899924,13441217894362634,13441764089885110,13442145389950277,13442492834263372,13446542370234933,13448612638765991,13452686859689485,13466915673127210,13472766006472691,13479031376600774,13485627420255013,13489343668600089,13494212405375710,13495705064166062,13498151694534672,13501680776202759,13502956667923325,13505729133720187,13508489645714817,13509178757772730,13511175763453019,13516547793277979,13517706451256090,13519875476747795,13521772043996142,13526044181112873,13532220158545162,13544313444890724,13545457425167339,13546292
946364632,13552576650306647,13560284735647415,13568144225858544,13570582888476349,13572007215372631,13579292169199361,13581522598681686,13584670639804403,13588669826245287,13594750552645450,13604060080584622,13606027345535991,13617631557607803,13618539821196872,13621326447410048,13621397819727220,13630191002096322,13634793775983156,13639851440986869,13641546138989675,13641606518127186,13649712080213198,13650051593032688,13651091819607497,13654427643263871,13655297823287389,13656122678588204,13669670567123455,13679106791368898,13682988765515845,13683384323434122,13685822158536416,13689486854312196,13692915679570837,13693027758544966,13697542413338103,13698474516655449,13698940649521784,13699244375297918,13700132194249732,13700395889419752,13701739894308740,13702229819892376,13704275954624893,13704352361096222,13706325468204188,13706670591280801,13707739578231116,13711378433916464,13713126833679041,13714567785435526,13725507831112260,13732875764071009,13741640976152344,13751009984842370,13751355444615875,13752515134941761,13752528537770243,13754252433328047,13758183684987635,13761510571679661,13763495093063765,13763737248840023,13769064518094184,13769616521296557,13772233025470893,13779343273469876,13782374358651901,13784328184108875,13796714897651986,13797271737130311,13806888829912490,13810189191668783,13819216056468458,13830139162136642,13834956579150593,13836967928987258,13841506126069216,13844723334748155,13851419674057285,13851804964013356,13852009236228299,13855148698755421,13860370948717274,13860591920719937,13863190387760047,13866281554758808,13869499215642030,13872276184090322,13883742410074464,13885432559815038,13892745650858214,13893729230139344,13894176996423815,13899662982568281,13903640450103285,13915571727313717,13915751473207815,13924477533272165,13927268834765068,13927696780643085,13930075501042467,13931574119523082,13932773453065067,13944345475258344,13944744739592850,13946019636961185,13947359127055804,13948733294751772,13948849305450763,1395000336
2814608,13951639188915106,13952318012300341,13957499193946800,13957891625399498,13963970693218754,13965952880287033,13965992746518202,13968491336408470,13969065894497400,13982564979542470,13988336481457298,13988537842078882,13996482893584239,14008782205566643,14010800531316406,14012374992832944,14015507053297472,14020512831650303,14021767660954154,14022387662308266,14028827379125338,14033262874705267,14036809823113558,14039082912681460,14041235400538814,14050877619339410,14053671836796726,14059703515141024,14066003779541480,14068336590289394,14068514727840577,14070452446274375,14073196557139903,14077628043807881,14079413477735954,14082060161725832,14086400253454584,14089660453835508,14093701509746096,14093782787475587,14104609562657279,14105474512409513,14110176260429603,14110846566699118,14111754638843301,14113954118968165,14115700381311298,14116084203345559,14116757201838538,14126627189476844,14129104187791374,14134252052809090,14136333045989523,14141721173215486,14142180399257488,14144286736225168,14147641612048676,14155975414531428,14159319817299730,14166972826657757,14167756848708160,14168406404233927,14175692216668231,14177050809138713,14182196956346963,14182395945348617,14182522370557503,14183668773352145,14187963108760817,14188374719646123,14194895524216878,14199253731869253,14200690083582668,14206687970696716,14209853920811090,14210270522420558,14215487446995196,14221732530161551,14226640214180071,14228466984618789,14228747898918402,14232996386333408,14233668407191507,14240099294539079,14240931409922495,14241368478491777,14245988562300971,14249502033018713,14249687565938747,14250698500890187,14253371408889777,14257631509009617,14258964967231840,14262886155232322,14267007162424724,14270247164026926,14271254001084489,14274483320198377,14274835544974226,14282336913974875,14284383355599952,14286491830383354,14291469615410543,14292177888930623,14292472000295936,14295837452356585,14298173613618098,14301257753026067,14306645039357579,14310015895186965,143126445749
68166,14314489483212131,14318196470777373,14324162382494344,14327796466197481,14328346609794800,14330165162804822,14331141876120768,14334872702153449,14335504292083940,14341688237268043,14355465370177113,14356635961829550,14358501668762854,14358712927579207,14371667519001998,14373358653447165,14382653933328395,14389328610898075,14396743849723041,14403258144398461,14412375459733818,14419304139943345,14419408012320654,14419921321665188,14420517295463922,14421526836577881,14422482515005653,14428672766558377,14428963789076396,14433829777128924,14434491585844546,14441663522114382,14447076836883551,14453826393896267,14458414754313519,14460517698166413,14462888447275103,14463327792076139,14466639418787131,14473045986857112,14473621415900765,14474428436715615,14499310299171024,14502523747605405,14503354640357478,14503374543528179,14503421058685682,14506805831023499,14532022272038796,14533166699323342,14533912823231087,14540918644501586,14545143474688100,14546173469934987,14549609943559194,14552911878947037,14559345024889908,14572798606348625,14577681860555800,14584989944454173,14588926759145188,14591042865966381,14604151534357376,14606919734830311,14607574529846282,14609506155169308,14611213899966353,14622201375555281,14630990679638089,14631058148494236,14642640937827937,14643031895991669,14651923318081732,14656739959494770,14657604036440419,14660498729179366,14669778228653280,14674786028678808,14679329027774886,14680624844186736,14686694622392691,14689336054562019,14690256368300896,14693014696087872,14696133465591728,14697396774309339,14699312209892116,14710079784099482,14714972951681498,14718737202293924,14723600578234977,14727357539437919,14745825891890849,14753483946970413,14755563140273075,14755965578235618,14763159151224600,14764626632313599,14767178168674763,14773705053327044,14774424971091441,14775120243637151,14776126161075338,14777180456645831,14782806494910422,14784179132609967,14789280323954198,14792259226342467,14796095030944430,14802524498404394,14805119697076
881,14808357837092866,14812335544631245,14814818422854695,14814866777217318,14816837023868332,14818468476875375,14819019180401048,14819207660612212,14821775114466739,14822972850892928,14831940328373824,14832874204975299,14833648039540838,14838200055945191,14841572903295226,14844521188313039,14847956454072103,14848728342207420,14862317369956557,14875244788896295,14877010452958052,14878500342798560,14880698080993082,14883492855733875,14884715585223534,14885001023521200,14885277650679824,14885514754372569,14890608256545870,14891249907937290,14894841507741510,14896432925297083,14899776943325306,14912124553248599,14912527776416275,14914637529083995,14927475356531675,14935912020884502,14936652050795629,14939491797730666,14939629686310269,14940182819303564,14944948556154171,14945635561516349,14948084549356910,14948446551537791,14949218635729694,14950087955881572,14952347763435515,14959164006691718,14961785319983417,14969572759318726,14971749865458474,14972120173118520,14977645952278382,14981979822436646,14988165815323633,14988877042506897,14993137347899829,14993931438103578,14995330188668762,14995498956830547,14996000077091154,14999867324523228,15002117713677743,15005212809692919,15006561064342947,15014220467166698,15015143178733208,15015832834168182,15016747917580341,15017581694255009,15023369839042379,15025234321810668,15027872686689687,15027965624651774,15028626060466021,15029223449524697,15029421099140855,15030279194177507,15035193036474483,15036546311204155,15040592624783964,15045563863877798,15047851634879837,15049206174008406,15058256367816154,15061551128577232,15064441595557983,15066002393956736,15067340188537424,15068416282122043,15068737776024543,15075955407954330,15081726781697934,15083561199479055,15093027538264168,15094447720794002,15100586527842091,15100895887756881,15100923564154126,15113939304708511,15124747624370744,15125038963644248,15125825668976374,15129837287101752,15132962157165167,15142340753501357,15142771204200772,15143222831944000,1514450973084611
0,15144908199448686,15144983031311469,15146266898753650,15149385225563368,15151128984083655,15152827565413011,15155535079044825,15163502675118430,15163608007084548,15164714333985887,15166301729740851,15168611383865707,15169763864701641,15186748056330061,15188528526614937,15193432864725218,15194881156695303,15198814250439862,15201662707583210,15205461449086049,15212200426217138,15213176306218798,15216511057457596,15223439233922134,15223468402959842,15227318255741609,15231273354612283,15232762341410261,15235610343655303,15244577355919854,15245960594532078,15247433843914559,15249106376606939,15250808075404806,15250896481067742,15258347128439790,15259612081001691,15264731568598059,15269511085038233,15270167427090968,15275297064012400,15281435895520542,15285891219756998,15286093688825701,15289862125703132,15290148594297788,15293123874907838,15298758428575449,15302533346414874,15303593678697230,15304616612479818,15304832061714010,15304847733090522,15305964656672387,15320795701669972,15326286490335430,15329295210127478,15332234480025938,15334331246873691,15345070383303245,15353149026108168,15353232670298897,15355335603796124,15356453151175679,15373930811061691,15377210030466896,15378726728075414,15379533653675249,15379705621247200,15383101065751713,15389713624455313,15390882193918153,15397171895399395,15402636785814262,15407310719467201,15407689229789196,15408822912493961,15409962731524859,15413173941024438,15413515751683038,15416018244148923,15419969270171951,15423153342341016,15428180744186198,15430571383432346,15431617843729266,15433887937403854,15437233971050551,15450715294654043,15452893020203867,15453212547751568,15461080320924835,15462712554318219,15467422080353387,15469490756626048,15484285765017800,15484872489499858,15485469980475937,15485540626374727,15486304467213949,15489723346328738,15490588941327161,15493669981706537,15494125152072718,15500366938118891,15503520406029992,15505519386765147,15507002730562560,15507596083915355,15510353559011411,15514535598456693,
15515999929535991,15522061334524465,15529140775061671,15530230344187268,15544311106200778,15544548900805204,15545602100255398,15550480026583451,15557041324109368,15559289591149294,15561564170785191,15561952874606530,15564933328360346,15569147522045783,15572863590946325,15576367136356714,15577133638656672,15578855835529187,15584544164633761,15585305674396933,15586021008360415,15586572984402845,15591835587885800,15595116715517183,15600022327961011,15605001570186520,15607870388183703,15607922224614041,15614292833802726,15618067512755640,15624018957338891,15627340304611938,15638216907788849,15638858384127971,15642870227425833,15646298279506026,15651638904781067,15652623828656547,15659367639512437,15663152139200473,15663518386052835,15668753861818902,15669831866387505,15678947651292970,15682293644901354,15682550662690976,15682651083715691,15684129061048451,15684714219805662,15687701212595998,15689677439468810,15695115813510797,15695286443123085,15700741802774887,15701006667851545,15702846415375903,15703609923212388,15703962508930357,15721982912314664,15743385922200831,15748859641437205,15754569858261273,15755103109741170,15755440794212771,15757615898310353,15760850159052851,15762209549139285,15763329664260405,15763347842500031,15764824185348051,15765990586747192,15766276644999494,15768790414358471,15778030936010985,15778073642134042,15783285537872233,15783457873235222,15783601313856694,15786952156025848,15789976943224909,15810883487535993,15812065825937453,15815973513115164,15816359902190124,15818396444990604,15822569260611075,15827802457455271,15828816982020122,15830964443377814,15840334179132579,15843352451900485,15843600775555820,15846238303690403,15863820783079073,15866236940451977,15867439138551076,15872362195905201,15875675232290550,15878101641545005,15878818841668229,15879020355620882,15879801226947188,15882817785917695,15884400676036182,15887607544869391,15890892365015163,15892366657661960,15892644929711368,15895961844448293,15896204034306061,15897401962344850,15
904895288630879,15911275554215588,15911665623700279,15915302121110625,15920977380773424,15922176557778000,15927341566519732,15931653898476584,15934933731048989,15939128572044103,15944489120018696,15954518713119826,15962749062304179,15964603511322079,15966264651417527,15979755963656160,15989521035310443,15992440720931469,16003094114347164,16004817238396021,16011025198707647,16012437984901399,16018234336000847,16033495765775843,16038301793476863,16041339465547053,16049606674888102,16049891952869470,16053420407977591,16060832251885914,16062107659438725,16069046551508727,16069174976182272,16074486970716033,16080740588460730,16083877262816687,16090383828959085,16093240548996996,16098091349807463,16098172275019426,16099819217713851,16108376586576329,16125621753305299,16126978195698598,16127193854760523,16134101469312376,16140981467932346,16144336314735217,16145343445599154,16149410584199209,16150040833847011,16152783913026412,16158129834797633,16160771084575630,16176635623269431,16178760128679429,16181654573567669,16184318280736106,16188822750519283,16190571409547135,16191892773280294,16195655847208731,16198734954314842,16211596379781820,16219147118653087,16220746069769206,16225653430698144,16233112123839300,16233654384438564,16235839769165041,16240762741306135,16250616875540489,16263334721549532,16263780136827362,16266712848564625,16270015344285096,16271083150992386,16280927247624124,16281915533551056,16289174359879745,16289936972512243,16293042607710037,16299677703726115,16307595056946704,16309137054392802,16310681306790082,16315681986373983,16317875420686187,16322691175661967,16327416205924614,16330104654201385,16333887903434801,16336789007686054,16339994308760050,16341765762990956,16347036755860703,16349055180152841,16349624325715789,16356347789934648,16359131609074861,16362066780194864,16369009637282998,16369281391114209,16369945946899217,16372297199748151,16373987141658826,16375988586939943,16377409889399924,16379852130875662,16383306457978105,16386085273604579,1638
9519573583307,16394940965979110,16396880383990556,16398154649330003,16398733126926655,16400068126248119,16411790303732795,16417759312392243,16419509766610091,16419901644544820,16422440467412931,16426184098291933,16428705781520359,16432971637241688,16433327633528711,16435742390682375,16437965595938221,16445322365770696,16447014000822092,16448682699618749,16448829682806061,16452048761762229,16457674212534286,16458193998413945,16460001490402451,16461454427883020,16462708921584182,16465114761860846,16465741257369090,16467984634116524,16473606709399091,16482765139128458,16483944183054084,16492537636527687,16492753413249082,16495982586896225,16499668866369559,16512946688413824,16514937777288269,16515317531532600,16515515620455784,16515763893503398,16517530335877295,16517537506431523,16520680878383534,16522040144005904,16536209470153361,16549193848756349,16551638786295881,16553191501416196,16555675508895642,16556142033974396,16557075823478085,16560847038118314,16561729019896847,16568638052909005,16569637258340730,16573238078421961,16574236353677299,16578960500697920,16582822217918014,16589279792859879,16591958210106118,16594902459898391,16605131007727492,16606580432110149,16614318900004042,16615202652663010,16618755587989745,16626806031696348,16629787832374700,16631291003094984,16649312379995411,16650212270002998,16650283216078512,16651090716262974,16652944391354801,16653981254978641,16654463028053744,16654924393162193,16660067927175777,16662173317527607,16664243102641776,16666335390036133,16681703184890456,16682373593295548,16682578612688855,16685180422791776,16686094229531741,16690146533382363,16698836185067210,16705001871343701,16708636667018243,16710176517292164,16711961207359232,16724720235210872,16725294155137383,16727032133352391,16727688366022871,16729903330833289,16730086424088046,16736716353986757,16737373768108148,16738231483083309,16739182735043548,16741586841667302,16747302145863252,16753484567013429,16755872335309848,16759630605474370,16761517688376276,167624
70025066864,16783509929261859,16784815745535597,16787458448627887,16788378238902865,16788943958964328,16792514152922390,16804992154815880,16810005023919202,16820470058192790,16827645998999199,16830315094868687,16834728273804865,16834972450595855,16837630758069802,16838337071218776,16843019140494565,16845510952053197,16850886838136612,16854336811041325,16858777046733835,16862894270178946,16867020968629490,16867864576520806,16868617407655959,16868967869968917,16870410864242800,16873469111493195,16879575300669180,16880613203851258,16886381613931407,16887131664281014,16888158543374642,16890451337099671,16893613954731672,16895785673304080,16897929991993805,16899560454680533,16913880033664423,16915167008196857,16919853192670914,16923906246184599,16936593144852542,16942918462021910,16945631443389494,16947503499298974,16954247022723534,16956760996707076,16966208347117891,16967956019390437,16972536001685219,16974976960555336,16974996106586527,16976062143580028,16977273505567584,16981100707531732,16983782183294117,16986912710655502,16987865244712954,16989134716990479,16989387007759710,16999582099196194,17000582419359490,17001105496098528,17001625874486973,17004131992558114,17008056242821339,17011749486628658,17014229777897844,17018029384954439,17019281824197645,17022002414706187,17023649823777761,17027024912981538,17027708820603609,17027724390296374,17028905071969344,17033234045807732,17039633394164779,17041956186643831,17043911086147728,17044227183900592,17045020641718652,17047039786296372,17048296154567372,17053515690985016,17057095725127500,17065736283826551,17066204322105669,17066614080750842,17066890141401799,17069702342298005,17070031206981939,17072355840387059,17086180968300670,17092924242292896,17104993056562977,17105192154942325,17109970886679733,17115701285838058,17117697441697646,17125687975796961,17127731933707424,17128001588707458,17138834595066375,17143215310364176,17148449573094945,17150277478457455,17156795393603011,17157898465093855,17158828067013858,17159011
651238619,17164804932271093,17171938449805958,17172705597475016,17173738199873813,17179028413037838,17184647258662518,17189985997758617,17192344156604394,17196706052792205,17207135836467323,17215506044805401,17219161189129714,17220596644754506,17233175158773315,17246076311509922,17255087609386479,17255995767294112,17259853567766215,17264169344327606,17264493723089078,17270538843510702,17271660415976271,17276987541026097,17282090993143952,17283049372339150,17291560172494026,17291979671712321,17292310452468705,17293149261210524,17296137849004163,17296192463290079,17304784741854216,17308433563518540,17313632977406292,17318960805851472,17320236078883475,17321015553228474,17321174101274260,17321666593584496,17323038726279631,17329308502917698,17329472456864413,17334589990132684,17334864272182912,17341517372421961,17351099007094397,17353053843653375,17366561361017324,17374440012919698,17378407166143928,17383067517360975,17391219400155350,17391436242706241,17393048485135831,17395137850335402,17396392214119358,17398282123693718,17400913153559823,17407246841014078,17413919734842125,17420499051739414,17421399034054265,17422062281526649,17423105453982540,17424289896774390,17425090194197669,17425614571955386,17432341969660318,17435530177003750,17439323243822026,17440523469292889,17441789247813820,17466786275120017,17470441567829100,17476597996427511,17476827515285484,17479085198776952,17484120239214373,17485776963997171,17486966299030886,17486988765689599,17488816545423819,17496728290093656,17497098710311670,17499228377170813,17500477193853245,17517159540441318,17523212990532497,17523973238515804,17527155984508354,17527974390318800,17541513425567569,17544483234277015,17554939460224349,17558580292594243,17559431629269263,17561448044068445,17562043899076409,17564200744376749,17568398735638434,17572077476907682,17575183081748977,17575560135721806,17578541772101901,17582290334182417,17587104794159329,17593208073575946,17596039699851909,17599396608009249,17600395630340514,1760060713
3597719,17602282637407016,17608993167118462,17612331367634278,17612661692978454,17613143173683718,17620186225453340,17622683372375560,17624682438741740,17637276506512333,17638727474061409,17641686744036386,17644039113805944,17645312266322218,17650070599794961,17653795308183431,17653847556747406,17657497846454029,17659159129272154,17659621059389953,17660743105223332,17661459152369712,17675621085369441,17676740506186425,17688285802217756,17696080779036912,17697163249179507,17697682504000309,17707842984156022,17709737359906993,17710801222146557,17712409569723362,17712893886367554,17714216772557464,17716382248321539,17720864955352360,17725080861093763,17729422012525851,17732456419233492,17741767011633464,17742935181641766,17746510075374564,17747383804315050,17756105008438482,17759977925515375,17760075502845602,17762550181642638,17767687613087632,17767859597836083,17769865028645273,17770964005963097,17770981472959907,17778432694523101,17780417672471981,17780633004635622,17781695742071988,17794405366215460,17798330281681371,17800329108711822,17805181966455568,17806610508162479,17807775056747143,17810904352266449,17811294904961592,17812394543477172,17816589074280916,17820414995546733,17822270975275609,17824058019135158,17824692487924143,17829073738411995,17829961694570784,17837924490120715,17838730974038612,17840541840864217,17843709229402181,17844481483540230,17848382043638116,17851847109708456,17853025423149005,17859332251377907,17861897896064008,17862392392198693,17863284124973089,17864179561603861,17864592832666958,17868918610462163,17871331243165860,17873251325535856,17875253398653485,17877847214982476,17881080524524475,17889149793462925,17892154693253238,17893885025675099,17896653342000979,17903487860911815,17903975998539248,17905797190497744,17905846546240170,17906626648911419,17919141917693070,17922025141068374,17927103791988031,17933163310113598,17937781070124082,17948193446220343,17961387047243191,17965956572473636,17970314287831337,17970920336945917,179811637344
58850,17983136277324499,17988399353053093,17988976441880620,17994870382615510,18001307487480335,18001311042217802,18006237324341425,18006674318531347,18009042115164135,18014189215438894,18014657110806736,18020974854298469,18021886555673319,18026004208911747,18026232516319011,18028316562390501,18028401974332903,18032295415917214,18033529755426165,18041417715473006,18042108482583522,18051621313730146,18055041696562277,18055087872299409,18055587125968368,18061732028717577,18063228282000881,18065517227719762,18068141364358640,18069471533986377,18075555160804033,18076181993827292,18082663742446595,18089241107640661,18096458958758462,18097497168766291,18099088172925428,18100450631735810,18103384897215847,18129411267420255,18131898498754493,18133412567042816,18136432192167355,18138184127031654,18138810838881859,18139979326512529,18143945517551573,18148249253291380,18162150874995819,18163443951512529,18176398171978336,18179155755591764,18179306742222141,18195325370888256,18196224826654508,18198481066177520,18200162616918760,18200433284258315,18201109647268708,18210889187587826,18214396067041321,18215366819351334,18215593970961402,18215688955701607,18218596306288915,18219379302426418,18219480778356908,18219492177590913,18223532024009924,18229457516235802,18230262104505569,18235829425519784,18241422988672837,18244762343715926,18257416541434878,18258026885319486,18258589056361348,18258704607599735,18265102179546859,18275143099845195,18277496942328603,18277532214379777,18306497644679138,18307189866355692,18308565966347727,18309896670692432,18310224044923199,18310260983673763,18311678202467591,18312928450191138,18316793768442045,18320199194413158,18320706645413560,18321247174629916,18325796935244911,18330874810048231,18332434291015304,18336172353658611,18337191336507655,18340192768781464,18354768007590930,18354809741903140,18361245509159168,18362561943328805,18366897023001689,18367494137613346,18377045322936582,18379087767248948,18389507703197237,18390199754916829,18395298635755
745,18396190684827929,18400962357421119,18402740084326073,18403600751028920,18410807087565130,18411318682491459,18412761413118797,18417094802826718,18417594946606594,18423285819437783,18429692224833089,18434322404441043,18434966080784818,18435861506577465,18436307626060546,18438159643459969,18440518503779835,18441289041766013,18444487060015322,18444666364288446],"molecule":"DNA","num":0,"seed":42,"abundances":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]}],"version":0.4}] \ No newline at end of file diff --git a/tests/test_bugs.py b/tests/test_bugs.py index 4635d2d7c9..15b70853f0 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -14,3 +14,14 @@ def test_bug_781(): print(out) print(err) assert status == 0 + + +@utils.in_tempdir +def test_bug_803(c): + # can we do a 'sourmash search' on an LCA database and a query with abundance? + query = utils.get_test_data('47.abunds.fa.sig') + lca_db = utils.get_test_data('lca/47+63.lca.json') + + c.run_sourmash('search', query, lca_db) + print(c) + assert 'NC_009665.1 Shewanella baltica OS185, complete genome' in str(c) From bbab9c3e5e3dffd0372ecf3c7867cef230c46fb4 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 20 Dec 2019 16:58:57 +0000 Subject: [PATCH 09/10] fix mem leak in get_mins (#807) fix mem leak in get_mins. ffi.unpack doesn't take ownership of data, still need to free it later. 
--- include/sourmash.h | 2 ++ sourmash/_minhash.py | 8 ++++++-- src/ffi/minhash.rs | 12 ++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index 607b37e385..e99ce9082e 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -79,6 +79,8 @@ void kmerminhash_enable_abundance(KmerMinHash *ptr); void kmerminhash_free(KmerMinHash *ptr); +void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); + uint64_t kmerminhash_get_abund_idx(KmerMinHash *ptr, uint64_t idx); const uint64_t *kmerminhash_get_abunds(KmerMinHash *ptr); diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py index 2557fd218b..9da1ec5c9a 100644 --- a/sourmash/_minhash.py +++ b/sourmash/_minhash.py @@ -234,9 +234,13 @@ def get_mins(self, with_abundance=False): if with_abundance and self.track_abundance: abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds) - return dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + result = dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + lib.kmerminhash_slice_free(abunds_ptr, size) else: - return ffi.unpack(mins_ptr, size) + result = ffi.unpack(mins_ptr, size) + + lib.kmerminhash_slice_free(mins_ptr, size) + return result def get_hashes(self): return self.get_mins() diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs index d9ec05397b..5efa221e6b 100644 --- a/src/ffi/minhash.rs +++ b/src/ffi/minhash.rs @@ -1,6 +1,5 @@ use std::ffi::CStr; use std::os::raw::c_char; -use std::ptr; use std::slice; use crate::errors::SourmashError; @@ -50,6 +49,14 @@ pub unsafe extern "C" fn kmerminhash_free(ptr: *mut KmerMinHash) { Box::from_raw(ptr); } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_slice_free(ptr: *mut u64, insize: usize) { + if ptr.is_null() { + return; + } + Vec::from_raw_parts(ptr as *mut u64, insize, insize); +} + ffi_fn! 
{ unsafe fn kmerminhash_add_sequence(ptr: *mut KmerMinHash, sequence: *const c_char, force: bool) -> Result<()> { @@ -165,7 +172,8 @@ unsafe fn kmerminhash_get_abunds(ptr: *mut KmerMinHash) -> Result<*const u64> { let output = abunds.clone(); Ok(Box::into_raw(output.into_boxed_slice()) as *const u64) } else { - Ok(ptr::null()) + //throw error, can't get abund + unimplemented!() } } } From ba5b5a1b630340f3271adf1a6f94a2a1e48e09ee Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 23 Dec 2019 07:11:12 -0800 Subject: [PATCH 10/10] [MRG] improve error handling etc. in `sourmash lca index`. (#798) * add --require-taxonomy to lca index, to ignore genomes w/no taxonomy * start building cached properties * Add tests for duplicate idents (FAIL) and --require-taxonomy * force 'build' to run * add check for species assignments * add test for lca_db.lineage_to_lids * add tests for lid_to_idx and idx_to_ident in lca_db objects * fix tests? * fix matplotlib warnings about open figures during tests * fix ijson bytes warnings * fix test breakage caused by new species check * Update sourmash/lca/lca_utils.py Co-Authored-By: Luiz Irber * Update sourmash/lca/lca_utils.py Co-Authored-By: Luiz Irber * Update sourmash/lca/lca_utils.py Co-Authored-By: Luiz Irber * Update sourmash/lca/lca_utils.py Co-Authored-By: Luiz Irber * fix code & tests Co-authored-by: Luiz Irber --- Makefile | 2 +- sourmash/lca/command_compare_csv.py | 3 +- sourmash/lca/command_index.py | 35 ++++++++-- sourmash/lca/lca_utils.py | 73 +++++++++++++------- tests/__init__.py | 3 +- tests/test-data/lca/bad-spreadsheet-2.csv | 7 ++ tests/test-data/lca/bad-spreadsheet-3.csv | 1 + tests/test_lca.py | 82 +++++++++++++++++++++-- tests/test_signature_json.py | 12 ++-- 9 files changed, 177 insertions(+), 41 deletions(-) create mode 100644 tests/test-data/lca/bad-spreadsheet-2.csv create mode 100644 tests/test-data/lca/bad-spreadsheet-3.csv diff --git a/Makefile b/Makefile index 559ad81cbc..d73c2d883d 100644 --- 
a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ all: build .PHONY: -build: +build: .PHONY $(PYTHON) setup.py build_ext -i cargo build diff --git a/sourmash/lca/command_compare_csv.py b/sourmash/lca/command_compare_csv.py index 1bdff53bd6..6f06adca89 100644 --- a/sourmash/lca/command_compare_csv.py +++ b/sourmash/lca/command_compare_csv.py @@ -39,7 +39,8 @@ def compare_csv(args): # first, load classify-style spreadsheet notify('loading classify output from: {}', args.csv1) assignments0, num_rows0 = load_taxonomy_assignments(args.csv1, - start_column=3) + start_column=3, + force=args.force) notify('loaded {} distinct lineages, {} rows', len(set(assignments0.values())), num_rows0) diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py index c4fcf9bf98..56e7db5c12 100644 --- a/sourmash/lca/command_index.py +++ b/sourmash/lca/command_index.py @@ -57,6 +57,8 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2, # convert into a lineage pair assignments = {} num_rows = 0 + n_species = 0 + n_strains = 0 for row in r: if row and row[0].strip(): # want non-empty row num_rows += 1 @@ -76,10 +78,27 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2, # store lineage tuple if lineage: - assignments[ident] = tuple(lineage) + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise Exception("multiple lineages for identifier {}".format(ident)) + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_strains += 1 fp.close() + # this is to guard against a bug that happened once and I can't find + # any more, when building a large GTDB-based database :) --CTB + if len(assignments) * 0.2 > n_species and len(assignments) > 50: + if not force: + raise Exception("error: fewer than 20% of lineages have species-level resolution!? 
({} total found)".format(n_species)) + return assignments, num_rows @@ -134,6 +153,8 @@ def index(args): p.add_argument('--traverse-directory', action='store_true', help='load all signatures underneath directories.') p.add_argument('--report', help='output a report on anomalies, if any.') + p.add_argument('--require-taxonomy', action='store_true', + help='ignore signatures with no taxonomy entry') args = p.parse_args(args) if args.start_column < 2: @@ -226,15 +247,16 @@ def get_lineage_id(lineage, arg_d=arg_d): record_remnants = set(ident_to_idx.keys()) record_used_lineages = set() record_used_idents = set() + n_skipped = 0 for filename in inp_files: n += 1 for sig in load_signatures(filename, ksize=args.ksize): notify(u'\r\033[K', end=u'') - notify('... loading signature {} (file {} of {})', sig.name()[:30], n, total_n, end='\r') + notify('\r... loading signature {} (file {} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='') debug(filename, sig.name()) if sig.md5sum() in md5_to_name: - notify('\nWARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum()) + debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum()) record_duplicates.add(filename) continue @@ -270,11 +292,16 @@ def get_lineage_id(lineage, arg_d=arg_d): lineage = lid_to_lineage.get(lid) if lineage is None: - notify('WARNING: no lineage assignment for {}.', ident) + debug('WARNING: no lineage assignment for {}.', ident) record_no_lineage.add(ident) else: record_used_lineages.add(lineage) + if lineage is None and args.require_taxonomy: + debug('(skipping, because --require-taxonomy was specified)') + n_skipped += 1 + continue + for hashval in minhash.get_mins(): hashval_to_idx[hashval].add(idx) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 9a61401f20..556d9cbcac 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -7,6 +7,8 @@ import gzip from os.path import exists from collections 
import OrderedDict, namedtuple, defaultdict, Counter +import functools + __all__ = ['taxlist', 'zip_lineage', 'build_tree', 'find_lca', 'load_single_database', 'load_databases', 'gather_assignments', @@ -25,6 +27,21 @@ LineagePair = namedtuple('LineagePair', ['rank', 'name']) +def cached_property(fun): + """A memoize decorator for class properties.""" + @functools.wraps(fun) + def get(self): + try: + return self._cache[fun] + except AttributeError: + self._cache = {} + except KeyError: + pass + ret = self._cache[fun] = fun(self) + return ret + return property(get) + + def check_files_exist(*files): ret = True not_found = [] @@ -147,7 +164,6 @@ class LCA_Database(Index): obj.idx_to_lid: key 'idx' to 'lid' obj.lid_to_lineage: key 'lid' to tuple of LineagePair objects obj.hashval_to_idx: key 'hashval' => set('idx') - obj.lineage_to_lid: key (tuple of LineagePair objects) to 'lid' """ def __init__(self): self.ksize = None @@ -155,7 +171,6 @@ def __init__(self): self.ident_to_idx = None self.idx_to_lid = None - self.lineage_to_lid = None self.lid_to_lineage = None self.hashval_to_idx = None @@ -166,7 +181,6 @@ def __repr__(self): def signatures(self): from .. import SourmashSignature - self._create_signatures() for v in self._signatures.values(): yield SourmashSignature(v) @@ -340,23 +354,22 @@ def get_lineage_assignments(self, hashval): return x - def _create_signatures(self): + @cached_property + def _signatures(self): "Create a _signatures member dictionary that contains {idx: minhash}." from .. 
import MinHash - if not hasattr(self, '_signatures'): - minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) - - debug('creating signatures for LCA DB...') - sigd = defaultdict(minhash.copy_and_clear) + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) - for (k, v) in self.hashval_to_idx.items(): - for vv in v: - sigd[vv].add_hash(k) + debug('creating signatures for LCA DB...') + sigd = defaultdict(minhash.copy_and_clear) - self._signatures = sigd + for (k, v) in self.hashval_to_idx.items(): + for vv in v: + sigd[vv].add_hash(k) - debug('=> {} signatures!', len(self._signatures)) + debug('=> {} signatures!', len(sigd)) + return sigd def find_signatures(self, minhash, threshold, containment=False, ignore_scaled=False): @@ -370,16 +383,6 @@ def find_signatures(self, minhash, threshold, containment=False, # note that containment can be calculated w/o matching scaled. raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) - self._create_signatures() - - # build idx_to_ident from ident_to_idx - if not hasattr(self, 'idx_to_ident'): - idx_to_ident = {} - for k, v in self.ident_to_idx.items(): - idx_to_ident[v] = k - - self.idx_to_ident = idx_to_ident - query_mins = set(minhash.get_mins()) # collect matching hashes: @@ -415,6 +418,28 @@ def find_signatures(self, minhash, threshold, containment=False, yield score, match_sig, self.filename + @cached_property + def lineage_to_lids(self): + d = defaultdict(set) + for lid, lineage in self.lid_to_lineage.items(): + d[lineage].add(lid) + return d + + @cached_property + def lid_to_idx(self): + d = defaultdict(set) + for idx, lid in self.idx_to_lid.items(): + d[lid].add(idx) + return d + + @cached_property + def idx_to_ident(self): + d = defaultdict(set) + for ident, idx in self.ident_to_idx.items(): + assert idx not in d + d[idx] = ident + return d + def load_single_database(filename, verbose=False): "Load a single LCA database; return (db, ksize, scaled)" diff 
--git a/tests/__init__.py b/tests/__init__.py index 8b13789179..1019c178fc 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,2 @@ - +import matplotlib.pyplot as plt +plt.rcParams.update({'figure.max_open_warning': 0}) diff --git a/tests/test-data/lca/bad-spreadsheet-2.csv b/tests/test-data/lca/bad-spreadsheet-2.csv new file mode 100644 index 0000000000..09a3d2bba0 --- /dev/null +++ b/tests/test-data/lca/bad-spreadsheet-2.csv @@ -0,0 +1,7 @@ +MAGs,Domain,Phylum,Class,Order,Family,Genus,Species +BAR, +FOO,,,,,,,, +TARA_ASE_MAG_00031,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii +TARA_ASE_MAG_00031,Bacteria_2,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii + + diff --git a/tests/test-data/lca/bad-spreadsheet-3.csv b/tests/test-data/lca/bad-spreadsheet-3.csv new file mode 100644 index 0000000000..3a9fbda584 --- /dev/null +++ b/tests/test-data/lca/bad-spreadsheet-3.csv @@ -0,0 +1 @@ +MAGs,Domain,Phylum,Class,Order,Family,Genus,Species diff --git a/tests/test_lca.py b/tests/test_lca.py index 5e0b17bce7..faa15fed42 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -211,6 +211,51 @@ def test_gather_db_scaled_lt_sig_scaled(): assert sig.minhash == match_sig.minhash +def test_db_lineage_to_lids(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + + d = db.lineage_to_lids + items = list(d.items()) + items.sort() + assert len(items) == 2 + + print(items) + + lin1 = items[0][0][-1] + assert lin1.rank == 'strain' + assert lin1.name == 'Shewanella baltica OS185' + lin1 = items[1][0][-1] + assert lin1.rank == 'strain' + assert lin1.name == 'Shewanella baltica OS223' + + +def test_db_lid_to_idx(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + + d = db.lid_to_idx + items = list(d.items()) + items.sort() + 
assert len(items) == 2 + + print(items) + assert items == [(32, {32}), (48, {48})] + + +def test_db_idx_to_ident(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + + d = db.idx_to_ident + items = list(d.items()) + items.sort() + assert len(items) == 2 + + print(items) + assert items == [(32, 'NC_009665'), (48, 'NC_011663')] + + ## command line tests @@ -259,6 +304,34 @@ def test_basic_index_bad_spreadsheet(): assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err +def test_basic_index_broken_spreadsheet(): + # duplicate identifiers in this spreadsheet + with utils.TempDirectory() as location: + taxcsv = utils.get_test_data('lca/bad-spreadsheet-2.csv') + input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + lca_db = os.path.join(location, 'delmont-1.lca.json') + + cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + status, out, err = utils.runscript('sourmash', cmd, fail_ok=True) + + assert status != 0 + assert "multiple lineages for identifier TARA_ASE_MAG_00031" in err + + +def test_basic_index_require_taxonomy(): + # no taxonomy in here + with utils.TempDirectory() as location: + taxcsv = utils.get_test_data('lca/bad-spreadsheet-3.csv') + input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + lca_db = os.path.join(location, 'delmont-1.lca.json') + + cmd = ['lca', 'index', '--require-taxonomy', taxcsv, lca_db, input_sig] + status, out, err = utils.runscript('sourmash', cmd, fail_ok=True) + + assert status != 0 + assert "ERROR: no hash values found - are there any signatures?" 
in err + + def test_basic_index_column_start(): with utils.TempDirectory() as location: taxcsv = utils.get_test_data('lca/delmont-3.csv') @@ -371,7 +444,7 @@ def test_index_traverse_real_spreadsheet_no_report(): input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') lca_db = os.path.join(location, 'delmont-1.lca.json') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-f'] status, out, err = utils.runscript('sourmash', cmd) print(cmd) @@ -395,7 +468,8 @@ def test_index_traverse_real_spreadsheet_report(): lca_db = os.path.join(location, 'delmont-1.lca.json') report_loc = os.path.join(location, 'report.txt') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--report', report_loc] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--report', + report_loc, '-f'] status, out, err = utils.runscript('sourmash', cmd) print(cmd) @@ -926,7 +1000,7 @@ def test_compare_csv(): a = utils.get_test_data('lca/classify-by-both.csv') b = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') - cmd = ['lca', 'compare_csv', a, b] + cmd = ['lca', 'compare_csv', a, b, '-f'] status, out, err = utils.runscript('sourmash', cmd) print(cmd) @@ -943,7 +1017,7 @@ def test_compare_csv_real(): a = utils.get_test_data('lca/tully-genome-sigs.classify.csv') b = utils.get_test_data('lca/tully-query.delmont-db.sigs.classify.csv') - cmd = ['lca', 'compare_csv', a, b, '--start-column=3'] + cmd = ['lca', 'compare_csv', a, b, '--start-column=3', '-f'] status, out, err = utils.runscript('sourmash', cmd) print(cmd) diff --git a/tests/test_signature_json.py b/tests/test_signature_json.py index c318bc4551..c5628b5d05 100644 --- a/tests/test_signature_json.py +++ b/tests/test_signature_json.py @@ -17,7 +17,7 @@ def test__json_next_atomic_array(): s = json.dumps(t) if sys.version_info[0] < 3: s = unicode(s) - it = ijson.parse(io.StringIO(s)) + it = ijson.parse(io.BytesIO(s.encode('utf-8'))) a = _json_next_atomic_array(it) assert len(t) == len(a) 
assert all(x == y for x,y in zip(t, a)) @@ -37,7 +37,7 @@ def test__json_next_signature(): s = json.dumps(t) if sys.version_info[0] < 3: s = unicode(s) - it = ijson.parse(io.StringIO(s)) + it = ijson.parse(io.BytesIO(s.encode('utf-8'))) # no MD5SUM sig = _json_next_signature(it, name, filename, ignore_md5sum=True, @@ -53,7 +53,7 @@ def test__json_next_signature(): s = json.dumps(t) if sys.version_info[0] < 3: s = unicode(s) - it = ijson.parse(io.StringIO(s)) + it = ijson.parse(io.BytesIO(s.encode('utf-8'))) sig = _json_next_signature(it, name, filename, ignore_md5sum=False, ijson=ijson) @@ -77,7 +77,7 @@ def test_load_signature_json(): s = json.dumps(t) if sys.version_info[0] < 3: s = unicode(s) - it = ijson.parse(io.StringIO(s)) + it = ijson.parse(io.BytesIO(s.encode('utf-8'))) # no MD5SUM sig_entry = load_signature_json(it, ignore_md5sum=True) @@ -105,7 +105,7 @@ def test_load_signaturesset_json_iter(): if sys.version_info[0] < 3: s = unicode(s) # no MD5SUM - sig_entries = tuple(load_signatureset_json_iter(io.StringIO(s), + sig_entries = tuple(load_signatureset_json_iter(io.BytesIO(s.encode('utf-8')), ignore_md5sum=True, ijson=ijson)) assert len(sig_entries) == 2 @@ -139,7 +139,7 @@ def test_load_signaturesset_json_iter_molecules(): if sys.version_info[0] < 3: s = unicode(s) # no MD5SUM - sig_entries = tuple(load_signatureset_json_iter(io.StringIO(s), + sig_entries = tuple(load_signatureset_json_iter(io.BytesIO(s.encode('utf-8')), ignore_md5sum=True, ijson=ijson)) # Ensure all molecule types were read properly