From 320b021ba663dd05bb0ca9c2d53036dfd889459c Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 31 Dec 2018 03:14:23 +0000 Subject: [PATCH] Replacing C++ with Rust - Update build system and CI - Add Rust install instructions to docs - Remove dependency on Cython (replaced with cffi and milksnake) - Move _minhash.pyx to _minhash.py, and remove Cython bits - Add convenience functions and classes to work with Rust layer - Remove third-party/ directory --- .coveragerc | 1 - .gitignore | 10 +- .travis.yml | 35 +- MANIFEST.in | 10 +- Makefile | 12 +- README.md | 19 +- doc/developer.md | 15 +- netlify.toml | 12 + setup.py | 82 ++-- sourmash/__init__.py | 16 +- sourmash/_compat.py | 24 ++ sourmash/_minhash.pxd | 70 --- sourmash/_minhash.py | 527 ++++++++++++++++++++++ sourmash/_minhash.pyx | 512 ---------------------- sourmash/exceptions.py | 44 ++ sourmash/kmer_min_hash.hh | 648 ---------------------------- sourmash/sbtmh.py | 5 +- sourmash/sig/__main__.py | 5 +- sourmash/sourmash_args.py | 5 +- sourmash/utils.py | 78 ++++ src/bin/smrs.rs | 8 + src/ffi/minhash.rs | 2 +- src/ffi/signature.rs | 17 +- src/index/bigsi.rs | 10 +- src/index/sbt/mhbt.rs | 19 +- src/signature.rs | 19 +- src/sketch/minhash.rs | 29 +- tests/minhash.rs | 2 +- tests/smrs_cmd.rs | 1 - tests/test__minhash.py | 97 ++++- tests/test_rustobj.py | 18 + tests/test_sourmash.py | 2 +- third-party/.gitignore | 6 - third-party/smhasher/MurmurHash3.cc | 340 --------------- third-party/smhasher/MurmurHash3.h | 37 -- 35 files changed, 982 insertions(+), 1755 deletions(-) create mode 100644 netlify.toml create mode 100644 sourmash/_compat.py delete mode 100644 sourmash/_minhash.pxd create mode 100644 sourmash/_minhash.py delete mode 100644 sourmash/_minhash.pyx create mode 100644 sourmash/exceptions.py delete mode 100644 sourmash/kmer_min_hash.hh create mode 100644 sourmash/utils.py create mode 100644 tests/test_rustobj.py delete mode 100644 third-party/.gitignore delete mode 100644 third-party/smhasher/MurmurHash3.cc delete mode 100644 third-party/smhasher/MurmurHash3.h diff --git a/.coveragerc b/.coveragerc index d284e1014c..09b80d8275 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,6 +6,5 @@ omit = doc/conf.py setup.py tests/* - third-party/smhasher/MurmurHash3.cc .tox/* benchmarks/* diff --git a/.gitignore b/.gitignore index 0aa0e7a6f1..0b797c7bf5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,14 +13,9 @@ dist build sourmash.egg-info .ipynb_checkpoints -_minhash.so .cache *.so .coverage -sourmash_lib/_minhash.cpp -sourmash/_minhash.cpp -.asv/ -.eggs/ .pytest_cache .python-version sourmash/version.py @@ -30,6 +25,9 @@ sourmash/_lowlevel*.py .env Pipfile Pipfile.lock -ocf/target/ target/ Cargo.lock +.eggs +.asv +pkg/ +wasm-pack.log diff --git a/.travis.yml b/.travis.yml index 937b4b6c0b..7483db938e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,16 +7,18 @@ cache: - "$HOME/.cache/pip" - "$HOME/.cargo" - "target" - - ".tox" branches: only: - master - "/^v.*$/" -script: tox +script: tox -vv -install: pip install tox-travis +install: + - source .travis/install_cargo.sh + +before_script: pip install tox-travis jobs: allow_failures: @@ -28,12 +30,14 @@ jobs: - &test stage: test - python: 2.7 + python: 3.6 - <<: *test os: osx + osx_image: xcode10.1 + python: 3.7 language: generic env: - - TOXENV=py36 + - TOXENV=py37 - <<: *test python: 3.7 name: integration (ipfs/redis) @@ -45,7 +49,7 @@ jobs: - redis-server - docker - <<: *test - python: 3.6 + python: 2.7 - <<: *test python: 3.5 @@ -55,12 +59,16 @@ jobs: services: - docker env: - - PIP=pip + - CIBW_BUILD='cp37-*' - CIBW_SKIP='*-manylinux_i686' - install: skip + - CIBW_BEFORE_BUILD='source .travis/install_cargo.sh' + - CIBW_ENVIRONMENT='PATH="$HOME/.cargo/bin:$PATH"' + - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11' + before_script: skip script: - - sudo $PIP install cibuildwheel==1.0.0 - - cibuildwheel --output-dir wheelhouse + - python -m pip install -U pip setuptools + - python -m pip install cibuildwheel==1.1.0 + - python -m cibuildwheel --output-dir wheelhouse deploy: provider: releases api_key: @@ -73,12 +81,7 @@ jobs: - <<: *wheel os: osx osx_image: xcode10.1 - language: generic - before_script: - - sudo $PIP install -U pip setuptools - env: - - PIP=pip2 - - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11' + language: shell stages: - check diff --git a/MANIFEST.in b/MANIFEST.in index 69df9b2280..1d206dc896 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,16 @@ include LICENSE Makefile Dockerfile LICENSE Makefile README.md requirements.txt include index.ipynb +include sourmash VERSION recursive-include sourmash_lib * recursive-include sourmash * -recursive-include third-party *.cc *.h -exclude tests/* +recursive-include src *.rs +recursive-include benches *.rs +include Cargo.toml +include include/sourmash.h +prune .eggs +global-exclude *.rlib global-exclude *.orig global-exclude *.pyc global-exclude *.so prune tests/test-data/ +global-exclude *.git/ diff --git a/Makefile b/Makefile index 8bb87d29dd..559ad81cbc 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,13 @@ PYTHON ?= python -all: - $(PYTHON) setup.py build_ext -i +all: build .PHONY: +build: + $(PYTHON) setup.py build_ext -i + cargo build + clean: $(PYTHON) setup.py clean --all rm -f sourmash/*.so @@ -19,6 +22,7 @@ dist: FORCE test: all pip install -e '.[test]' $(PYTHON) -m pytest + cargo test doc: .PHONY cd doc && make html @@ -29,12 +33,12 @@ include/sourmash.h: src/lib.rs src/ffi/minhash.rs src/ffi/signature.rs src/ffi/n rustup override set stable coverage: all - $(PYTHON) setup.py clean --all - SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i + $(PYTHON) setup.py build_ext -i $(PYTHON) -m pytest --cov=. --cov-report term-missing benchmark: asv continuous master `git rev-parse HEAD` + cargo bench check: cargo build diff --git a/README.md b/README.md index e2d6619cb3..b1297dade1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,19 @@ + + # sourmash [![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/) [![Build Status](https://travis-ci.com/dib-lab/sourmash.svg?branch=master)](https://travis-ci.com/dib-lab/sourmash) +PyPI [![codecov](https://codecov.io/gh/dib-lab/sourmash/branch/master/graph/badge.svg)](https://codecov.io/gh/dib-lab/sourmash) [![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027) +License: 3-Clause BSD + +🦀 +[![](http://meritbadge.herokuapp.com/sourmash)](https://crates.io/crates/sourmash) +[![Rust API Documentation on docs.rs](https://docs.rs/sourmash/badge.svg)](https://docs.rs/sourmash) + +--- Compute MinHash signatures for nucleotide (DNA/RNA) and protein sequences. @@ -13,7 +23,7 @@ Usage: sourmash compare *.sig -o distances sourmash plot distances -Sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`):. +sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`):. ---- @@ -48,9 +58,10 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t ### Requirements sourmash runs under both Python 2.7.x and Python 3.5+. The base -requirements are screed and ijson, together with a C++ development -environment and the CPython development headers and libraries (for the -C++ extension). +requirements are screed and ijson, together with a Rust environment (for the +extension code). We suggest using `rustup` to install the Rust environment: + + curl https://sh.rustup.rs -sSf | sh The comparison code (`sourmash compare`) uses numpy, and the plotting code uses matplotlib and scipy, but most of the code is usable without diff --git a/doc/developer.md b/doc/developer.md index cd4ab13847..79f2411e9f 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -7,7 +7,13 @@ You can get the latest development master branch with: ``` git clone https://github.com/dib-lab/sourmash.git ``` -To install all of the necessary dependencies, do: +sourmash runs under both Python 2.7.x and Python 3.5+. The base +requirements are screed and ijson, together with a Rust environment (for the +extension code). We suggest using `rustup` to install the Rust environment: + + curl https://sh.rustup.rs -sSf | sh + +To install all of the necessary Python dependencies, do: ``` pip install -r requirements.txt ``` @@ -25,13 +31,6 @@ pip install -e . We use [Travis][0] for continuous integration. -Code coverage calculation is enabled (on Linux only) by running -`make coverage`. This recompiles the C++ extension without -optimization and with coverage configured. See `setup.py` for -more information on this; the environment variable -`SOURMASH_COVERAGE` controls whether the C++ extension is -compiled with code coverage analysis enabled. - Code coverage can be viewed interactively at [codecov.io][1]. [0]:https://travis-ci.org/dib-lab/sourmash diff --git a/netlify.toml b/netlify.toml new file mode 100644 index 0000000000..994c94d665 --- /dev/null +++ b/netlify.toml @@ -0,0 +1,12 @@ +# Configuration for pull request documentation previews via Netlify + +[build] +publish = "_build/html" +base = "doc" +command = ''' + cd .. && \ + curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + source $HOME/.cargo/env && \ + pip install -e .[doc] && \ + make doc +''' diff --git a/setup.py b/setup.py index 1f1720821d..2850339eda 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,32 @@ from __future__ import print_function -import sys -from setuptools import setup, find_packages -from setuptools import Extension import os +from setuptools import setup, find_packages +import sys + + +DEBUG_BUILD = os.environ.get("SOURMASH_DEBUG") == "1" + + +def build_native(spec): + cmd = ["cargo", "build", "--lib"] + + target = "debug" + if not DEBUG_BUILD: + cmd.append("--release") + target = "release" + + build = spec.add_external_build(cmd=cmd, path=".") + + rtld_flags = ["NOW"] + if sys.platform == "darwin": + rtld_flags.append("NODELETE") + spec.add_cffi_module( + module_path="sourmash._lowlevel", + dylib=lambda: build.find_dylib("sourmash", in_path="target/%s" % target), + header_filename=lambda: build.find_header("sourmash.h", in_path="include"), + rtld_flags=rtld_flags, + ) -EXTRA_COMPILE_ARGS = ['-std=c++11', '-pedantic'] -EXTRA_LINK_ARGS=[] CLASSIFIERS = [ "Environment :: Console", @@ -15,7 +36,7 @@ "Natural Language :: English", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", - "Programming Language :: C++", + "Programming Language :: Rust", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", @@ -24,24 +45,10 @@ CLASSIFIERS.append("Development Status :: 5 - Production/Stable") -if sys.platform == 'darwin': # Mac OS X? - # force 64bit only builds - EXTRA_COMPILE_ARGS.extend(['-arch', 'x86_64', '-mmacosx-version-min=10.7', - '-stdlib=libc++']) - -else: # ...likely Linux - if os.environ.get('SOURMASH_COVERAGE'): - print('Turning on coverage analysis.') - EXTRA_COMPILE_ARGS.extend(['-g', '--coverage', '-lgcov']) - EXTRA_LINK_ARGS.extend(['--coverage', '-lgcov']) - else: - EXTRA_COMPILE_ARGS.append('-O3') - -with open('README.md', 'r') as readme: +with open("README.md", "r") as readme: LONG_DESCRIPTION = readme.read() -SETUP_METADATA = \ - { +SETUP_METADATA = { "name": "sourmash", "description": "tools for comparing DNA sequences with MinHash sketches", "long_description": LONG_DESCRIPTION, @@ -55,20 +62,18 @@ 'sourmash = sourmash.__main__:main' ] }, - "ext_modules": [Extension("sourmash._minhash", - sources=["sourmash/_minhash.pyx", - "third-party/smhasher/MurmurHash3.cc"], - depends=["sourmash/kmer_min_hash.hh"], - include_dirs=["./sourmash", - "./third-party/smhasher/"], - language="c++", - extra_compile_args=EXTRA_COMPILE_ARGS, - extra_link_args=EXTRA_LINK_ARGS)], "install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', + "cffi", 'matplotlib', 'scipy', "deprecation>=2.0.6"], - "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0", - 'setuptools_scm', 'setuptools_scm_git_archive'], + "setup_requires": [ + "setuptools>=38.6.0", + "milksnake", + "setuptools_scm", + "setuptools_scm_git_archive", + ], "use_scm_version": {"write_to": "sourmash/version.py"}, + "zip_safe": False, + "platforms": "any", "extras_require": { 'test' : ['pytest', 'pytest-cov'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], @@ -76,13 +81,10 @@ "sphinxcontrib-napoleon", "nbsphinx"], '10x': ['bam2fasta==1.0.1'], 'storage': ["ipfshttpclient", "redis"] - }, - "include_package_data": True, - "package_data": { - "sourmash": ['*.pxd'] }, - "classifiers": CLASSIFIERS - } + "include_package_data": True, + "classifiers": CLASSIFIERS, + "milksnake_tasks": [build_native], +} setup(**SETUP_METADATA) - diff --git a/sourmash/__init__.py b/sourmash/__init__.py index 2893023d2a..319ccd1e75 100644 --- a/sourmash/__init__.py +++ b/sourmash/__init__.py @@ -7,12 +7,21 @@ import math import os -from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash) +from ._lowlevel import ffi, lib + +ffi.init_once(lib.sourmash_init, "init") + +from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash + DEFAULT_SEED = get_minhash_default_seed() MAX_HASH = get_minhash_max_hash() -from .signature import (load_signatures, load_one_signature, SourmashSignature, - save_signatures) +from .signature import ( + load_signatures, + load_one_signature, + SourmashSignature, + save_signatures, +) from .sbtmh import load_sbt_index, search_sbt_index, create_sbt_index from . import lca from . import sbt @@ -21,6 +30,7 @@ from . import signature from pkg_resources import get_distribution, DistributionNotFound + try: VERSION = get_distribution(__name__).version except DistributionNotFound: # pragma: no cover diff --git a/sourmash/_compat.py b/sourmash/_compat.py new file mode 100644 index 0000000000..86b4e97f98 --- /dev/null +++ b/sourmash/_compat.py @@ -0,0 +1,24 @@ +import sys + + +PY2 = sys.version_info[0] == 2 + +if PY2: + text_type = unicode + int_types = (int, long) + string_types = (str, unicode) + range_type = xrange + itervalues = lambda x: x.itervalues() + NUL = '\x00' + def implements_to_string(cls): + cls.__unicode__ = cls.__str__ + cls.__str__ = lambda x: x.__unicode__().encode('utf-8') + return cls +else: + text_type = str + int_types = (int,) + string_types = (str,) + range_type = range + itervalues = lambda x: x.values() + NUL = 0 + implements_to_string = lambda x: x diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd deleted file mode 100644 index a5aa1f4e04..0000000000 --- a/sourmash/_minhash.pxd +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: UTF-8 -*- -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii - -from __future__ import unicode_literals - -from libcpp cimport bool -from libcpp.map cimport map -from libcpp.memory cimport unique_ptr -from libcpp.set cimport set as cppset -from libcpp.string cimport string -from libc.stdint cimport uint32_t, uint64_t -from libcpp.vector cimport vector - - -cdef extern from "kmer_min_hash.hh": - ctypedef uint64_t HashIntoType; - ctypedef vector[HashIntoType] CMinHashType; - - - cdef uint64_t _hash_murmur(const string, uint32_t seed) - cdef uint64_t _hash_murmur(const char *, unsigned int, uint32_t) - - cdef cppclass KmerMinHash: - const uint32_t seed; - const unsigned int num; - const unsigned int ksize; - const bool is_protein; - const bool dayhoff; - const bool hp; - const HashIntoType max_hash; - CMinHashType mins; - - KmerMinHash(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType) - void add_hash(HashIntoType) except +ValueError - void remove_hash(HashIntoType) except +ValueError - void add_word(const string& word) except +ValueError - void add_word(const char * word) except +ValueError - void add_sequence(const string&, bool) except +ValueError - void merge(const KmerMinHash&) except +ValueError - string aa_to_dayhoff(string aa) except +ValueError - string aa_to_hp(string aa) except +ValueError - string translate_codon(string codon) except +ValueError - unsigned int count_common(const KmerMinHash&) except +ValueError - unsigned long size() - - - cdef cppclass KmerMinAbundance(KmerMinHash): - CMinHashType abunds; - - KmerMinAbundance(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType) - void add_hash(HashIntoType) except +ValueError - void remove_hash(HashIntoType) except +ValueError - void add_word(string word) except +ValueError - void add_word(const char * word) except +ValueError - void add_sequence(const string&, bool) except +ValueError - void merge(const KmerMinAbundance&) except +ValueError - void merge(const KmerMinHash&) except +ValueError - string aa_to_dayhoff(string aa) except +ValueError - string aa_to_hp(string aa) except +ValueError - string translate_codon(string codon) except +ValueError - unsigned int count_common(const KmerMinAbundance&) except +ValueError - unsigned long size() - - -cdef class MinHash(object): - cdef unique_ptr[KmerMinHash] _this - cdef bool _track_abundance - - cpdef get_mins(self, bool with_abundance=*) - cpdef set_abundances(self, dict) diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py new file mode 100644 index 0000000000..2557fd218b --- /dev/null +++ b/sourmash/_minhash.py @@ -0,0 +1,527 @@ +# -*- coding: UTF-8 -*- +from __future__ import unicode_literals, division + +import math +import copy + +from ._compat import string_types, range_type +from ._lowlevel import ffi, lib +from .utils import RustObject, rustcall, decode_str +from .exceptions import SourmashError + +# default MurmurHash seed +MINHASH_DEFAULT_SEED = 42 + + +def get_minhash_default_seed(): + return MINHASH_DEFAULT_SEED + + +# we use the 64-bit hash space of MurmurHash only +# this is 2 ** 64 - 1 in hexadecimal +MINHASH_MAX_HASH = 0xFFFFFFFFFFFFFFFF + + +def get_minhash_max_hash(): + return MINHASH_MAX_HASH + + +def get_max_hash_for_scaled(scaled): + if scaled == 0: + return 0 + elif scaled == 1: + return get_minhash_max_hash() + + return int(round(get_minhash_max_hash() / scaled, 0)) + + +def get_scaled_for_max_hash(max_hash): + if max_hash == 0: + return 0 + return int(round(get_minhash_max_hash() / max_hash, 0)) + + +def to_bytes(s): + # Allow for strings, bytes or int + # Single item of byte string = int + + if isinstance(s, bytes): + return s + + if not isinstance(s, string_types + (bytes, int)): + raise TypeError("Requires a string-like sequence") + + if isinstance(s, string_types): + s = s.encode("utf-8") + elif isinstance(s, int): + s = bytes([s]) + + return s + + +def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): + "hash_murmur(string, [,seed])\n\n" + "Compute a hash for a string, optionally using a seed (an integer). " + "The current default seed is returned by hash_seed()." + + return lib.hash_murmur(to_bytes(kmer), seed) + + +def dotproduct(a, b, normalize=True): + """ + Compute the dot product of two dictionaries {k: v} where v is + abundance. + """ + + if normalize: + norm_a = math.sqrt(sum([x * x for x in a.values()])) + norm_b = math.sqrt(sum([x * x for x in b.values()])) + + if norm_a == 0.0 or norm_b == 0.0: + return 0.0 + else: + norm_a = 1.0 + norm_b = 1.0 + + prod = 0.0 + for k, abundance in a.items(): + prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b) + + return prod + + +class MinHash(RustObject): + def __init__( + self, + n, + ksize, + is_protein=False, + dayhoff=False, + hp=False, + track_abundance=False, + seed=MINHASH_DEFAULT_SEED, + max_hash=0, + mins=None, + scaled=0, + ): + if max_hash and scaled: + raise ValueError("cannot set both max_hash and scaled") + elif scaled: + max_hash = get_max_hash_for_scaled(scaled) + + if max_hash and n: + raise ValueError("cannot set both n and max_hash") + + if not n and not (max_hash or scaled): + raise ValueError("cannot omit both n and scaled") + + if dayhoff or hp: + is_protein = False + + self._objptr = lib.kmerminhash_new( + n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance + ) + self.__dealloc_func__ = lib.kmerminhash_free + + if mins: + if track_abundance: + self.set_abundances(mins) + else: + self.add_many(mins) + + def __copy__(self): + a = MinHash( + self.num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=self.track_abundance, + seed=self.seed, + max_hash=self.max_hash, + ) + a.merge(self) + return a + + def __getstate__(self): # enable pickling + return ( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.get_mins(with_abundance=self.track_abundance), + None, + self.track_abundance, + self.max_hash, + self.seed, + ) + + def __setstate__(self, tup): + (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) = tup + + self.__del__() + self._objptr = lib.kmerminhash_new( + n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance + ) + if track_abundance: + self.set_abundances(mins) + else: + self.add_many(mins) + + def __reduce__(self): + return ( + MinHash, + ( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + self.max_hash, + self.get_mins(with_abundance=self.track_abundance), + 0, + ), + ) + + def __eq__(self, other): + return self.__getstate__() == other.__getstate__() + + def copy_and_clear(self): + a = MinHash( + self.num, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + self.max_hash, + ) + return a + + def add_sequence(self, sequence, force=False): + self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) + + def add(self, kmer): + "Add kmer into sketch." + self.add_sequence(kmer) + + def add_many(self, hashes): + "Add many hashes in at once." + if isinstance(hashes, MinHash): + self._methodcall(lib.kmerminhash_add_from, hashes._objptr) + else: + for hash in hashes: + self._methodcall(lib.kmerminhash_add_hash, hash) + + def remove_many(self, hashes): + "Add many hashes in at once." + self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) + + def update(self, other): + "Update this estimator from all the hashes from the other." + self.add_many(other) + + def __len__(self): + return self._methodcall(lib.kmerminhash_get_mins_size) + + def get_mins(self, with_abundance=False): + size = self._methodcall(lib.kmerminhash_get_mins_size) + mins_ptr = self._methodcall(lib.kmerminhash_get_mins) + + if with_abundance and self.track_abundance: + abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds) + return dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + else: + return ffi.unpack(mins_ptr, size) + + def get_hashes(self): + return self.get_mins() + + def subtract_mins(self, other): + a = set(self.get_mins()) + b = set(other.get_mins()) + return a - b + + @property + def seed(self): + return self._methodcall(lib.kmerminhash_seed) + + @property + def num(self): + return self._methodcall(lib.kmerminhash_num) + + @property + def scaled(self): + if self.max_hash: + return get_scaled_for_max_hash(self.max_hash) + return 0 + + @property + def is_dna(self): + return not (self.is_protein or self.dayhoff or self.hp) + + @property + def is_protein(self): + return self._methodcall(lib.kmerminhash_is_protein) + + @property + def dayhoff(self): + return self._methodcall(lib.kmerminhash_dayhoff) + + @property + def hp(self): + return self._methodcall(lib.kmerminhash_hp) + + @property + def ksize(self): + return self._methodcall(lib.kmerminhash_ksize) + + @property + def max_hash(self): + return self._methodcall(lib.kmerminhash_max_hash) + + @property + def track_abundance(self): + return self._methodcall(lib.kmerminhash_track_abundance) + + @track_abundance.setter + def track_abundance(self, b): + if self.track_abundance == b: + return + + if b is False: + self._methodcall(lib.kmerminhash_disable_abundance) + elif len(self) > 0: + raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") + else: + self._methodcall(lib.kmerminhash_enable_abundance) + + def add_hash(self, h): + return self._methodcall(lib.kmerminhash_add_hash, h) + + def translate_codon(self, codon): + try: + return rustcall(lib.sourmash_translate_codon, + to_bytes(codon)).decode('utf-8') + except SourmashError as e: + raise ValueError(e.message) + + def count_common(self, other): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + return self._methodcall(lib.kmerminhash_count_common, other._get_objptr()) + + def downsample_n(self, new_num): + if self.num and self.num < new_num: + raise ValueError("new sample n is higher than current sample n") + + a = MinHash( + new_num, self.ksize, self.is_protein, self.dayhoff, self.hp, self.track_abundance, self.seed, 0 + ) + if self.track_abundance: + a.set_abundances(self.get_mins(with_abundance=True)) + else: + a.add_many(self) + + return a + + def downsample_max_hash(self, *others): + max_hashes = [x.max_hash for x in others] + new_max_hash = min(self.max_hash, *max_hashes) + new_scaled = get_scaled_for_max_hash(new_max_hash) + + return self.downsample_scaled(new_scaled) + + def downsample_scaled(self, new_num): + if self.num: + raise ValueError("num != 0 - cannot downsample a standard MinHash") + + max_hash = self.max_hash + if max_hash is None: + raise ValueError("no max_hash available - cannot downsample") + + old_scaled = get_scaled_for_max_hash(self.max_hash) + if old_scaled > new_num: + raise ValueError( + "new scaled {} is lower than current sample scaled {}".format( + new_num, old_scaled + ) + ) + + new_max_hash = get_max_hash_for_scaled(new_num) + + a = MinHash( + 0, + self.ksize, + self.is_protein, + self.dayhoff, + self.hp, + self.track_abundance, + self.seed, + new_max_hash, + ) + if self.track_abundance: + a.set_abundances(self.get_mins(with_abundance=True)) + else: + a.add_many(self) + + return a + + def intersection(self, other, in_common=False): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + + if self.num != other.num: + err = "must have same num: {} != {}".format(self.num, other.num) + raise TypeError(err) + + if in_common: + # TODO: copy from buffer to Python land instead, + # this way involves more moving data around. + combined_mh = self.copy_and_clear() + combined_mh.merge(self) + combined_mh.merge(other) + + size = len(combined_mh) + common = set(self.get_mins()) + common.intersection_update(other.get_mins()) + common.intersection_update(combined_mh.get_mins()) + else: + size = self._methodcall(lib.kmerminhash_intersection, other._get_objptr()) + common = set() + + return common, max(size, 1) + + def compare(self, other): + if self.num != other.num: + err = "must have same num: {} != {}".format(self.num, other.num) + raise TypeError(err) + return self._methodcall(lib.kmerminhash_compare, other._get_objptr()) + + def jaccard(self, other): + return self.compare(other) + + def similarity(self, other, ignore_abundance=False): + """Calculate similarity of two sketches. + + If the sketches are not abundance weighted, or ignore_abundance=True, + compute Jaccard similarity. + + If the sketches are abundance weighted, calculate a distance metric + based on the cosine similarity. + + Note, because the term frequencies (tf-idf weights) cannot be negative, + the angle will never be < 0deg or > 90deg. + + See https://en.wikipedia.org/wiki/Cosine_similarity + """ + + # if either signature is flat, calculate Jaccard only. + if not (self.track_abundance and other.track_abundance) or ignore_abundance: + return self.jaccard(other) + else: + # can we merge? if not, raise exception. + aa = copy.copy(self) + aa.merge(other) + + a = self.get_mins(with_abundance=True) + b = other.get_mins(with_abundance=True) + + prod = dotproduct(a, b) + prod = min(1.0, prod) + + distance = 2 * math.acos(prod) / math.pi + return 1.0 - distance + + def contained_by(self, other): + """\ + Calculate how much of self is contained by other. + """ + if not len(self): + return 0.0 + + return self.count_common(other) / len(self) + + def containment_ignore_maxhash(self, other): + if len(self) == 0: + return 0.0 + + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + + return self._methodcall(lib.kmerminhash_containment_ignore_maxhash, other._get_objptr()) + + def __iadd__(self, other): + if not isinstance(other, MinHash): + raise TypeError("Must be a MinHash!") + self._methodcall(lib.kmerminhash_merge, other._get_objptr()) + return self + + merge = __iadd__ + + def set_abundances(self, values): + if self.track_abundance: + added = 0 + + for k, v in sorted(values.items()): + if not self.max_hash or k <= self.max_hash: + self._methodcall(lib.kmerminhash_mins_push, k) + self._methodcall(lib.kmerminhash_abunds_push, v) + added += 1 + if self.num > 0 and added >= self.num: + break + else: + raise RuntimeError( + "Use track_abundance=True when constructing " + "the MinHash to use set_abundances." + ) + + def add_protein(self, sequence): + ksize = self.ksize // 3 + if len(sequence) < ksize: + return + + aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) + if self.is_protein: + for aa_kmer in aa_kmers: + self._methodcall( + lib.kmerminhash_add_word, to_bytes(aa_kmer) + ) + elif self.dayhoff: + for aa_kmer in aa_kmers: + dayhoff_kmer = '' + for aa in aa_kmer: + data = rustcall(lib.sourmash_aa_to_dayhoff, to_bytes(aa)) + dayhoff_letter = data.decode('utf-8') + dayhoff_kmer += dayhoff_letter + self._methodcall( + lib.kmerminhash_add_word, to_bytes(dayhoff_kmer) + ) + elif self.hp: + for aa_kmer in aa_kmers: + hp_kmer = '' + for aa in aa_kmer: + data = rustcall(lib.sourmash_aa_to_hp, to_bytes(aa)) + hp_letter = data.decode('utf-8') + hp_kmer += hp_letter + self._methodcall( + lib.kmerminhash_add_word, to_bytes(hp_kmer) + ) + else: + raise ValueError("Invalid protein type") + + def is_molecule_type(self, molecule): + if self.is_protein and molecule == 'protein': + return True + elif self.dayhoff and molecule == 'dayhoff': + return True + elif self.hp and molecule == 'hp': + return True + elif molecule.upper() == "DNA" and self.is_dna: + return True + + return False diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx deleted file mode 100644 index 66d6a357ee..0000000000 --- a/sourmash/_minhash.pyx +++ /dev/null @@ -1,512 +0,0 @@ -# -*- coding: UTF-8 -*- -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii - -from __future__ import unicode_literals - -from cython.operator cimport dereference as deref, address - -from libcpp cimport bool -from libc.stdint cimport uint32_t - -from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur -import math -import copy - - -# default MurmurHash seed -cdef uint32_t MINHASH_DEFAULT_SEED = 42 - - -def get_minhash_default_seed(): - return MINHASH_DEFAULT_SEED - - -# we use the 64-bit hash space of MurmurHash only -cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1 - - -def get_minhash_max_hash(): - return MINHASH_MAX_HASH - - -def get_max_hash_for_scaled(scaled): - if scaled == 0: - return 0 - elif scaled == 1: - return get_minhash_max_hash() - - return int(round(get_minhash_max_hash() / scaled, 0)) - - -def get_scaled_for_max_hash(max_hash): - if max_hash == 0: - return 0 - return int(round(get_minhash_max_hash() / max_hash, 0)) - - -cdef bytes to_bytes(s): - # Allow for strings, bytes or int - # Single item of byte string = int - if not isinstance(s, (basestring, bytes, int)): - raise TypeError("Requires a string-like sequence") - - if isinstance(s, unicode): - s = s.encode('utf-8') - if isinstance(s, int): - s = bytes([s]) - return s - - -def hash_murmur(kmer, uint32_t seed=MINHASH_DEFAULT_SEED): - "hash_murmur(string, [,seed])\n\n" - "Compute a hash for a string, optionally using a seed (an integer). " - "The current default seed is returned by hash_seed()." - - return _hash_murmur(to_bytes(kmer), seed) - - -def dotproduct(a, b, normalize=True): - """ - Compute the dot product of two dictionaries {k: v} where v is - abundance. - """ - - if normalize: - norm_a = math.sqrt(sum([ x*x for x in a.values() ])) - norm_b = math.sqrt(sum([ x*x for x in b.values() ])) - - if norm_a == 0.0 or norm_b == 0.0: - return 0.0 - else: - norm_a = 1.0 - norm_b = 1.0 - - prod = 0. - for k, abundance in a.items(): - prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b) - - return prod - - -cdef class MinHash(object): - - def __init__(self, unsigned int n, unsigned int ksize, - bool is_protein=False, - bool dayhoff=False, - bool hp=False, - bool track_abundance=False, - uint32_t seed=MINHASH_DEFAULT_SEED, - HashIntoType max_hash=0, - mins=None, HashIntoType scaled=0): - self._track_abundance = track_abundance - - if max_hash and scaled: - raise ValueError('cannot set both max_hash and scaled') - elif scaled: - max_hash = get_max_hash_for_scaled(scaled) - - if max_hash and n: - raise ValueError('cannot set both n and max_hash') - - if not n and not (max_hash or scaled): - raise ValueError("cannot omit both n and scaled") - - cdef KmerMinHash *mh = NULL - if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - else: - mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - - self._this.reset(mh) - - if mins: - if track_abundance: - self.set_abundances(mins) - else: - self.add_many(mins) - - - def __copy__(self): - a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, deref(self._this).max_hash) - a.merge(self) - return a - - def __getstate__(self): # enable pickling - with_abundance = False - if self.track_abundance: - with_abundance = True - - return (deref(self._this).num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - self.get_mins(with_abundance=with_abundance), - None, self.track_abundance, deref(self._this).max_hash, - deref(self._this).seed) - - def __setstate__(self, tup): - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) =\ - tup - - self._track_abundance = track_abundance - - cdef KmerMinHash *mh = NULL - if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - self._this.reset(mh) - self.set_abundances(mins) - else: - mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash) - self._this.reset(mh) - self.add_many(mins) - - def __reduce__(self): - return (MinHash, - (deref(self._this).num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, - deref(self._this).max_hash, - self.get_mins(with_abundance=self.track_abundance), - 0)) - - def __richcmp__(self, other, op): - if op == 2: - return self.__getstate__() == other.__getstate__() - raise Exception("undefined comparison") - - def copy_and_clear(self): - a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, self.track_abundance, - deref(self._this).seed, deref(self._this).max_hash) - return a - - def add_sequence(self, sequence, bool force=False): - deref(self._this).add_sequence(to_bytes(sequence), force) - - def add(self, kmer): - "Add kmer into sketch." - self.add_sequence(kmer) - - def add_many(self, hashes): - "Add many hashes in at once." - for hash in hashes: - self.add_hash(hash) - - def remove_many(self, hashes): - "Remove many hashes at once." - for hash in hashes: - deref(self._this).remove_hash(hash) - - def update(self, other): - "Update this estimator from all the hashes from the other." - self.add_many(other.get_mins()) - - def __len__(self): - return deref(self._this).mins.size() - - cpdef get_mins(self, bool with_abundance=False): - cdef KmerMinAbundance *mh = address(deref(self._this)) - if with_abundance and self.track_abundance: - return dict(zip(mh.mins, mh.abunds)) - else: - return deref(self._this).mins - - def get_hashes(self): - return self.get_mins() - - def subtract_mins(self, other): - a = set(self.get_mins()) - b = set(other.get_mins()) - return a - b - - @property - def seed(self): - return deref(self._this).seed - - @property - def num(self): - return deref(self._this).num - - @property - def scaled(self): - if self.max_hash: - return get_scaled_for_max_hash(self.max_hash) - return 0 - - @property - def is_protein(self): - return deref(self._this).is_protein - - @property - def dayhoff(self): - return deref(self._this).dayhoff - - @property - def hp(self): - return deref(self._this).hp - - @property - def ksize(self): - return deref(self._this).ksize - - @property - def max_hash(self): - return deref(self._this).max_hash - - @property - def track_abundance(self): - return self._track_abundance - - @track_abundance.setter - def track_abundance(self, v): - cdef KmerMinHash *mh = NULL - - if v == self._track_abundance: - return - - if v is True and len(self) != 0: - raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") - - if v: - mh = new KmerMinAbundance(self.num, self.ksize, self.is_protein, - self.dayhoff, self.hp, self.seed, self.max_hash) - self._this.reset(mh) - - # At this point, if we are changing from track_abundance=True to False, - # keep the underlying Abundance MH (to avoid copying data to a new one). - - self._track_abundance = v - - def add_hash(self, uint64_t h): - deref(self._this).add_hash(h) - - def translate_codon(self, codon): - return deref(self._this).translate_codon(to_bytes(codon)) - - def count_common(self, MinHash other): - return deref(self._this).count_common(deref(other._this)) - - def downsample_n(self, new_num): - if self.num and self.num < new_num: - raise ValueError('new sample n is higher than current sample n') - - a = MinHash(new_num, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, 0) - if self.track_abundance: - a.set_abundances(self.get_mins(with_abundance=True)) - else: - a.add_many(self.get_mins()) - - return a - - def downsample_max_hash(self, *others): - max_hashes = [ x.max_hash for x in others ] - new_max_hash = min(self.max_hash, *max_hashes) - new_scaled = get_scaled_for_max_hash(new_max_hash) - - return self.downsample_scaled(new_scaled) - - def downsample_scaled(self, new_num): - if self.num: - raise ValueError('num != 0 - cannot downsample a standard MinHash') - - max_hash = self.max_hash - if max_hash is None: - raise ValueError('no max_hash available - cannot downsample') - - old_scaled = get_scaled_for_max_hash(self.max_hash) - if old_scaled > new_num: - raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled)) - - new_max_hash = get_max_hash_for_scaled(new_num) - - a = MinHash(0, deref(self._this).ksize, - deref(self._this).is_protein, deref(self._this).dayhoff, - deref(self._this).hp, - self.track_abundance, - deref(self._this).seed, new_max_hash) - if self.track_abundance: - a.set_abundances(self.get_mins(with_abundance=True)) - else: - a.add_many(self.get_mins()) - - return a - - def intersection(self, MinHash other): - if self.num != other.num: - err = 'must have same num: {} != {}'.format(self.num, - other.num) - raise TypeError(err) - else: - num = self.num - - if self.track_abundance and other.track_abundance: - combined_mh = new KmerMinAbundance(num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - deref(self._this).seed, - deref(self._this).max_hash) - - else: - combined_mh = new KmerMinHash(num, - deref(self._this).ksize, - deref(self._this).is_protein, - deref(self._this).dayhoff, - deref(self._this).hp, - deref(self._this).seed, - deref(self._this).max_hash) - - combined_mh.merge(deref(self._this)) - combined_mh.merge(deref(other._this)) - - common = set(self.get_mins()) - common.intersection_update(other.get_mins()) - common.intersection_update(combined_mh.mins) - - size = max(combined_mh.size(), 1) - del combined_mh - - return common, size - - def compare(self, MinHash other): - common, size = self.intersection(other) - n = len(common) - return n / size - - def jaccard(self, MinHash other): - return self.compare(other) - - def similarity(self, other, ignore_abundance=False): - """\ - Calculate similarity of two sketches. - - If the sketches are not abundance weighted, or ignore_abundance=True, - compute Jaccard similarity. - - If the sketches are abundance weighted, calculate a distance metric - based on the cosine similarity. - - Note, because the term frequencies (tf-idf weights) cannot be negative, - the angle will never be < 0deg or > 90deg. - - See https://en.wikipedia.org/wiki/Cosine_similarity - """ - - # if either signature is flat, calculate Jaccard only. - if not (self.track_abundance and other.track_abundance) or \ - ignore_abundance: - return self.jaccard(other) - else: - # can we merge? if not, raise exception. - aa = copy.copy(self) - aa.merge(other) - - a = self.get_mins(with_abundance=True) - b = other.get_mins(with_abundance=True) - - prod = dotproduct(a, b) - prod = min(1.0, prod) - - distance = 2*math.acos(prod) / math.pi - return 1.0 - distance - - def contained_by(self, other): - """\ - Calculate how much of self is contained by other. - """ - if not len(self): - return 0.0 - return self.count_common(other) / len(self.get_mins()) - - def containment_ignore_maxhash(self, MinHash other): - a = set(self.get_mins()) - if not a: - return 0.0 - - b = set(other.get_mins()) - - overlap = a.intersection(b) - return float(len(overlap)) / float(len(a)) - - def __iadd__(self, MinHash other): - cdef KmerMinAbundance *mh = address(deref(self._this)) - cdef KmerMinAbundance *other_mh = address(deref(other._this)) - - if self.track_abundance and other.track_abundance: - deref(mh).merge(deref(other_mh)) - else: - deref(self._this).merge(deref(other._this)) - - return self - merge = __iadd__ - - cpdef set_abundances(self, dict values): - if self.track_abundance: - added = 0 - - for k, v in sorted(values.items()): - if not self.max_hash or k <= self.max_hash: - deref(self._this).mins.push_back(k) - (address(deref(self._this))).abunds.push_back(v) - added += 1 - if self.num > 0 and added >= self.num: - break - else: - raise RuntimeError("Use track_abundance=True when constructing " - "the MinHash to use set_abundances.") - - def add_protein(self, sequence): - cdef uint32_t ksize = deref(self._this).ksize // 3 - if len(sequence) < ksize: - return - - if not deref(self._this).is_protein: - raise ValueError("cannot add amino acid sequence to DNA MinHash!") - - aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) - if not self.dayhoff and not self.hp: - for aa_kmer in aa_kmers: - deref(self._this).add_word(to_bytes(aa_kmer)) - elif self.dayhoff: - for aa_kmer in aa_kmers: - dayhoff_kmer = '' - for aa in aa_kmer: - dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa)) - dayhoff_kmer += dayhoff_letter - # dayhoff_kmer = ''.join( for aa in aa_kmer) - deref(self._this).add_word(to_bytes(dayhoff_kmer)) - else: - for aa_kmer in aa_kmers: - hp_kmer = '' - for aa in aa_kmer: - hp_letter = deref(self._this).aa_to_hp(to_bytes(aa)) - hp_kmer += hp_letter - # hp_kmer = ''.join( for aa in aa_kmer) - deref(self._this).add_word(to_bytes(hp_kmer)) - - def is_molecule_type(self, molecule): - if molecule.upper() == 'DNA' and not self.is_protein: - return True - elif self.is_protein and molecule == 'protein' and not any((self.dayhoff, self.hp)): - return True - elif self.dayhoff and molecule == 'dayhoff': - return True - elif self.hp and molecule == 'hp': - return True - - return False diff --git a/sourmash/exceptions.py b/sourmash/exceptions.py new file mode 100644 index 0000000000..8254a21762 --- /dev/null +++ b/sourmash/exceptions.py @@ -0,0 +1,44 @@ +from ._compat import implements_to_string +from ._lowlevel import lib + + +__all__ = ['SourmashError'] +exceptions_by_code = {} + + +@implements_to_string +class SourmashError(Exception): + code = None + + def __init__(self, msg): + Exception.__init__(self) + self.message = msg + self.rust_info = None + + def __str__(self): + rv = self.message + if self.rust_info is not None: + return u'%s\n\n%s' % (rv, self.rust_info) + return rv + + +def _make_exceptions(): + for attr in dir(lib): + if not attr.startswith('SOURMASH_ERROR_CODE_'): + continue + + class Exc(SourmashError): + pass + + code = getattr(lib, attr) + if code < 100 or code > 10000: + Exc.__name__ = attr[20:].title().replace('_', '') + Exc.code = getattr(lib, attr) + globals()[Exc.__name__] = Exc + Exc.code = code + exceptions_by_code[code] = Exc + __all__.append(Exc.__name__) + else: + exceptions_by_code[code] = ValueError + +_make_exceptions() diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh deleted file mode 100644 index 47decc8105..0000000000 --- a/sourmash/kmer_min_hash.hh +++ /dev/null @@ -1,648 +0,0 @@ -#ifndef KMER_MIN_HASH_HH -#define KMER_MIN_HASH_HH - -#include -#include -#include -#include -#include -#include -#include - -#include "../third-party/smhasher/MurmurHash3.h" - -#define tbl \ - " "\ - /*ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz */\ - " TVGH FCD M KN YSAABW R TVGH FCD M KN YSAABW R" - -inline uint64_t _hash_murmur(const std::string& kmer, - const uint32_t seed) { - uint64_t out[2]; - out[0] = 0; out[1] = 0; - MurmurHash3_x64_128((void *)kmer.c_str(), kmer.size(), seed, &out); - return out[0]; -} - -/** - * @Synopsis Unsafe hash overload. Takes a const char * - * and assumes it has length ksize. - * - * @Param kmer The k-mer. - * @Param ksize Length of the k-mer. - * @Param seed Hashing seed. - * - * @Returns The hash value. - */ -inline uint64_t _hash_murmur(const char * kmer, - unsigned int ksize, - const uint32_t seed) { - uint64_t out[2]; - out[0] = 0; out[1] = 0; - MurmurHash3_x64_128((void *)kmer, ksize, seed, &out); - return out[0]; -} - -typedef uint64_t HashIntoType; - -typedef std::vector CMinHashType; - -class minhash_exception : public std::exception -{ -public: - explicit minhash_exception(const std::string& msg = "Generic minhash exception") - : _msg(msg) { } - - virtual ~minhash_exception() throw() { } - virtual const char* what() const throw () - { - return _msg.c_str(); - } - -protected: - const std::string _msg; -}; - -// Looks like a iterator but all it does is counts push_backs -struct Counter { - struct value_type { - template value_type(const T &) {} - }; - void push_back(const value_type &) { ++count; } - size_t count = 0; -}; - - -class KmerMinHash -{ -public: - const unsigned int num; - const unsigned int ksize; - const bool is_protein; - const bool dayhoff; - const bool hp; - const uint32_t seed; - const HashIntoType max_hash; - CMinHashType mins; - - KmerMinHash(unsigned int n, unsigned int k, bool prot, bool dyhoff, bool hp, uint32_t s, - HashIntoType mx) - : num(n), ksize(k), is_protein(prot), dayhoff(dyhoff), hp(hp), seed(s), max_hash(mx) { - if (n > 0) { - mins.reserve(num + 1); - } - // only reserve a finite amount of space for unbounded MinHashes - else { - mins.reserve(1000); - } - }; - - void check_compatible(const KmerMinHash& other) { - if (ksize != other.ksize) { - throw minhash_exception("different ksizes cannot be compared"); - } - if (is_protein != other.is_protein) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (dayhoff != other.dayhoff) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (hp != other.hp) { - throw minhash_exception("DNA/prot minhashes cannot be compared"); - } - if (max_hash != other.max_hash) { - throw minhash_exception("mismatch in max_hash; comparison fail"); - } - if (seed != other.seed) { - throw minhash_exception("mismatch in seed; comparison fail"); - } - } - - virtual void add_hash(const HashIntoType h) { - if ((max_hash and h <= max_hash) or not max_hash) { - if (mins.size() == 0) { - mins.push_back(h); - return; - } - else if (h <= max_hash or mins.back() > h or mins.size() < num) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - - // must still be growing, we know the list won't get too long - if (pos == mins.cend()) { - mins.push_back(h); - } - // inserting somewhere in the middle, if this value isn't already - // in mins store it and shrink list if needed - else if (*pos != h) { - mins.insert(pos, h); - if (num and mins.size() > num) { - mins.pop_back(); - } - } - } - } - } - - virtual void remove_hash(const HashIntoType h) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - if (pos != mins.cend() and *pos == h) { - mins.erase(pos); - } - } - - void add_word(const std::string& word) { - const HashIntoType hash = _hash_murmur(word, seed); - add_hash(hash); - } - - /** - * @Synopsis Unsafe overload: calls _hash_murmur assuming - * word is length ksize. - * - * @Param word k-mer to add. - */ - void add_word(const char * word, unsigned int size) { - const HashIntoType hash = _hash_murmur(word, size, seed); - add_hash(hash); - } - - void _invalid_kmer(const std::string& kmer) { - std::string msg = "invalid DNA character in input k-mer: "; - msg += kmer; - throw minhash_exception(msg); - } - - void add_sequence(std::string& seq, bool force=false) { - - if (seq.length() < ksize) { - return; - } - - std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper); - - if (!is_protein) { - auto rc = _revcomp(seq); - for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) { - auto fw_kmer = seq.c_str() + i; - auto rc_kmer = rc.c_str() + rc.length() - ksize - i; - - if (! _checkdna(fw_kmer, ksize)) { - if (force) { - continue; - } else { - _invalid_kmer(fw_kmer); - } - } - - if (std::lexicographical_compare(fw_kmer, - fw_kmer + ksize, - rc_kmer, - rc_kmer + ksize)) { - add_word(fw_kmer, ksize); - } else { - add_word(rc_kmer, ksize); - } - } - } else { // protein - std::string rc = _revcomp(seq); - for (unsigned int i = 0; i < 3; i++) { - std::string aa = _dna_to_aa(seq.substr(i, seq.length() - i)); - unsigned int aa_ksize = int(ksize / 3); - std::string kmer; - - for (unsigned int j = 0; j < aa.length() - aa_ksize + 1; j++) { - kmer = aa.substr(j, aa_ksize); - add_word(kmer); - } - - aa = _dna_to_aa(rc.substr(i, rc.length() - i)); - aa_ksize = int(ksize / 3); - - for (unsigned int j = 0; j < aa.length() - aa_ksize + 1; j++) { - kmer = aa.substr(j, aa_ksize); - add_word(kmer); - } - } - } - } - - std::string translate_codon(std::string& codon) { - std::string residue; - - if (codon.length() >= 2 && codon.length() <= 3){ - // If codon is length 2, pad with an N for ambiguous codon amino acids - if (codon.length() == 2) { - codon += "N"; - } - auto translated = _codon_table.find(codon); - - if (translated != _codon_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - residue = translated -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - residue = "X"; - } - } else if (codon.length() == 1){ - // Then we only have one nucleotides and the amino acid is unknown - residue = "X"; - } else { - std::string msg = "Codon is invalid length: "; - msg += codon; - throw minhash_exception(msg); - } - return residue; - } - - std::string _dna_to_aa(const std::string& dna) { - std::string aa; - std::string codon; - std::string residue; - unsigned int dna_size = (dna.size() / 3) * 3; // floor it - for (unsigned int j = 0; j < dna_size; j += 3) { - - codon = dna.substr(j, 3); - - residue = translate_codon(codon); - - // Use dayhoff encoding of amino acids - if (dayhoff) { - std::string new_letter = aa_to_dayhoff(residue); - aa += new_letter; - // Use hp encoding of amino acids - } else if (hp) { - std::string new_letter = aa_to_hp(residue); - aa += new_letter; - } - else { - aa += residue; - } - - } - return aa; - } - - /** - * @Synopsis Check that a single char is a DNA base. - * - * @Param c The character. - * - * @Returns True if valid, false otherwise. - */ - bool _checkdna(const char c) const { - switch(c) { - case 'A': - case 'C': - case 'G': - case 'T': - break; - default: - return false; - } - return true; - } - - /** - * @Synopsis Safe DNA sanity check for a sequence of - * arbitrary length. - * - * @Param seq The sequence. - * - * @Returns True if valid, false otherwise. - */ - bool _checkdna(const std::string& seq) const { - - for (size_t i=0; i < seq.length(); ++i) { - if (!_checkdna(seq[i])) { - return false; - } - } - return true; - } - - /** - * @Synopsis Unsafe k-mer DNA sanity check: doesn't check length. - * - * @Param kmer k-mer to check. - * - * @Returns true if sane; false otherwise. - */ - bool _checkdna(const char * kmer, unsigned int length) const { - - for (size_t i=0; i < length; ++i) { - if (!_checkdna(*(kmer + i))) { - return false; - } - } - return true; - } - - std::string _revcomp(const std::string& kmer) const { - std::string out = kmer; - - auto from = out.begin(); - auto to = out.end(); - - char c; - for (to--; from <= to; from++, to--) { - c = tbl[(int)*from]; - *from = tbl[(int)*to]; - *to = c; - } - - return out; - } - - std::string aa_to_dayhoff(const std::string& aa) const { - // Convert an amino acid letter to dayhoff encoding - std::string new_letter; - - auto dayhoff_encoded = _dayhoff_table.find(aa); - if (dayhoff_encoded != _dayhoff_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - new_letter = dayhoff_encoded -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - new_letter = "X"; - } - return new_letter; - } - - std::string aa_to_hp(const std::string& aa) const { - // Convert an amino acid letter to hp encoding - std::string new_letter; - - auto hp_encoded = _hp_table.find(aa); - if (hp_encoded != _hp_table.end()) { - // "second" is the element mapped to by the codon - // Because .find returns an iterator - new_letter = hp_encoded -> second; - } else { - // Otherwise, assign the "X" or "unknown" amino acid - new_letter = "X"; - } - return new_letter; - } - - virtual void merge(const KmerMinHash& other) { - check_compatible(other); - - CMinHashType merged; - merged.reserve(other.mins.size() + mins.size()); - std::set_union(other.mins.begin(), other.mins.end(), - mins.begin(), mins.end(), - std::back_inserter(merged)); - if (merged.size() < num or !num) { - mins = merged; - } - else { - mins = CMinHashType(std::begin(merged), std::begin(merged) + num); - } - } - - virtual unsigned int count_common(const KmerMinHash& other) { - check_compatible(other); - - Counter counter; - std::set_intersection(mins.begin(), mins.end(), - other.mins.begin(), other.mins.end(), - std::back_inserter(counter)); - return counter.count; - } - - virtual size_t size() { - return mins.size(); - } - - virtual ~KmerMinHash() throw() { } - -private: - std::map _codon_table = { - {"TTT", "F"}, {"TTC", "F"}, - {"TTA", "L"}, {"TTG", "L"}, - - {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, {"TCN", "S"}, - - {"TAT", "Y"}, {"TAC", "Y"}, - {"TAA", "*"}, {"TAG", "*"}, - - {"TGT", "C"}, {"TGC", "C"}, - {"TGA", "*"}, - {"TGG", "W"}, - - {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, {"CTN", "L"}, - - {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, {"CCN", "P"}, - - {"CAT", "H"}, {"CAC", "H"}, - {"CAA", "Q"}, {"CAG", "Q"}, - - {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, {"CGN", "R"}, - - {"ATT", "I"}, {"ATC", "I"}, {"ATA", "I"}, - {"ATG", "M"}, - - {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, {"ACN", "T"}, - - {"AAT", "N"}, {"AAC", "N"}, - {"AAA", "K"}, {"AAG", "K"}, - - {"AGT", "S"}, {"AGC", "S"}, - {"AGA", "R"}, {"AGG", "R"}, - - {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, {"GTN", "V"}, - - {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, {"GCN", "A"}, - - {"GAT", "D"}, {"GAC", "D"}, - {"GAA", "E"}, {"GAG", "E"}, - - {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}, {"GGN", "G"} - }; - - -// Dayhoff table from -// Peris, P., López, D., & Campos, M. (2008). -// IgTM: An algorithm to predict transmembrane domains and topology in -// proteins. BMC Bioinformatics, 9(1), 1029–11. -// http://doi.org/10.1186/1471-2105-9-367 -// -// Original source: -// Dayhoff M. O., Schwartz R. M., Orcutt B. C. (1978). -// A model of evolutionary change in proteins, -// in Atlas of Protein Sequence and Structure, -// ed Dayhoff M. O., editor. -// (Washington, DC: National Biomedical Research Foundation; ), 345–352. -// -// | Amino acid | Property | Dayhoff | -// |---------------|-----------------------|---------| -// | C | Sulfur polymerization | a | -// | A, G, P, S, T | Small | b | -// | D, E, N, Q | Acid and amide | c | -// | H, K, R | Basic | d | -// | I, L, M, V | Hydrophobic | e | -// | F, W, Y | Aromatic | f | - std::map _dayhoff_table = { - {"C", "a"}, - - {"A", "b"}, {"G", "b"}, {"P", "b"}, {"S", "b"}, {"T", "b"}, - - {"D", "c"}, {"E", "c"}, {"N", "c"}, {"Q", "c"}, - - {"H", "d"}, {"K", "d"}, {"R", "d"}, - - {"I", "e"}, {"L", "e"}, {"M", "e"}, {"V", "e"}, - - {"F", "f"}, {"W", "f"}, {"Y", "f"} - - }; - -// HP Hydrophobic/hydrophilic mapping -// From: Phillips, R., Kondev, J., Theriot, J. (2008). -// Physical Biology of the Cell. New York: Garland Science, Taylor & Francis Group. ISBN: 978-0815341635 - -// -// | Amino acid | HP -// |---------------------------------------|---------| -// | A, F, G, I, L, M, P, V, W, Y | h | -// | N, C, S, T, D, E, R, H, K, Q | p | - std::map _hp_table = { - {"A", "h"}, {"F", "h"}, {"G", "h"}, {"I", "h"}, - {"L", "h"}, {"M", "h"}, {"P", "h"}, {"V", "h"}, - {"W", "h"}, {"Y", "h"}, {"N", "p"}, {"C", "p"}, - {"S", "p"}, {"T", "p"}, {"D", "p"}, {"E", "p"}, - {"R", "p"}, {"H", "p"}, {"K", "p"}, {"Q", "p"} - }; - -}; - -class KmerMinAbundance: public KmerMinHash { - public: - CMinHashType abunds; - - KmerMinAbundance(unsigned int n, unsigned int k, bool prot, bool dayhoff, - bool hp, uint32_t seed, HashIntoType mx) : - KmerMinHash(n, k, prot, dayhoff, hp, seed, mx) { }; - - virtual void add_hash(HashIntoType h) { - if ((max_hash and h <= max_hash) or not max_hash) { - // empty? add it, if within range / no range specified. - if (mins.size() == 0) { - mins.push_back(h); - abunds.push_back(1); - return; - } else if (h <= max_hash or mins.back() > h or mins.size() < num) { - // "good" hash - within range, smaller than current entry, or - // still space. - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - - // at end -- must still be growing, we know the list won't get too - // long - if (pos == mins.cend()) { - mins.push_back(h); - abunds.push_back(1); - } else if (*pos != h) { - // didn't find hash already in mins, so - // inserting somewhere in the middle; shrink list if needed. - - // calculate distance for use w/abunds *before* insert, as - // 'mins.insert' may invalidate 'pos'. - size_t dist = std::distance(begin(mins), pos); - mins.insert(pos, h); - abunds.insert(begin(abunds) + dist, 1); - - // now too big? if so, continue. - if (mins.size() > num and not max_hash) { - mins.pop_back(); - abunds.pop_back(); - } - } else { // *pos == h - hash value already there, increment count. - auto p = std::distance(begin(mins), pos); - abunds[p] += 1; - } - } - } - } - - virtual void remove_hash(const HashIntoType h) { - auto pos = std::lower_bound(std::begin(mins), std::end(mins), h); - if (pos != mins.cend() and *pos == h) { - mins.erase(pos); - size_t dist = std::distance(begin(mins), pos); - abunds.erase(begin(abunds) + dist); - } - } - - virtual void merge(const KmerMinAbundance& other) { - check_compatible(other); - - CMinHashType merged_mins; - CMinHashType merged_abunds; - size_t max_size = other.mins.size() + mins.size(); - - merged_mins.reserve(max_size); - merged_abunds.reserve(max_size); - - auto it1_m = mins.begin(); - auto it2_m = other.mins.begin(); - auto out_m = std::back_inserter(merged_mins); - - auto it1_a = abunds.begin(); - auto it2_a = other.abunds.begin(); - auto out_a = std::back_inserter(merged_abunds); - - for (; it1_m != mins.end(); ++out_m, ++out_a) { - if (it2_m == other.mins.end()) { - /* we reached the end of other.mins, - so just copy the remainder of mins to the output */ - std::copy(it1_m, mins.end(), out_m); - std::copy(it1_a, abunds.end(), out_a); - break; - } - if (*it2_m < *it1_m) { - /* other.mins is smaller than mins, - so copy it to output and advance other.mins iterators */ - *out_m = *it2_m; - *out_a = *it2_a; - ++it2_m; - ++it2_a; - } else if (*it2_m == *it1_m) { - /* same value in both mins, so sums the abundances - on the output and advances all iterators */ - *out_m = *it1_m; - *out_a = *it1_a + *it2_a; - ++it1_m; ++it1_a; - ++it2_m; ++it2_a; - } else { - /* mins is smaller than other.mins, - so copy it to output and advance the mins iterators */ - *out_m = *it1_m; - *out_a = *it1_a; - ++it1_m; - ++it1_a; - } - } - /* we reached the end of mins/abunds, - so just copy the remainder of other to the output - (other might already be at the end, in this case nothing happens) */ - std::copy(it2_m, other.mins.end(), out_m); - std::copy(it2_a, other.abunds.end(), out_a); - - if (merged_mins.size() < num || !num) { - mins = merged_mins; - abunds = merged_abunds; - } else { - mins = CMinHashType(std::begin(merged_mins), std::begin(merged_mins) + num); - abunds = CMinHashType(std::begin(merged_abunds), std::begin(merged_abunds) + num); - } - } - - virtual size_t size() { - return mins.size(); - } - -}; - -#endif // KMER_MIN_HASH_HH diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 2289e0cb94..0f7b46ae9c 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -54,10 +54,11 @@ def save(self, path): return self.storage.save(path, buf.getvalue()) def update(self, parent): - for v in self.data.minhash.get_mins(): + mh = self.data.minhash + for v in mh.get_mins(): parent.data.count(v) min_n_below = parent.metadata.get('min_n_below', sys.maxsize) - min_n_below = min(len(self.data.minhash), min_n_below) + min_n_below = min(len(mh), min_n_below) if min_n_below == 0: min_n_below = 1 diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index b0d29a8efe..10c9765107 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -268,10 +268,13 @@ def merge(args): mh.track_abundance = False try: + sigobj_mh = sigobj.minhash if not args.flatten: _check_abundance_compatibility(first_sig, sigobj) + else: + sigobj_mh.track_abundance = False - mh.merge(sigobj.minhash) + mh.merge(sigobj_mh) except: error("ERROR when merging signature '{}' ({}) from file {}", sigobj.name(), sigobj.md5sum()[:8], sigfile) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 542559a9a5..e496173737 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -215,11 +215,12 @@ def __iter__(self): self.ksizes.add(query_ksize) self.moltypes.add(query_moltype) - yield filename, query, query_moltype, query_ksize - if len(self.ksizes) > 1 or len(self.moltypes) > 1: raise ValueError('multiple k-mer sizes/molecule types present') + for query in sl: + yield filename, query, query_moltype, query_ksize + def traverse_find_sigs(dirnames, yield_all_files=False): for dirname in dirnames: diff --git a/sourmash/utils.py b/sourmash/utils.py new file mode 100644 index 0000000000..4fb2835fd8 --- /dev/null +++ b/sourmash/utils.py @@ -0,0 +1,78 @@ +import weakref + +from ._lowlevel import ffi, lib +from .exceptions import exceptions_by_code, SourmashError + +attached_refs = weakref.WeakKeyDictionary() + + +class RustObject(object): + __dealloc_func__ = None + _objptr = None + _shared = False + + def __init__(self): + raise TypeError("Cannot instanciate %r objects" % self.__class__.__name__) + + @classmethod + def _from_objptr(cls, ptr, shared=False): + rv = object.__new__(cls) + rv._objptr = ptr + rv._shared = shared + return rv + + def _methodcall(self, func, *args): + return rustcall(func, self._get_objptr(), *args) + + def _get_objptr(self): + if not self._objptr: + raise RuntimeError("Object is closed") + return self._objptr + + def __del__(self): + if self._objptr is None or self._shared: + return + f = self.__class__.__dealloc_func__ + if f is not None: + rustcall(f, self._objptr) + self._objptr = None + + +def decode_str(s, free=False): + """Decodes a SourmashStr""" + try: + if s.len == 0: + return u"" + return ffi.unpack(s.data, s.len).decode("utf-8", "replace") + finally: + if free: + lib.sourmash_str_free(ffi.addressof(s)) + + +def encode_str(s): + """Encodes a SourmashStr""" + rv = ffi.new("SourmashStr *") + if isinstance(s, text_type): + s = s.encode("utf-8") + rv.data = ffi.from_buffer(s) + rv.len = len(s) + # we have to hold a weak reference here to ensure our string does not + # get collected before the string is used. + attached_refs[rv] = s + return rv + + +def rustcall(func, *args): + """Calls rust method and does some error handling.""" + lib.sourmash_err_clear() + rv = func(*args) + err = lib.sourmash_err_get_last_code() + if not err: + return rv + msg = lib.sourmash_err_get_last_message() + cls = exceptions_by_code.get(err, SourmashError) + exc = cls(decode_str(msg)) + backtrace = decode_str(lib.sourmash_err_get_backtrace()) + if backtrace: + exc.rust_info = backtrace + raise exc diff --git a/src/bin/smrs.rs b/src/bin/smrs.rs index b58f2c44ee..b3fee5264d 100644 --- a/src/bin/smrs.rs +++ b/src/bin/smrs.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::fs::File; use std::io; use std::path::Path; @@ -24,6 +25,7 @@ use sourmash::index::search::{ use sourmash::index::storage::{FSStorage, Storage}; use sourmash::index::{Comparable, Index, MHBT}; use sourmash::signature::{Signature, SigsTrait}; +use sourmash::sketch::minhash::HashFunctions; use sourmash::sketch::Sketch; pub fn index( @@ -126,6 +128,12 @@ fn load_query_signature( moltype: Option<&str>, scaled: Option, ) -> Result, Error> { + let moltype: Option = if let Some(mol) = moltype { + Some(mol.try_into()?) + } else { + None + }; + let mut reader = io::BufReader::new(File::open(query)?); let sigs = Signature::load_signatures(&mut reader, ksize, moltype, scaled)?; diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs index 2f84fab499..d9ec05397b 100644 --- a/src/ffi/minhash.rs +++ b/src/ffi/minhash.rs @@ -297,7 +297,7 @@ unsafe fn kmerminhash_enable_abundance(ptr: *mut KmerMinHash) -> Result<()> { &mut *ptr }; - if mh.mins.is_empty() { + if !mh.mins.is_empty() { return Err(SourmashError::NonEmptyMinHash { message: "track_abundance=True".into()}.into()); } diff --git a/src/ffi/signature.rs b/src/ffi/signature.rs index 56e147ebe3..2d6984347b 100644 --- a/src/ffi/signature.rs +++ b/src/ffi/signature.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::ffi::CStr; use std::io; use std::os::raw::c_char; @@ -8,7 +9,7 @@ use serde_json; use crate::ffi::utils::SourmashStr; use crate::signature::Signature; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{HashFunctions, KmerMinHash}; use crate::sketch::Sketch; // Signature methods @@ -233,12 +234,11 @@ unsafe fn signatures_load_path(ptr: *const c_char, CStr::from_ptr(ptr) }; - let moltype = { - if select_moltype.is_null() { + let moltype: Option = if select_moltype.is_null() { None } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } + let mol = CStr::from_ptr(select_moltype).to_str()?; + Some(mol.try_into()?) }; // TODO: implement ignore_md5sum @@ -274,12 +274,11 @@ unsafe fn signatures_load_buffer(ptr: *const c_char, slice::from_raw_parts(ptr as *mut u8, insize) }; - let moltype = { - if select_moltype.is_null() { + let moltype: Option = if select_moltype.is_null() { None } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } + let mol = CStr::from_ptr(select_moltype).to_str()?; + Some(mol.try_into()?) }; let k = match ksize { diff --git a/src/index/bigsi.rs b/src/index/bigsi.rs index 6fac529a05..141211f584 100644 --- a/src/index/bigsi.rs +++ b/src/index/bigsi.rs @@ -155,6 +155,7 @@ impl<'a> Index<'a> for BIGSI { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::BufReader; use std::path::PathBuf; @@ -179,8 +180,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let sig_data = sigs[0].clone(); let leaf: SigStore<_> = sig_data.into(); diff --git a/src/index/sbt/mhbt.rs b/src/index/sbt/mhbt.rs index b26975f6de..565468a278 100644 --- a/src/index/sbt/mhbt.rs +++ b/src/index/sbt/mhbt.rs @@ -153,14 +153,15 @@ impl ReadData for Node { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; use std::rc::Rc; - use tempfile; use assert_matches::assert_matches; use lazy_init::Lazy; + use tempfile; use super::Factory; @@ -207,8 +208,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let sig_data = sigs[0].clone(); let data = Lazy::new(); @@ -290,7 +296,12 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename)?); - let sigs = Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None)?; + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + )?; let sig_data = sigs[0].clone(); let leaf: SigStore<_> = sig_data.into(); diff --git a/src/signature.rs b/src/signature.rs index 07823a43c3..914c962f3e 100644 --- a/src/signature.rs +++ b/src/signature.rs @@ -14,6 +14,7 @@ use typed_builder::TypedBuilder; use crate::errors::SourmashError; use crate::index::storage::ToWriter; +use crate::sketch::minhash::HashFunctions; use crate::sketch::Sketch; pub trait SigsTrait { @@ -156,7 +157,7 @@ impl Signature { pub fn load_signatures( buf: &mut R, ksize: Option, - moltype: Option<&str>, + moltype: Option, _scaled: Option, ) -> Result, Error> where @@ -190,9 +191,7 @@ impl Signature { match moltype { Some(x) => { - if (x.to_lowercase() == "dna" && !mh.is_protein()) - || (x.to_lowercase() == "protein" && mh.is_protein()) - { + if mh.hash_function() == x { return true; } } @@ -208,7 +207,7 @@ impl Signature { match moltype { Some(x) => { - if x.to_lowercase() == "dna" { + if x == HashFunctions::murmur64_DNA { return true; } else { // TODO: draff only supports dna for now @@ -285,6 +284,7 @@ impl PartialEq for Signature { #[cfg(test)] mod test { + use std::convert::TryInto; use std::fs::File; use std::io::BufReader; use std::path::PathBuf; @@ -297,8 +297,13 @@ mod test { filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = - Signature::load_signatures(&mut reader, Some(31), Some("DNA".into()), None).unwrap(); + let sigs = Signature::load_signatures( + &mut reader, + Some(31), + Some("DNA".try_into().unwrap()), + None, + ) + .unwrap(); let _sig_data = sigs[0].clone(); // TODO: check sig_data } diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs index 427ccb977d..088d2a4fd3 100644 --- a/src/sketch/minhash.rs +++ b/src/sketch/minhash.rs @@ -1,14 +1,14 @@ -use serde::de::{Deserialize, Deserializer}; -use serde::ser::{Serialize, SerializeStruct, Serializer}; -use serde_derive::Deserialize; - use std::cmp::Ordering; use std::collections::HashMap; +use std::convert::TryFrom; use std::iter::{Iterator, Peekable}; use std::str; use failure::Error; use lazy_static::lazy_static; +use serde::de::{Deserialize, Deserializer}; +use serde::ser::{Serialize, SerializeStruct, Serializer}; +use serde_derive::Deserialize; use crate::_hash_murmur; use crate::errors::SourmashError; @@ -27,6 +27,20 @@ pub enum HashFunctions { murmur64_hp = 4, } +impl TryFrom<&str> for HashFunctions { + type Error = Error; + + fn try_from(moltype: &str) -> Result { + match moltype.to_lowercase().as_ref() { + "dna" => Ok(HashFunctions::murmur64_DNA), + "dayhoff" => Ok(HashFunctions::murmur64_dayhoff), + "hp" => Ok(HashFunctions::murmur64_hp), + "protein" => Ok(HashFunctions::murmur64_protein), + _ => unimplemented!(), + } + } +} + #[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)] #[derive(Debug, Clone, PartialEq)] pub struct KmerMinHash { @@ -177,6 +191,10 @@ impl KmerMinHash { self.hash_function == HashFunctions::murmur64_protein } + fn is_dna(&self) -> bool { + self.hash_function == HashFunctions::murmur64_DNA + } + pub fn seed(&self) -> u64 { self.seed } @@ -523,8 +541,7 @@ impl SigsTrait for KmerMinHash { .map(|&x| (x as char).to_ascii_uppercase() as u8) .collect(); if sequence.len() >= (self.ksize as usize) { - if !self.is_protein() { - // dna + if self.is_dna() { for kmer in sequence.windows(self.ksize as usize) { if _checkdna(kmer) { let rc = revcomp(kmer); diff --git a/tests/minhash.rs b/tests/minhash.rs index af0853e69f..8b38d5b8ac 100644 --- a/tests/minhash.rs +++ b/tests/minhash.rs @@ -73,6 +73,6 @@ fn dayhoff() { a.add_sequence(b"ACTGAC", false).unwrap(); b.add_sequence(b"ACTGAC", false).unwrap(); - assert_eq!(a.size(), 1); + assert_eq!(a.size(), 2); assert_eq!(b.size(), 2); } diff --git a/tests/smrs_cmd.rs b/tests/smrs_cmd.rs index c203099158..54ff329286 100644 --- a/tests/smrs_cmd.rs +++ b/tests/smrs_cmd.rs @@ -2,7 +2,6 @@ use std::fs; use std::process::Command; use assert_cmd::prelude::*; -use predicates::prelude::*; use predicates::str::contains; use tempfile::TempDir; diff --git a/tests/test__minhash.py b/tests/test__minhash.py index e994fcfde2..1c18a57bed 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -42,12 +42,17 @@ import pytest import sourmash -from sourmash._minhash import (MinHash, hash_murmur, dotproduct, - get_scaled_for_max_hash, - get_max_hash_for_scaled) -from . import sourmash_tst_utils as utils +from sourmash._minhash import ( + MinHash, + hash_murmur, + dotproduct, + get_scaled_for_max_hash, + get_max_hash_for_scaled, +) from sourmash import signature +from . import sourmash_tst_utils as utils + # add: # * get default params from Python # * keyword args for minhash constructor @@ -69,6 +74,7 @@ def test_basic_dna(track_abundance): print(a, b) assert a == b assert len(b) == 1 + assert a[0] == b[0] == 12415348535738636339 def test_div_zero(track_abundance): @@ -95,12 +101,12 @@ def test_bytes_dna(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) mh.add_sequence('ATGC') mh.add_sequence(b'ATGC') - mh.add_sequence(u'ATGC') + mh.add_sequence('ATGC') a = mh.get_mins() mh.add_sequence('GCAT') # this will not get added; hash > ATGC mh.add_sequence(b'GCAT') # this will not get added; hash > ATGC - mh.add_sequence(u'GCAT') # this will not get added; hash > ATGC + mh.add_sequence('GCAT') # this will not get added; hash > ATGC b = mh.get_mins() print(a, b) @@ -112,7 +118,7 @@ def test_bytes_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) mh.add_protein('AGYYG') - mh.add_protein(u'AGYYG') + mh.add_protein('AGYYG') mh.add_protein(b'AGYYG') assert len(mh.get_mins()) == 4 @@ -254,6 +260,13 @@ def test_max_hash_conversion(): assert new_scaled == SCALED +def test_max_hash_and_scaled_zero(): + max_hash = get_max_hash_for_scaled(0) + new_scaled = get_scaled_for_max_hash(0) + assert max_hash == new_scaled + assert max_hash == 0 + + def test_max_hash_and_scaled_error(track_abundance): # test behavior when supplying both max_hash and scaled with pytest.raises(ValueError): @@ -369,6 +382,28 @@ def test_compare_1(track_abundance): assert b.compare(b) == 1.0 +def test_intersection_errors(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + b = MinHash(20, 10, track_abundance=track_abundance) + c = MinHash(30, 10, track_abundance=track_abundance) + + a.add_sequence("TGCCGCCCAGCA") + b.add_sequence("TGCCGCCCAGCA") + + common = set(a.get_mins()) + combined_size = 3 + + intersection, size = a.intersection(b, in_common=False) + assert intersection == set() + assert combined_size == size + + with pytest.raises(TypeError): + a.intersection(set()) + + with pytest.raises(TypeError): + a.intersection(c) + + def test_intersection_1(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) @@ -379,38 +414,38 @@ def test_intersection_1(track_abundance): common = set(a.get_mins()) combined_size = 3 - intersection, size = a.intersection(b) + intersection, size = a.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == common assert combined_size == size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == common assert combined_size == size # add same sequence again b.add_sequence('TGCCGCCCAGCA') - intersection, size = a.intersection(b) + intersection, size = a.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == common assert combined_size == size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == common assert combined_size == size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == common assert combined_size == size @@ -420,18 +455,18 @@ def test_intersection_1(track_abundance): new_in_common = set(a.get_mins()).intersection(set(b.get_mins())) new_combined_size = 8 - intersection, size = a.intersection(b) + intersection, size = a.intersection(b, in_common=True) assert intersection == new_in_common assert size == new_combined_size - intersection, size = b.intersection(a) + intersection, size = b.intersection(a, in_common=True) assert intersection == new_in_common assert size == new_combined_size - intersection, size = a.intersection(a) + intersection, size = a.intersection(a, in_common=True) assert intersection == set(a.get_mins()) - intersection, size = b.intersection(b) + intersection, size = b.intersection(b, in_common=True) assert intersection == set(b.get_mins()) @@ -510,6 +545,20 @@ def test_mh_count_common_diff_ksize(track_abundance): a.count_common(b) +def test_mh_count_common_notmh(track_abundance): + a = MinHash(20, 5, track_abundance=track_abundance) + b = set() + + with pytest.raises(TypeError): + a.count_common(b) + + +def test_mh_downsample_n_error(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + with pytest.raises(ValueError): + a.downsample_n(30) + + def test_mh_asymmetric(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) for i in range(0, 40, 2): @@ -531,6 +580,12 @@ def test_mh_asymmetric(track_abundance): assert b.compare(a) == 0.5 +def test_mh_merge_typeerror(track_abundance): + a = MinHash(20, 10, track_abundance=track_abundance) + with pytest.raises(TypeError): + a.merge(set()) + + def test_mh_merge(track_abundance): # test merging two identically configured minhashes a = MinHash(20, 10, track_abundance=track_abundance) @@ -596,7 +651,7 @@ def test_mh_merge_check_length(track_abundance): b.add_hash(i) c = a.merge(b) - assert(len(c.get_mins()) == 20) + assert len(c.get_mins()) == 20 def test_mh_merge_check_length2(track_abundance): @@ -612,7 +667,7 @@ def test_mh_merge_check_length2(track_abundance): b.add_hash(4) c = a.merge(b) - assert(len(c.get_mins()) == 3) + assert len(c.get_mins()) == 3 def test_mh_asymmetric_merge(track_abundance): diff --git a/tests/test_rustobj.py b/tests/test_rustobj.py new file mode 100644 index 0000000000..4be7a0e1ee --- /dev/null +++ b/tests/test_rustobj.py @@ -0,0 +1,18 @@ +import pytest + +from sourmash.utils import RustObject +from sourmash._minhash import to_bytes + + +def test_rustobj_init(): + with pytest.raises(TypeError): + RustObject() + + +def test_to_bytes(): + with pytest.raises(TypeError): + to_bytes([9882]) + + assert to_bytes(98) == bytes([98]) + assert to_bytes("abc") == b"abc" + assert to_bytes(b"abc") == b"abc" diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index e9c4a5ecc5..c9073d4e4e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -67,7 +67,7 @@ def test_do_serial_compare(c): cmp_outfile = c.output('cmp') assert os.path.exists(cmp_outfile) - cmp_out = numpy.load(cmp_outfile.encode('utf-8')) + cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: diff --git a/third-party/.gitignore b/third-party/.gitignore deleted file mode 100644 index 2ced0cd228..0000000000 --- a/third-party/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -zlib/example -zlib/example64 -zlib/examplesh -zlib/minigzip -zlib/minigzip64 -zlib/minigzipsh diff --git a/third-party/smhasher/MurmurHash3.cc b/third-party/smhasher/MurmurHash3.cc deleted file mode 100644 index 758a230d5d..0000000000 --- a/third-party/smhasher/MurmurHash3.cc +++ /dev/null @@ -1,340 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. - -#include "MurmurHash3.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4)) -/* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. Don't inline for RHEL 5 gcc which is 4.1*/ -#define FORCE_INLINE inline __attribute__((always_inline)) -#else -#define FORCE_INLINE -#endif - -inline uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -inline uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) -{ - return p[i]; -} - -FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) -{ - return p[i]; -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -FORCE_INLINE uint32_t fmix ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -FORCE_INLINE uint64_t fmix ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix(h1); - - *(uint32_t*)out = h1; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - const uint32_t c1 = 0x239b961b; - const uint32_t c2 = 0xab0e9789; - const uint32_t c3 = 0x38b34ae5; - const uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i*4+0); - uint32_t k2 = getblock(blocks,i*4+1); - uint32_t k3 = getblock(blocks,i*4+2); - uint32_t k4 = getblock(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - h3 = fmix(h3); - h4 = fmix(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - ((uint32_t*)out)[0] = h1; - ((uint32_t*)out)[1] = h2; - ((uint32_t*)out)[2] = h3; - ((uint32_t*)out)[3] = h4; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - for(int i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock(blocks,i*2+0); - uint64_t k2 = getblock(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= uint64_t(tail[14]) << 48; - case 14: k2 ^= uint64_t(tail[13]) << 40; - case 13: k2 ^= uint64_t(tail[12]) << 32; - case 12: k2 ^= uint64_t(tail[11]) << 24; - case 11: k2 ^= uint64_t(tail[10]) << 16; - case 10: k2 ^= uint64_t(tail[ 9]) << 8; - case 9: k2 ^= uint64_t(tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= uint64_t(tail[ 7]) << 56; - case 7: k1 ^= uint64_t(tail[ 6]) << 48; - case 6: k1 ^= uint64_t(tail[ 5]) << 40; - case 5: k1 ^= uint64_t(tail[ 4]) << 32; - case 4: k1 ^= uint64_t(tail[ 3]) << 24; - case 3: k1 ^= uint64_t(tail[ 2]) << 16; - case 2: k1 ^= uint64_t(tail[ 1]) << 8; - case 1: k1 ^= uint64_t(tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - - h1 += h2; - h2 += h1; - - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; -} - -//----------------------------------------------------------------------------- - diff --git a/third-party/smhasher/MurmurHash3.h b/third-party/smhasher/MurmurHash3.h deleted file mode 100644 index 54e9d3f9e3..0000000000 --- a/third-party/smhasher/MurmurHash3.h +++ /dev/null @@ -1,37 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -typedef unsigned char uint8_t; -typedef unsigned long uint32_t; -typedef unsigned __int64 uint64_t; - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_