From 9a2ccc10a58a945b7fadaa00a0778d59e5f208d5 Mon Sep 17 00:00:00 2001
From: Luiz Irber
Date: Fri, 8 Jul 2022 18:39:26 -0700
Subject: [PATCH] rkyv support in signature

try to skip md5sum
derive typed builder for GatherResult
expose md5
start moving mastiff
remove unused bigsi and sbt indices
Expose iterator methods for Signature
remove unused function
preparing for MinHashOps
Bump version, there will be many breaking changes...
more splits
default to large minhash, expose frozen
use enum for MinHash thru FFI
bug fixes and cleanup
add c++ stdlib package for mamba
fix finch feature
try out roaring
Add back check command
initial update impl
add semver check for rust
bump once_cell
fix rust ci
start bringing #1943
more selection
more picklist
use dashmap in mem_revindex
Revert "use dashmap in mem_revindex"

This reverts commit 22727b7091dee2dbafd10ce707578c5085aa6cfc.

bump rocksdb to 0.19
bump rocksdb to 0.20, bump MSRV to 1.60
update deps
add cargo deny config
use cibuildwheel configs in pyproject.toml
flake cleanup
fix cargo.lock
tox updates
don't worry with in-mem sigs for now
---
 .github/workflows/dev_envs.yml | 2 +-
 .github/workflows/rust.yml | 6 +
 Cargo.lock | 409 ++++++-
 Makefile | 2 +
 deny.toml | 1 +
 doc/developer.md | 2 +-
 flake.nix | 1 +
 include/sourmash.h | 106 +-
 pyproject.toml | 2 +-
 src/core/Cargo.toml | 17 +-
 src/core/benches/index.rs | 83 --
 src/core/src/encodings.rs | 5 +
 src/core/src/errors.rs | 5 +
 src/core/src/ffi/hyperloglog.rs | 12 +-
 src/core/src/ffi/index/mod.rs | 166 +++
 src/core/src/ffi/index/revindex.rs | 176 ++-
 src/core/src/ffi/manifest.rs | 73 ++
 src/core/src/ffi/minhash.rs | 408 ++++++-
 src/core/src/ffi/mod.rs | 2 +
 src/core/src/ffi/nodegraph.rs | 13 +-
 src/core/src/ffi/picklist.rs | 89 ++
 src/core/src/ffi/signature.rs | 128 +-
 src/core/src/ffi/storage.rs | 7 +-
 src/core/src/from.rs | 1 +
 src/core/src/index/bigsi.rs | 218 ----
 src/core/src/index/linear.rs | 32 +-
 src/core/src/index/mod.rs | 132 ++-
 src/core/src/index/revindex.rs | 699 -----------
 src/core/src/index/revindex/disk_revindex.rs | 549 +++++++++
 src/core/src/index/revindex/mem_revindex.rs | 1118 ++++++++++++++++++
 src/core/src/index/revindex/mod.rs | 509 ++++++++
 src/core/src/index/sbt/mhbt.rs | 361 ------
 src/core/src/index/sbt/mhmt.rs | 227 ----
 src/core/src/index/sbt/mod.rs | 878 --------------
 src/core/src/lib.rs | 2 +
 src/core/src/manifest.rs | 186 +++
 src/core/src/picklist.rs | 29 +
 src/core/src/prelude.rs | 2 +
 src/core/src/signature.rs | 139 ++-
 src/core/src/sketch/hyperloglog/mod.rs | 25 +-
 src/core/src/sketch/minhash.rs | 956 +++++++------
 src/core/src/sketch/mod.rs | 4 +
 src/core/src/sketch/nodegraph.rs | 11 +-
 src/core/src/storage.rs | 9 +-
 src/core/tests/minhash.rs | 1 +
 src/core/tests/storage.rs | 38 +
 src/sourmash/hll.py | 4 +-
 src/sourmash/index/__init__.py | 229 +++-
 src/sourmash/manifest.py | 36 +
 src/sourmash/minhash.py | 16 +-
 src/sourmash/nodegraph.py | 15 +-
 src/sourmash/picklist.py | 18 +
 src/sourmash/sbt_storage.py | 4 +-
 src/sourmash/signature.py | 16 +-
 src/sourmash/utils.py | 7 +
 tests/test_index.py | 3 +
 tox.ini | 7 +-
 57 files changed, 5082 insertions(+), 3114 deletions(-)
 delete mode 100644 src/core/benches/index.rs
 create mode 100644 src/core/src/ffi/manifest.rs
 create mode 100644 src/core/src/ffi/picklist.rs
 delete mode 100644 src/core/src/index/bigsi.rs
 delete mode 100644 src/core/src/index/revindex.rs
 create mode 100644 src/core/src/index/revindex/disk_revindex.rs
 create mode 100644 src/core/src/index/revindex/mem_revindex.rs
 create mode 100644 src/core/src/index/revindex/mod.rs
 delete mode 100644 src/core/src/index/sbt/mhbt.rs
 delete mode 100644 src/core/src/index/sbt/mhmt.rs
 delete mode 100644 src/core/src/index/sbt/mod.rs
 create mode 100644 src/core/src/manifest.rs
 create mode 100644 src/core/src/picklist.rs

diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml
index 33b5069d00..5c8c359bfb 100644
--- a/.github/workflows/dev_envs.yml
+++ b/.github/workflows/dev_envs.yml
@@ -57,7 +57,7 @@ jobs:
 
     - name: install dependencies
       shell: bash -l {0}
-      run: mamba install 'tox>=3.27,<4' tox-conda rust git compilers pandoc
+      run: mamba install 'tox>=3.27,<4' tox-conda rust git compilers pandoc libstdcxx-ng
 
     - name: run tests for 3.9
       shell: bash -l {0}
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 300169efa6..89c87c7551 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -234,6 +234,12 @@ jobs:
           toolchain: stable
           override: true
 
+      - name: Check semver
+        uses: obi1kenobi/cargo-semver-checks-action@v2
+        with:
+          crate-name: sourmash
+          version-tag-prefix: r
+
       - name: Make sure we can publish the sourmash crate
         uses: actions-rs/cargo@v1
         with:
diff --git a/Cargo.lock b/Cargo.lock
index 560ef1c702..248d7a5864 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8,6 +8,17 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
+[[package]]
+name = "ahash"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
+dependencies = [
+ "getrandom",
+ "once_cell",
+ "version_check",
+]
+
 [[package]]
 name = "aliasable"
 version = "0.1.3"
@@ -59,6 +70,12 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
 
+[[package]]
+name = "binary-merge"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -68,6 +85,26 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bindgen"
+version = "0.64.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
+dependencies = [
+ "bitflags 1.3.2",
+ "cexpr",
+ "clang-sys",
+ "lazy_static",
+ "lazycell",
+ "peeking_take_while",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 1.0.104",
+]
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -80,18 +117,6 @@ version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
 
-[[package]]
-name = "bstr"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
-dependencies = [
- "lazy_static",
- "memchr",
- "regex-automata",
- "serde",
-]
-
 [[package]]
 name = "buffer-redux"
 version = "1.0.0"
@@ -108,12 +133,39 @@ version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
 
+[[package]]
+name = "bytecheck"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = 
"3a31f923c2db9513e4298b72df143e6e655a759b3d6a0966df18f81223fff54f" +dependencies = [ + "bytecheck_derive", + "ptr_meta", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb17c862a905d912174daa27ae002326fff56dc8b8ada50a0a5f0976cb174f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "bytecount" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +[[package]] +name = "bytemuck" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" + [[package]] name = "byteorder" version = "1.4.3" @@ -164,6 +216,18 @@ name = "cc" version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] [[package]] name = "cfg-if" @@ -213,6 +277,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.3.0" @@ -374,13 +449,12 @@ dependencies = [ [[package]] name = "csv" -version = "1.1.6" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" dependencies = [ - "bstr", "csv-core", - "itoa 0.4.8", + "itoa", "ryu", "serde", ] @@ -396,9 +470,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" +checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" dependencies = [ "cc", "cxxbridge-flags", @@ -408,9 +482,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" +checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" dependencies = [ "cc", "codespan-reporting", @@ -423,15 +497,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" +checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" [[package]] name = "cxxbridge-macro" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" +checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" dependencies = [ "proc-macro2", "quote", @@ -531,12 +605,27 @@ dependencies = [ "syn 1.0.104", 
] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hashbrown" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db0d4cf898abf0081f964436dc980e96670a0f36863e4b83aaacdb65c9d7ccc3" +dependencies = [ + "ahash", +] + [[package]] name = "heck" version = "0.4.1" @@ -558,6 +647,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +[[package]] +name = "histogram" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" + [[package]] name = "iana-time-zone" version = "0.1.53" @@ -582,6 +677,15 @@ dependencies = [ "cxx-build", ] +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "io-lifetimes" version = "1.0.11" @@ -616,15 +720,18 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.8" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] -name = "itoa" -version = "1.0.1" +name = "jobserver" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] [[package]] name = "js-sys" @@ -641,18 +748,61 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libm" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +[[package]] +name = "librocksdb-sys" +version = "0.10.0+7.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fe4d5874f5ff2bc616e55e8c6086d478fcda13faf9495768a4aa1c22042d30b" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "glob", + "libc", + "libz-sys", + "lz4-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "link-cplusplus" version = "1.0.8" @@ -680,6 +830,16 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "lzma-sys" version = "0.1.17" @@ -731,6 +891,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.4.4" @@ -763,9 +929,9 @@ dependencies = [ [[package]] name = "niffler" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68c7ffd42bdba05fc9fbfda31283d44c5c8a88fed1a191f68795dba23cc8204b" +checksum = "470dd05a938a5ad42c2cb80ceea4255e275990ee530b86ca164e6d8a19fa407f" dependencies = [ "cfg-if", "flate2", @@ -778,6 +944,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -819,6 +995,15 @@ dependencies = [ "libc", ] +[[package]] +name = "numsep" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" +dependencies = [ + "slicestring", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -855,6 +1040,12 @@ dependencies = [ "syn 2.0.23", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "piz" version = "0.5.1" @@ -970,6 +1161,26 @@ dependencies = [ "unarray", ] +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "quote" version = "1.0.29" @@ -1058,18 +1269,79 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +[[package]] +name = "rend" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "retain_mut" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" + +[[package]] +name = "rkyv" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30f1d45d9aa61cbc8cd1eb87705470892289bb2d01943e7803b873a57404dc3" +dependencies = [ + "bytecheck", + "hashbrown", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff26ed6c7c4dfc2aa9480b86a60e3c7233543a270a680e10758a507c5a4ce476" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + +[[package]] +name = "roaring" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef0fb5e826a8bde011ecae6a8539dd333884335c57ff0f003fbe27c25bbe8f71" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + +[[package]] +name = "rocksdb" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "015439787fce1e75d55f279078d33ff14b4af5d93d995e8838ee4631301c8a99" +dependencies = [ + "libc", + "librocksdb-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.37.20" @@ -1136,6 +1408,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "serde" version = "1.0.168" @@ -1162,11 +1440,29 @@ version = "1.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" dependencies = [ - "itoa 1.0.1", + "itoa", "ryu", "serde", ] +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "size" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" + +[[package]] +name = "slicestring" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" + [[package]] name = "smallvec" version = "1.8.0" @@ -1181,7 +1477,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.11.0" +version = "0.12.0" dependencies = [ "assert_matches", "az", @@ -1191,10 +1487,12 @@ dependencies = [ "chrono", "counter", "criterion", + "csv", "finch", "fixedbitset", "getrandom", "getset", + "histogram", "log", "md5", "memmap2", @@ -1203,6 +1501,7 @@ 
dependencies = [ "niffler", "nohash-hasher", "num-iter", + "numsep", "once_cell", "ouroboros", "piz", @@ -1210,8 +1509,12 @@ dependencies = [ "proptest", "rand", "rayon", + "rkyv", + "roaring", + "rocksdb", "serde", "serde_json", + "size", "tempfile", "thiserror", "twox-hash", @@ -1349,16 +1652,25 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1601,3 +1913,14 @@ checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" dependencies = [ "lzma-sys", ] + +[[package]] +name = "zstd-sys" +version = "2.0.7+zstd.1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Makefile b/Makefile index 4c2ef69abb..990f79c068 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,8 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/hyperloglog.rs \ src/core/src/ffi/minhash.rs \ src/core/src/ffi/signature.rs \ + src/core/src/ffi/manifest.rs \ + src/core/src/ffi/picklist.rs \ src/core/src/ffi/nodegraph.rs \ src/core/src/ffi/index/mod.rs \ src/core/src/ffi/index/revindex.rs \ diff --git a/deny.toml b/deny.toml index 29d148d50b..99f3b442c7 100644 --- a/deny.toml +++ b/deny.toml @@ -29,6 +29,7 @@ default = "deny" confidence-threshold = 0.8 exceptions = [ { allow = ["Zlib"], name = "piz", version = "*" }, + { allow = ["ISC"], name = "libloading", version = "*" }, ] [bans] diff --git a/doc/developer.md b/doc/developer.md index 2368611e7a..b169d557de 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -25,7 +25,7 @@ and the [`conda-forge`](https://conda-forge.org/) channel by default). Once `mamba` is installed, run ``` -mamba create -n sourmash_dev 'tox>=3.27,<4' tox-conda rust git compilers pandoc +mamba create -n sourmash_dev 'tox>=3.27,<4' tox-conda rust git compilers pandoc libstdcxx-ng ``` to create an environment called `sourmash_dev` containing the programs needed for development. 
diff --git a/flake.nix b/flake.nix index 43ad3b8d78..2c94aed20e 100644 --- a/flake.nix +++ b/flake.nix @@ -114,6 +114,7 @@ nixpkgs-fmt ]; + # Needed for matplotlib LD_LIBRARY_PATH = lib.makeLibraryPath [ pkgs.stdenv.cc.cc.lib ]; # workaround for https://github.com/NixOS/nixpkgs/blob/48dfc9fa97d762bce28cc8372a2dd3805d14c633/doc/languages-frameworks/python.section.md#python-setuppy-bdist_wheel-cannot-create-whl diff --git a/include/sourmash.h b/include/sourmash.h index 6fa7854880..4a7c9bd235 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -16,6 +16,12 @@ enum HashFunctions { }; typedef uint32_t HashFunctions; +enum PickStyle { + PICK_STYLE_INCLUDE = 1, + PICK_STYLE_EXCLUDE = 2, +}; +typedef uint32_t PickStyle; + enum SourmashErrorCode { SOURMASH_ERROR_CODE_NO_ERROR = 0, SOURMASH_ERROR_CODE_PANIC = 1, @@ -42,6 +48,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_PARSE_INT = 100003, SOURMASH_ERROR_CODE_SERDE_ERROR = 100004, SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, + SOURMASH_ERROR_CODE_CSV_ERROR = 100006, }; typedef uint32_t SourmashErrorCode; @@ -51,14 +58,26 @@ typedef struct SourmashHyperLogLog SourmashHyperLogLog; typedef struct SourmashKmerMinHash SourmashKmerMinHash; +typedef struct SourmashLinearIndex SourmashLinearIndex; + +typedef struct SourmashManifest SourmashManifest; + +typedef struct SourmashManifestRowIter SourmashManifestRowIter; + typedef struct SourmashNodegraph SourmashNodegraph; +typedef struct SourmashPicklist SourmashPicklist; + typedef struct SourmashRevIndex SourmashRevIndex; typedef struct SourmashSearchResult SourmashSearchResult; +typedef struct SourmashSelection SourmashSelection; + typedef struct SourmashSignature SourmashSignature; +typedef struct SourmashSignatureIter SourmashSignatureIter; + typedef struct SourmashZipStorage SourmashZipStorage; /** @@ -79,6 +98,15 @@ typedef struct { bool owned; } SourmashStr; +typedef struct { + uint32_t ksize; + uint8_t with_abundance; + SourmashStr md5; + SourmashStr internal_location; + SourmashStr name; + SourmashStr moltype; +} SourmashManifestRow; + bool computeparams_dayhoff(const SourmashComputeParameters *ptr); bool computeparams_dna(const SourmashComputeParameters *ptr); @@ -263,8 +291,38 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr, void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); +SourmashKmerMinHash *kmerminhash_to_frozen(const SourmashKmerMinHash *ptr); + +SourmashKmerMinHash *kmerminhash_to_mutable(const SourmashKmerMinHash *ptr); + bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); +void linearindex_free(SourmashLinearIndex *ptr); + +uint64_t linearindex_len(const SourmashLinearIndex *ptr); + +SourmashStr linearindex_location(const SourmashLinearIndex *ptr); + +const SourmashManifest *linearindex_manifest(const SourmashLinearIndex *ptr); + +SourmashLinearIndex *linearindex_new(SourmashZipStorage *storage_ptr, + SourmashManifest *manifest_ptr, + SourmashSelection *selection_ptr, + bool use_manifest); + +SourmashLinearIndex *linearindex_select(SourmashLinearIndex *ptr, + const SourmashSelection *selection_ptr); + +void linearindex_set_manifest(SourmashLinearIndex *ptr, SourmashManifest *manifest_ptr); + +SourmashSignatureIter *linearindex_signatures(const SourmashLinearIndex *ptr); + +const SourmashZipStorage *linearindex_storage(const SourmashLinearIndex *ptr); + +SourmashManifestRowIter *manifest_rows(const SourmashManifest *ptr); + +const SourmashManifestRow *manifest_rows_iter_next(SourmashManifestRowIter *ptr); + void 
nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); @@ -309,6 +367,18 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); +void picklist_free(SourmashPicklist *ptr); + +SourmashPicklist *picklist_new(void); + +void picklist_set_coltype(SourmashPicklist *ptr, const char *coltype_ptr, uintptr_t insize); + +void picklist_set_column_name(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickfile(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickstyle(SourmashPicklist *ptr, PickStyle pickstyle); + void revindex_free(SourmashRevIndex *ptr); const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr, @@ -354,6 +424,36 @@ double searchresult_score(const SourmashSearchResult *ptr); SourmashSignature *searchresult_signature(const SourmashSearchResult *ptr); +bool selection_abund(const SourmashSelection *ptr); + +bool selection_containment(const SourmashSelection *ptr); + +uint32_t selection_ksize(const SourmashSelection *ptr); + +HashFunctions selection_moltype(const SourmashSelection *ptr); + +SourmashSelection *selection_new(void); + +uint32_t selection_num(const SourmashSelection *ptr); + +const SourmashPicklist *selection_picklist(const SourmashSelection *ptr); + +uint32_t selection_scaled(const SourmashSelection *ptr); + +void selection_set_abund(SourmashSelection *ptr, bool new_abund); + +void selection_set_containment(SourmashSelection *ptr, bool new_containment); + +void selection_set_ksize(SourmashSelection *ptr, uint32_t new_ksize); + +void selection_set_moltype(SourmashSelection *ptr, HashFunctions new_moltype); + +void selection_set_num(SourmashSelection *ptr, uint32_t new_num); + +void selection_set_picklist(SourmashSelection *ptr, SourmashPicklist *new_picklist); + +void selection_set_scaled(SourmashSelection *ptr, uint32_t new_scaled); + void signature_add_protein(SourmashSignature *ptr, const char *sequence); void signature_add_sequence(SourmashSignature *ptr, const char *sequence, bool force); @@ -370,16 +470,12 @@ SourmashStr signature_get_filename(const SourmashSignature *ptr); SourmashStr signature_get_license(const SourmashSignature *ptr); -SourmashKmerMinHash **signature_get_mhs(const SourmashSignature *ptr, uintptr_t *size); - SourmashStr signature_get_name(const SourmashSignature *ptr); uintptr_t signature_len(const SourmashSignature *ptr); SourmashSignature *signature_new(void); -void signature_push_mh(SourmashSignature *ptr, const SourmashKmerMinHash *other); - SourmashStr signature_save_json(const SourmashSignature *ptr); void signature_set_filename(SourmashSignature *ptr, const char *name); @@ -388,6 +484,8 @@ void signature_set_mh(SourmashSignature *ptr, const SourmashKmerMinHash *other); void signature_set_name(SourmashSignature *ptr, const char *name); +const SourmashSignature *signatures_iter_next(SourmashSignatureIter *ptr); + SourmashSignature **signatures_load_buffer(const char *ptr, uintptr_t insize, bool _ignore_md5sum, diff --git a/pyproject.toml b/pyproject.toml index ff3b831c20..bb3b0d1754 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,7 +164,7 @@ known_first_party = ["sourmash"] [tool.cibuildwheel] build = "cp39-*" -skip = "*-win32 *-manylinux_i686 *-musllinux_ppc64le *-musllinux_s390x" +skip = "*-win32 *-manylinux_i686 *-musllinux_*" before-all = [ "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable", "cargo update 
--dry-run", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 03bd19ce48..23f8163a5a 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.11.0" +version = "0.12.0" authors = ["Luiz Irber "] description = "MinHash sketches for genomic data" repository = "https://github.com/sourmash-bio/sourmash" @@ -22,6 +22,8 @@ bench = false from-finch = ["finch"] parallel = ["rayon"] maturin = [] +mastiff = ["rocksdb", "rkyv", "parallel"] +default = ["parallel", "mastiff"] [dependencies] az = "1.0.0" @@ -29,6 +31,7 @@ bytecount = "0.6.0" byteorder = "1.4.3" cfg-if = "1.0" counter = "0.5.7" +csv = "1.1.6" finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } @@ -47,10 +50,16 @@ primal-check = "0.3.1" thiserror = "1.0" typed-builder = "0.14.0" twox-hash = "1.6.0" -vec-collections = "0.3.4" +vec-collections = "0.4.3" piz = "0.5.0" memmap2 = "0.7.1" ouroboros = "0.17.2" +rkyv = { version = "0.7.39", optional = true } +rocksdb = { version = "0.20.0", optional = true } +roaring = "0.10.0" +histogram = "0.6.9" +numsep = "0.1.12" +size = "0.4.0" [dev-dependencies] assert_matches = "1.3.0" @@ -60,10 +69,6 @@ proptest = { version = "1.2.0", default-features = false, features = ["std"]} rand = "0.8.2" tempfile = "3.7.1" -[[bench]] -name = "index" -harness = false - [[bench]] name = "compute" harness = false diff --git a/src/core/benches/index.rs b/src/core/benches/index.rs deleted file mode 100644 index d3d4b54118..0000000000 --- a/src/core/benches/index.rs +++ /dev/null @@ -1,83 +0,0 @@ -#[macro_use] -extern crate criterion; - -use std::path::PathBuf; - -use criterion::{Bencher, Criterion, Fun}; -use sourmash::index::bigsi::BIGSI; -use sourmash::index::linear::LinearIndex; -use sourmash::index::Index; -use sourmash::index::MHBT; -use sourmash::signature::Signature; - -fn find_small_bench(c: &mut Criterion) { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_small", functions, leaf); -} - -fn find_subset_bench(c: &mut Criterion) { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/subset.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find 
= Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_subset", functions, leaf); -} - -criterion_group!(benches, find_small_bench, find_subset_bench); -criterion_main!(benches); diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 6010cf2f6d..443db90b50 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -7,6 +7,7 @@ use std::str; use nohash_hasher::BuildNoHashHasher; use once_cell::sync::Lazy; +use vec_collections::AbstractVecSet; use crate::Error; @@ -23,6 +24,10 @@ type ColorToIdx = HashMap>; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] #[repr(u32)] pub enum HashFunctions { murmur64_DNA = 1, diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index cd4ddcfaf1..f6c3ce311e 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -63,6 +63,9 @@ pub enum SourmashError { #[error(transparent)] IOError(#[from] std::io::Error), + #[error(transparent)] + CsvError(#[from] csv::Error), + #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] #[error(transparent)] Panic(#[from] crate::ffi::utils::Panic), @@ -108,6 +111,7 @@ pub enum SourmashErrorCode { ParseInt = 100_003, SerdeError = 100_004, NifflerError = 100_005, + CsvError = 100_006, } #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] @@ -137,6 +141,7 @@ impl SourmashErrorCode { SourmashError::IOError { .. } => SourmashErrorCode::Io, SourmashError::NifflerError { .. } => SourmashErrorCode::NifflerError, SourmashError::Utf8Error { .. } => SourmashErrorCode::Utf8Error, + SourmashError::CsvError { .. } => SourmashErrorCode::CsvError, } } } diff --git a/src/core/src/ffi/hyperloglog.rs b/src/core/src/ffi/hyperloglog.rs index d9e828ab48..a5412fb6c8 100644 --- a/src/core/src/ffi/hyperloglog.rs +++ b/src/core/src/ffi/hyperloglog.rs @@ -6,7 +6,7 @@ use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::hyperloglog::HyperLogLog; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::ForeignObject; pub struct SourmashHyperLogLog; @@ -108,14 +108,14 @@ unsafe fn hll_merge( } ffi_fn! { -unsafe fn hll_update_mh( - ptr: *mut SourmashHyperLogLog, - optr: *const SourmashKmerMinHash, -) { +unsafe fn hll_update_mh(ptr: *mut SourmashHyperLogLog, optr: *const SourmashKmerMinHash) { let hll = SourmashHyperLogLog::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(optr); - mh.update(hll)? 
+ match mh { + MinHash::Mutable(mh) => mh.update(hll)?, + MinHash::Frozen(mh) => mh.update(hll)?, + } } } diff --git a/src/core/src/ffi/index/mod.rs b/src/core/src/ffi/index/mod.rs index 932a97b222..644fcbaa86 100644 --- a/src/core/src/ffi/index/mod.rs +++ b/src/core/src/ffi/index/mod.rs @@ -1,7 +1,12 @@ +#[cfg(feature = "mastiff")] pub mod revindex; +use crate::encodings::HashFunctions; +use crate::index::{Selection, SigStore}; + use crate::signature::Signature; +use crate::ffi::picklist::SourmashPicklist; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; @@ -35,3 +40,164 @@ pub unsafe extern "C" fn searchresult_signature( let result = SourmashSearchResult::as_rust(ptr); SourmashSignature::from_rust((result.1).clone()) } + +//================================================================ + +pub struct SourmashSelection; + +impl ForeignObject for SourmashSelection { + type RustObject = Selection; +} + +#[no_mangle] +pub unsafe extern "C" fn selection_new() -> *mut SourmashSelection { + SourmashSelection::from_rust(Selection::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn selection_ksize(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(ksize) = sel.ksize() { + ksize + } else { + todo!("empty ksize case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_ksize(ptr: *mut SourmashSelection, new_ksize: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_ksize(new_ksize); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_num(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(num) = sel.num() { + num + } else { + todo!("empty num case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_num(ptr: *mut SourmashSelection, new_num: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_num(new_num); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_scaled(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(scaled) = sel.scaled() { + scaled + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_scaled(ptr: *mut SourmashSelection, new_scaled: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_scaled(new_scaled); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_containment(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(containment) = sel.containment() { + containment + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_containment( + ptr: *mut SourmashSelection, + new_containment: bool, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_containment(new_containment); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_abund(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(abund) = sel.abund() { + abund + } else { + todo!("empty abund case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_abund(ptr: *mut SourmashSelection, new_abund: bool) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_abund(new_abund); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_moltype(ptr: *const SourmashSelection) -> HashFunctions { + let sel = SourmashSelection::as_rust(ptr); + if let Some(hash_function) = 
sel.moltype() { + hash_function + } else { + todo!("empty hash_function case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_moltype( + ptr: *mut SourmashSelection, + new_moltype: HashFunctions, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_moltype(new_moltype); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_picklist( + ptr: *const SourmashSelection, +) -> *const SourmashPicklist { + let sel = SourmashSelection::as_rust(ptr); + if let Some(picklist) = sel.picklist() { + SourmashPicklist::from_rust(picklist) + } else { + todo!("empty picklist case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_picklist( + ptr: *mut SourmashSelection, + new_picklist: *mut SourmashPicklist, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + let pick = SourmashPicklist::into_rust(new_picklist); + sel.set_picklist(*pick); +} + +//================================================================ +// +pub struct SignatureIterator { + iter: Box>, +} + +pub struct SourmashSignatureIter; + +impl ForeignObject for SourmashSignatureIter { + type RustObject = SignatureIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn signatures_iter_next( + ptr: *mut SourmashSignatureIter, +) -> *const SourmashSignature { + let iterator = SourmashSignatureIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(sig) => SourmashSignature::from_rust(sig.into()), + None => std::ptr::null(), + } +} diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index 3597121bce..abf0bc6bad 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -1,15 +1,22 @@ use std::path::PathBuf; use std::slice; +use std::sync::Arc; -use crate::index::revindex::RevIndex; +use crate::index::revindex::mem_revindex::{LinearRevIndex, RevIndex}; use crate::index::Index; +use crate::manifest::Manifest; use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use crate::sketch::Sketch; +use crate::storage::Storage; -use crate::ffi::index::SourmashSearchResult; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::index::{ + SignatureIterator, SourmashSearchResult, SourmashSelection, SourmashSignatureIter, +}; +use crate::ffi::manifest::SourmashManifest; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::signature::SourmashSignature; +use crate::ffi::storage::SourmashZipStorage; use crate::ffi::utils::{ForeignObject, SourmashStr}; pub struct SourmashRevIndex; @@ -42,8 +49,7 @@ unsafe fn revindex_new_with_paths( let template = { assert!(!template_ptr.is_null()); - //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + SourmashKmerMinHash::as_rust(template_ptr).clone().into() }; let queries_vec: Vec; @@ -52,9 +58,11 @@ unsafe fn revindex_new_with_paths( } else { queries_vec = slice::from_raw_parts(queries_ptr, inqueries) .iter() - .map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()) + .map(|mh_ptr| match SourmashKmerMinHash::as_rust(*mh_ptr) { + // TODO: avoid clone + MinHash::Mutable(mh) => mh.clone().into(), + MinHash::Frozen(mh) => mh.clone(), + }) .collect(); Some(queries_vec.as_ref()) }; @@ -90,7 +98,7 @@ unsafe fn revindex_new_with_sigs( let template = { assert!(!template_ptr.is_null()); //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + 
SourmashKmerMinHash::as_rust(template_ptr).clone().into() }; let queries_vec: Vec; @@ -99,9 +107,13 @@ unsafe fn revindex_new_with_sigs( } else { queries_vec = slice::from_raw_parts(queries_ptr, inqueries) .iter() - .map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()) + .map(|mh_ptr| { + // TODO: avoid this clone + match SourmashKmerMinHash::as_rust(*mh_ptr) { + MinHash::Mutable(mh) => mh.clone().into(), + MinHash::Frozen(mh) => mh.clone(), + } + }) .collect(); Some(queries_vec.as_ref()) }; @@ -248,3 +260,141 @@ unsafe fn revindex_signatures( Ok(Box::into_raw(b) as *mut *mut SourmashSignature) } } + +//-------------------------------------------------- + +pub struct SourmashLinearIndex; + +impl ForeignObject for SourmashLinearIndex { + type RustObject = LinearRevIndex; +} + +ffi_fn! { +unsafe fn linearindex_new( + storage_ptr: *mut SourmashZipStorage, + manifest_ptr: *mut SourmashManifest, + selection_ptr: *mut SourmashSelection, + use_manifest: bool, +) -> Result<*mut SourmashLinearIndex> { + let storage = Arc::try_unwrap(*SourmashZipStorage::into_rust(storage_ptr)).ok().unwrap(); + + let manifest = if manifest_ptr.is_null() { + if use_manifest { + // Load manifest from zipstorage + Some(Manifest::from_reader(storage.load("SOURMASH-MANIFEST.csv")?.as_slice())?) + } else { + None + } + } else { + Some(*SourmashManifest::into_rust(manifest_ptr)) + }; + + let _selection = if !selection_ptr.is_null() { + Some(SourmashSelection::into_rust(selection_ptr)) + } else { + None + }; + // TODO: how to extract a template? Probably from selection? + let max_hash = max_hash_for_scaled(100); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(57) + .hash_function(crate::encodings::HashFunctions::murmur64_protein) + .max_hash(max_hash) + .build(), + ); + + /* + def __init__(self, storage, *, selection_dict=None, + traverse_yield_all=False, manifest=None, use_manifest=True): + sig_files: Manifest, + template: &Sketch, + keep_sigs: bool, + ref_sigs: Option>, + storage: Option, + */ + + let linear_index = LinearRevIndex::new(manifest, &template, false, None, Some(storage)); + + Ok(SourmashLinearIndex::from_rust(linear_index)) +} +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_free(ptr: *mut SourmashLinearIndex) { + SourmashLinearIndex::drop(ptr); +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_manifest( + ptr: *const SourmashLinearIndex, +) -> *const SourmashManifest { + let index = SourmashLinearIndex::as_rust(ptr); + SourmashManifest::from_rust(index.manifest()) +} + +ffi_fn! 
{ +unsafe fn linearindex_set_manifest( + ptr: *mut SourmashLinearIndex, + manifest_ptr: *mut SourmashManifest, +) -> Result<()> { + let index = SourmashLinearIndex::as_rust_mut(ptr); + let manifest = SourmashManifest::into_rust(manifest_ptr); + + index.set_manifest(*manifest)?; + Ok(()) +} +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_len(ptr: *const SourmashLinearIndex) -> u64 { + let index = SourmashLinearIndex::as_rust(ptr); + index.len() as u64 +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_location(ptr: *const SourmashLinearIndex) -> SourmashStr { + let index = SourmashLinearIndex::as_rust(ptr); + match index.location() { + Some(x) => x, + None => "".into(), + } + .into() +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_storage( + ptr: *const SourmashLinearIndex, +) -> *const SourmashZipStorage { + let index = SourmashLinearIndex::as_rust(ptr); + let storage = index.storage(); + + match storage { + Some(st) => SourmashZipStorage::from_rust(st), + None => std::ptr::null::(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_signatures( + ptr: *const SourmashLinearIndex, +) -> *mut SourmashSignatureIter { + let index = SourmashLinearIndex::as_rust(ptr); + + let iter = Box::new(index.signatures_iter()); + SourmashSignatureIter::from_rust(SignatureIterator { iter }) +} + +ffi_fn! { +unsafe fn linearindex_select( + ptr: *mut SourmashLinearIndex, + selection_ptr: *const SourmashSelection, +) -> Result<*mut SourmashLinearIndex> { + let index = SourmashLinearIndex::into_rust(ptr); + let selection = SourmashSelection::as_rust(selection_ptr); + + let new_index = index.select(selection)?; + Ok(SourmashLinearIndex::from_rust(new_index)) +} +} diff --git a/src/core/src/ffi/manifest.rs b/src/core/src/ffi/manifest.rs new file mode 100644 index 0000000000..815f8d83f1 --- /dev/null +++ b/src/core/src/ffi/manifest.rs @@ -0,0 +1,73 @@ +use crate::manifest::{Manifest, Record}; + +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashManifest; + +impl ForeignObject for SourmashManifest { + type RustObject = Manifest; +} + +pub struct ManifestRowIterator { + iter: Box>, +} + +pub struct SourmashManifestRowIter; + +impl ForeignObject for SourmashManifestRowIter { + type RustObject = ManifestRowIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows_iter_next( + ptr: *mut SourmashManifestRowIter, +) -> *const SourmashManifestRow { + let iterator = SourmashManifestRowIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(row) => SourmashManifestRow::from_rust(row.into()), + None => std::ptr::null(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows( + ptr: *const SourmashManifest, +) -> *mut SourmashManifestRowIter { + let manifest = SourmashManifest::as_rust(ptr); + + let iter = Box::new(manifest.iter()); + SourmashManifestRowIter::from_rust(ManifestRowIterator { iter }) +} + +#[repr(C)] +pub struct SourmashManifestRow { + pub ksize: u32, + pub with_abundance: u8, + pub md5: SourmashStr, + pub internal_location: SourmashStr, + pub name: SourmashStr, + pub moltype: SourmashStr, +} + +impl ForeignObject for SourmashManifestRow { + type RustObject = SourmashManifestRow; +} + +impl From<&Record> for SourmashManifestRow { + fn from(record: &Record) -> SourmashManifestRow { + Self { + ksize: record.ksize(), + with_abundance: record.with_abundance() as u8, + md5: record.md5().into(), + name: record.name().into(), + moltype: record.moltype().to_string().into(), + internal_location: record + .internal_location() + 
.to_str() + .unwrap() + .to_owned() + .into(), + } + } +} diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 45890b81d9..3509c705ab 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -6,12 +6,33 @@ use crate::encodings::{aa_to_dayhoff, aa_to_hp, translate_codon, HashFunctions}; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::signature::SeqToHashes; use crate::signature::SigsTrait; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::hyperloglog::HyperLogLog; +use crate::sketch::minhash::{ + AbundMinHashOps, FracMinHashOps, KmerMinHash, KmerMinHashBTree, MinHashOps, +}; +use crate::sketch::Sketch; +use crate::Error; +use crate::HashIntoType; + +#[derive(Clone)] +pub enum MinHash { + Mutable(KmerMinHashBTree), + Frozen(KmerMinHash), +} pub struct SourmashKmerMinHash; +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_to_mutable( + ptr: *const SourmashKmerMinHash, +) -> *mut SourmashKmerMinHash { + let mh = SourmashKmerMinHash::as_rust(ptr); + + SourmashKmerMinHash::from_rust(mh.clone().to_mutable()) +} + impl ForeignObject for SourmashKmerMinHash { - type RustObject = KmerMinHash; + type RustObject = MinHash; } #[no_mangle] @@ -23,9 +44,9 @@ pub unsafe extern "C" fn kmerminhash_new( track_abundance: bool, n: u32, ) -> *mut SourmashKmerMinHash { - let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n); + let mh = KmerMinHashBTree::new(scaled, k, hash_function, seed, track_abundance, n); - SourmashKmerMinHash::from_rust(mh) + SourmashKmerMinHash::from_rust(MinHash::Mutable(mh)) } #[no_mangle] @@ -33,6 +54,15 @@ pub unsafe extern "C" fn kmerminhash_free(ptr: *mut SourmashKmerMinHash) { SourmashKmerMinHash::drop(ptr); } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_to_frozen( + ptr: *const SourmashKmerMinHash, +) -> *mut SourmashKmerMinHash { + let mh = SourmashKmerMinHash::as_rust(ptr); + + SourmashKmerMinHash::from_rust(mh.clone().to_frozen()) +} + #[no_mangle] pub unsafe extern "C" fn kmerminhash_slice_free(ptr: *mut u64, insize: usize) { // FIXME @@ -471,6 +501,7 @@ unsafe fn kmerminhash_similarity(ptr: *const SourmashKmerMinHash, other: *const mh.similarity(other_mh, ignore_abundance, downsample) } } + ffi_fn! 
{ unsafe fn kmerminhash_angular_similarity(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash) -> Result { @@ -479,3 +510,372 @@ unsafe fn kmerminhash_angular_similarity(ptr: *const SourmashKmerMinHash, other: mh.angular_similarity(other_mh) } } + +impl MinHash { + pub fn to_mutable(self) -> MinHash { + match self { + MinHash::Mutable(mh) => MinHash::Mutable(mh), + MinHash::Frozen(mh) => MinHash::Mutable(mh.into()), + } + } + + pub fn to_frozen(self) -> MinHash { + match self { + MinHash::Mutable(mh) => MinHash::Frozen(mh.into()), + MinHash::Frozen(mh) => MinHash::Frozen(mh), + } + } + + pub fn num(&self) -> u32 { + match self { + MinHash::Mutable(mh) => mh.num(), + MinHash::Frozen(mh) => mh.num(), + } + } + + pub fn count_common(&self, other: &MinHash, downsample: bool) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.count_common(ot, downsample), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).count_common(ot, downsample) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.count_common(ot, downsample), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).count_common(ot, downsample) + } + }, + } + } + + pub fn merge(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.merge(ot), + MinHash::Frozen(ref ot) => mh.merge(&Into::::into(ot.clone())), + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.merge(ot), + MinHash::Mutable(ref ot) => mh.merge(&Into::::into(ot.clone())), + }, + } + } + + pub fn similarity( + &self, + other: &MinHash, + ignore_abundance: bool, + downsample: bool, + ) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.similarity(ot, ignore_abundance, downsample), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).similarity( + ot, + ignore_abundance, + downsample, + ), + }, + + MinHash::Frozen(ref mh) => { + match other { + MinHash::Frozen(ref ot) => mh.similarity(ot, ignore_abundance, downsample), + MinHash::Mutable(ref ot) => Into::::into(mh.clone()) + .similarity(ot, ignore_abundance, downsample), + } + } + } + } + + pub fn jaccard(&self, other: &MinHash) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.jaccard(ot), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).jaccard(ot), + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.jaccard(ot), + MinHash::Mutable(ref ot) => Into::::into(mh.clone()).jaccard(ot), + }, + } + } + + pub fn intersection_size(&self, other: &MinHash) -> Result<(u64, u64), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.intersection_size(ot), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).intersection_size(ot) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.intersection_size(ot), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).intersection_size(ot) + } + }, + } + } + + pub fn intersection(&self, other: &MinHash) -> Result<(Vec, u64), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.intersection(ot), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).intersection(ot), + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.intersection(ot), + MinHash::Mutable(ref ot) 
=> { + Into::::into(mh.clone()).intersection(ot) + } + }, + } + } + + pub fn add_from(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.add_from(ot), + MinHash::Frozen(ref ot) => mh.add_from(&Into::::into(ot.clone())), + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.add_from(ot), + MinHash::Mutable(ref ot) => mh.add_from(&Into::::into(ot.clone())), + }, + } + } + + pub fn remove_from(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.remove_from(ot), + MinHash::Frozen(ref ot) => { + mh.remove_from(&Into::::into(ot.clone())) + } + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.remove_from(ot), + MinHash::Mutable(ref ot) => mh.remove_from(&Into::::into(ot.clone())), + }, + } + } +} + +impl From for Sketch { + fn from(mh: MinHash) -> Sketch { + match mh { + MinHash::Mutable(mh) => Sketch::LargeMinHash(mh), + MinHash::Frozen(mh) => Sketch::MinHash(mh), + } + } +} + +impl FracMinHashOps for MinHash { + fn max_hash(&self) -> HashIntoType { + match *self { + MinHash::Mutable(ref mh) => mh.max_hash(), + MinHash::Frozen(ref mh) => mh.max_hash(), + } + } + + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { + match *self { + MinHash::Mutable(ref mh) => Ok(MinHash::Mutable(mh.downsample_max_hash(max_hash)?)), + MinHash::Frozen(ref mh) => Ok(MinHash::Frozen(mh.downsample_max_hash(max_hash)?)), + } + } +} + +impl AbundMinHashOps for MinHash { + fn track_abundance(&self) -> bool { + match *self { + MinHash::Mutable(ref mh) => mh.track_abundance(), + MinHash::Frozen(ref mh) => mh.track_abundance(), + } + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.enable_abundance(), + MinHash::Frozen(ref mut mh) => mh.enable_abundance(), + } + } + + fn disable_abundance(&mut self) { + match *self { + MinHash::Mutable(ref mut mh) => mh.disable_abundance(), + MinHash::Frozen(ref mut mh) => mh.disable_abundance(), + } + } + + fn add_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64) { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_hash_with_abundance(hash, abundance), + MinHash::Frozen(ref mut mh) => mh.add_hash_with_abundance(hash, abundance), + } + } + + fn set_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64) { + match *self { + MinHash::Mutable(ref mut mh) => mh.set_hash_with_abundance(hash, abundance), + MinHash::Frozen(ref mut mh) => mh.set_hash_with_abundance(hash, abundance), + } + } + + fn abunds(&self) -> Option> { + match *self { + MinHash::Mutable(ref mh) => mh.abunds(), + MinHash::Frozen(ref mh) => mh.abunds(), + } + } + + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)> { + match *self { + MinHash::Mutable(ref mh) => mh.to_vec_abunds(), + MinHash::Frozen(ref mh) => mh.to_vec_abunds(), + } + } +} + +impl MinHashOps for MinHash { + fn clear(&mut self) { + match *self { + MinHash::Mutable(ref mut mh) => mh.clear(), + MinHash::Frozen(ref mut mh) => mh.clear(), + } + } + + fn is_empty(&self) -> bool { + match *self { + MinHash::Mutable(ref mh) => mh.is_empty(), + MinHash::Frozen(ref mh) => mh.is_empty(), + } + } + + fn reset_md5sum(&self) { + match *self { + MinHash::Mutable(ref mh) => mh.reset_md5sum(), + MinHash::Frozen(ref mh) => mh.reset_md5sum(), + } + } + + fn md5sum(&self) -> String { + match *self { + 
MinHash::Mutable(ref mh) => mh.md5sum(), + MinHash::Frozen(ref mh) => mh.md5sum(), + } + } + + fn mins(&self) -> Vec { + match *self { + MinHash::Mutable(ref mh) => mh.mins(), + MinHash::Frozen(ref mh) => mh.mins(), + } + } + + fn remove_hash(&mut self, hash: HashIntoType) { + match *self { + MinHash::Mutable(ref mut mh) => mh.remove_hash(hash), + MinHash::Frozen(ref mut mh) => mh.remove_hash(hash), + } + } + + fn as_hll(&self) -> HyperLogLog { + match *self { + MinHash::Mutable(ref mh) => mh.as_hll(), + MinHash::Frozen(ref mh) => mh.as_hll(), + } + } +} + +impl SigsTrait for MinHash { + fn size(&self) -> usize { + match *self { + MinHash::Mutable(ref mh) => mh.size(), + MinHash::Frozen(ref mh) => mh.size(), + } + } + + fn to_vec(&self) -> Vec { + match *self { + MinHash::Mutable(ref mh) => mh.to_vec(), + MinHash::Frozen(ref mh) => mh.to_vec(), + } + } + + fn ksize(&self) -> usize { + match *self { + MinHash::Mutable(ref mh) => mh.ksize(), + MinHash::Frozen(ref mh) => mh.ksize(), + } + } + + fn seed(&self) -> u64 { + match *self { + MinHash::Mutable(ref mh) => mh.seed(), + MinHash::Frozen(ref mh) => mh.seed(), + } + } + + fn hash_function(&self) -> HashFunctions { + match *self { + MinHash::Mutable(ref mh) => mh.hash_function(), + MinHash::Frozen(ref mh) => mh.hash_function(), + } + } + + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.set_hash_function(h), + MinHash::Frozen(ref mut mh) => mh.set_hash_function(h), + } + } + + fn add_hash(&mut self, hash: HashIntoType) { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_hash(hash), + MinHash::Frozen(ref mut mh) => mh.add_hash(hash), + } + } + + fn check_compatible(&self, other: &Self) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.check_compatible(ot), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).check_compatible(ot) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.check_compatible(ot), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).check_compatible(ot) + } + }, + } + } + + fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_sequence(seq, force), + MinHash::Frozen(ref mut mh) => mh.add_sequence(seq, force), + } + } + + fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_protein(seq), + MinHash::Frozen(ref mut mh) => mh.add_protein(seq), + } + } +} diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index a67de37176..44e856001f 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -9,8 +9,10 @@ pub mod utils; pub mod cmd; pub mod hyperloglog; pub mod index; +pub mod manifest; pub mod minhash; pub mod nodegraph; +pub mod picklist; pub mod signature; pub mod storage; diff --git a/src/core/src/ffi/nodegraph.rs b/src/core/src/ffi/nodegraph.rs index 2e0753b94d..46842d6513 100644 --- a/src/core/src/ffi/nodegraph.rs +++ b/src/core/src/ffi/nodegraph.rs @@ -5,7 +5,7 @@ use std::slice; use crate::prelude::*; use crate::sketch::nodegraph::Nodegraph; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::ForeignObject; pub struct SourmashNodegraph; @@ -134,7 +134,11 @@ pub unsafe extern "C" fn nodegraph_matches( ) -> usize { let ng = SourmashNodegraph::as_rust(ptr); let mh = 
SourmashKmerMinHash::as_rust(mh_ptr); - ng.matches(mh) + + match mh { + MinHash::Mutable(mh) => ng.matches(&mh.clone().into()), + MinHash::Frozen(mh) => ng.matches(mh), + } } #[no_mangle] @@ -157,7 +161,10 @@ pub unsafe extern "C" fn nodegraph_update_mh( let ng = SourmashNodegraph::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(optr); - mh.update(ng).unwrap(); + match mh { + MinHash::Mutable(mh) => mh.update(ng).unwrap(), + MinHash::Frozen(mh) => mh.update(ng).unwrap(), + } } ffi_fn! { diff --git a/src/core/src/ffi/picklist.rs b/src/core/src/ffi/picklist.rs new file mode 100644 index 0000000000..c7bea755ae --- /dev/null +++ b/src/core/src/ffi/picklist.rs @@ -0,0 +1,89 @@ +use std::os::raw::c_char; +use std::slice; + +use crate::picklist::{PickStyle, Picklist}; + +use crate::ffi::utils::ForeignObject; + +pub struct SourmashPicklist; + +impl ForeignObject for SourmashPicklist { + type RustObject = Picklist; +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_new() -> *mut SourmashPicklist { + SourmashPicklist::from_rust(Picklist::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_free(ptr: *mut SourmashPicklist) { + SourmashPicklist::drop(ptr); +} + +ffi_fn! { +unsafe fn picklist_set_coltype( + ptr: *mut SourmashPicklist, + coltype_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let coltype = { + assert!(!coltype_ptr.is_null()); + let coltype = slice::from_raw_parts(coltype_ptr as *mut u8, insize); + std::str::from_utf8(coltype)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_coltype(coltype.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickfile( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_pickfile(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_column_name( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_column_name(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickstyle( + ptr: *mut SourmashPicklist, + pickstyle: PickStyle, +) -> Result<()> { + let pl = SourmashPicklist::as_rust_mut(ptr); + + pl.set_pickstyle(pickstyle); + + Ok(()) +} +} diff --git a/src/core/src/ffi/signature.rs b/src/core/src/ffi/signature.rs index 825e091f4d..82e859f74f 100644 --- a/src/core/src/ffi/signature.rs +++ b/src/core/src/ffi/signature.rs @@ -4,14 +4,12 @@ use std::io; use std::os::raw::c_char; use std::slice; -use crate::errors::SourmashError; - use crate::encodings::HashFunctions; use crate::signature::Signature; use crate::sketch::Sketch; use crate::ffi::cmd::compute::SourmashComputeParameters; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::{ForeignObject, SourmashStr}; pub struct SourmashSignature; @@ -117,23 +115,37 @@ unsafe fn signature_set_filename(ptr: *mut SourmashSignature, name: *const c_cha } ffi_fn! 
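// ffi_fn! (defined in ffi::utils) presumably wraps each Result-returning
// unsafe fn in an extern "C" shim that stashes any error for later retrieval
// by the caller instead of unwinding across the FFI boundary; see the macro
// definition for the actual behavior.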
{ -unsafe fn signature_push_mh(ptr: *mut SourmashSignature, other: *const SourmashKmerMinHash) -> - Result<()> { +unsafe fn signature_set_mh( + ptr: *mut SourmashSignature, + other: *const SourmashKmerMinHash, +) -> Result<()> { let sig = SourmashSignature::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(other); - sig.push(Sketch::MinHash(mh.clone())); + sig.reset_sketches(); + // TODO(lirber): avoid clone here + sig.push(mh.clone().into()); Ok(()) } } ffi_fn! { -unsafe fn signature_set_mh(ptr: *mut SourmashSignature, other: *const SourmashKmerMinHash) -> - Result<()> { - let sig = SourmashSignature::as_rust_mut(ptr); - let mh = SourmashKmerMinHash::as_rust(other); - sig.reset_sketches(); - sig.push(Sketch::MinHash(mh.clone())); - Ok(()) +unsafe fn signature_first_mh(ptr: *const SourmashSignature) -> Result<*mut SourmashKmerMinHash> { + let sig = SourmashSignature::as_rust(ptr); + + match sig.signatures.get(0) { + Some(Sketch::LargeMinHash(mh)) => { + // TODO(lirber): avoid clone here + Ok(SourmashKmerMinHash::from_rust(MinHash::Mutable(mh.clone()))) + } + Some(Sketch::MinHash(mh)) => { + // TODO(lirber): avoid clone here + Ok(SourmashKmerMinHash::from_rust(MinHash::Frozen(mh.clone()))) + } + _ => { + // TODO: signatures is empty? + unimplemented!() + } + } } } @@ -163,24 +175,6 @@ unsafe fn signature_get_license(ptr: *const SourmashSignature) -> Result Result<*mut SourmashKmerMinHash> { - let sig = SourmashSignature::as_rust(ptr); - - match sig.signatures.get(0) { - Some(Sketch::MinHash(mh)) => { - Ok(SourmashKmerMinHash::from_rust(mh.clone())) - }, - Some(Sketch::LargeMinHash(mh_btree)) => { - Ok(SourmashKmerMinHash::from_rust(mh_btree.into())) - }, - _ => Err(SourmashError::Internal { - message: "found unsupported sketch type".to_string() - }), - } -} -} - ffi_fn! { unsafe fn signature_eq(ptr: *const SourmashSignature, other: *const SourmashSignature) -> Result { let sig = SourmashSignature::as_rust(ptr); @@ -199,25 +193,12 @@ unsafe fn signature_save_json(ptr: *const SourmashSignature) -> Result Result<*mut *mut SourmashKmerMinHash> { - let sig = SourmashSignature::as_rust(ptr); - - let output = sig.sketches(); - - // FIXME: how to fit this into the ForeignObject trait? - let ptr_sigs: Vec<*mut Signature> = output.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *mut Signature - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *mut *mut SourmashKmerMinHash) -} -} - -ffi_fn! 
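// Note the variant mapping in signature_first_mh above: Sketch::LargeMinHash
// (BTree-backed) comes back as MinHash::Mutable, while Sketch::MinHash comes
// back as MinHash::Frozen; both clone the sketch for now (see the TODOs),
// and an empty signature list is still unimplemented!().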
{ -unsafe fn signatures_save_buffer(ptr: *const *const SourmashSignature, size: usize, compression: u8, osize: *mut usize) -> Result<*const u8> { +unsafe fn signatures_save_buffer( + ptr: *const *const SourmashSignature, + size: usize, + compression: u8, + osize: *mut usize, +) -> Result<*const u8> { // FIXME: review this for ForeignObject let sigs = { @@ -225,30 +206,35 @@ unsafe fn signatures_save_buffer(ptr: *const *const SourmashSignature, size: usi slice::from_raw_parts(ptr, size) }; - let rsigs: Vec<&Signature> = sigs.iter().map(|x| SourmashSignature::as_rust(*x)).collect(); + let rsigs: Vec<&Signature> = sigs + .iter() + .map(|x| SourmashSignature::as_rust(*x)) + .collect(); let mut buffer = vec![]; { - let mut writer = if compression > 0 { - let level = match compression { - 1 => niffler::compression::Level::One, - 2 => niffler::compression::Level::Two, - 3 => niffler::compression::Level::Three, - 4 => niffler::compression::Level::Four, - 5 => niffler::compression::Level::Five, - 6 => niffler::compression::Level::Six, - 7 => niffler::compression::Level::Seven, - 8 => niffler::compression::Level::Eight, - _ => niffler::compression::Level::Nine, - }; - - niffler::get_writer(Box::new(&mut buffer), - niffler::compression::Format::Gzip, - level)? - } else { - Box::new(&mut buffer) - }; - serde_json::to_writer(&mut writer, &rsigs)?; + let mut writer = if compression > 0 { + let level = match compression { + 1 => niffler::compression::Level::One, + 2 => niffler::compression::Level::Two, + 3 => niffler::compression::Level::Three, + 4 => niffler::compression::Level::Four, + 5 => niffler::compression::Level::Five, + 6 => niffler::compression::Level::Six, + 7 => niffler::compression::Level::Seven, + 8 => niffler::compression::Level::Eight, + _ => niffler::compression::Level::Nine, + }; + + niffler::get_writer( + Box::new(&mut buffer), + niffler::compression::Format::Gzip, + level, + )? + } else { + Box::new(&mut buffer) + }; + serde_json::to_writer(&mut writer, &rsigs)?; } let b = buffer.into_boxed_slice(); diff --git a/src/core/src/ffi/storage.rs b/src/core/src/ffi/storage.rs index 86d3834201..882a8d5f20 100644 --- a/src/core/src/ffi/storage.rs +++ b/src/core/src/ffi/storage.rs @@ -1,5 +1,6 @@ use std::os::raw::c_char; use std::slice; +use std::sync::Arc; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::prelude::*; @@ -8,7 +9,7 @@ use crate::storage::ZipStorage; pub struct SourmashZipStorage; impl ForeignObject for SourmashZipStorage { - type RustObject = ZipStorage; + type RustObject = Arc; } ffi_fn! { @@ -20,7 +21,7 @@ unsafe fn zipstorage_new(ptr: *const c_char, insize: usize) -> Result<*mut Sourm }; let zipstorage = ZipStorage::from_file(path)?; - Ok(SourmashZipStorage::from_rust(zipstorage)) + Ok(SourmashZipStorage::from_rust(Arc::new(zipstorage))) } } @@ -110,7 +111,7 @@ unsafe fn zipstorage_set_subdir( std::str::from_utf8(path)? 
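// Arc::get_mut (used in the replacement line below) returns Some only while
// this Arc is the sole owner of the ZipStorage, so the unwrap() panics if
// any clone of the handle is still alive; set_subdir can only be called
// safely before the storage is shared.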
}; - storage.set_subdir(path.to_string()); + (*Arc::get_mut(storage).unwrap()).set_subdir(path.to_string()); Ok(()) } } diff --git a/src/core/src/from.rs b/src/core/src/from.rs index dfc384236e..95c5aa5fcd 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -2,6 +2,7 @@ use finch::sketch_schemes::mash::MashSketcher; use finch::sketch_schemes::SketchScheme; use crate::encodings::HashFunctions; +use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; /* diff --git a/src/core/src/index/bigsi.rs b/src/core/src/index/bigsi.rs deleted file mode 100644 index 0e45348fc7..0000000000 --- a/src/core/src/index/bigsi.rs +++ /dev/null @@ -1,218 +0,0 @@ -use std::collections::HashMap; -use std::path::Path; - -use fixedbitset::FixedBitSet; -use thiserror::Error; -use typed_builder::TypedBuilder; - -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::Error; -use crate::HashIntoType; - -#[derive(Clone, TypedBuilder)] -pub struct BIGSI { - matrix: Vec, - ksize: usize, - datasets: Vec, - //#[builder(setter(skip))] - //storage: Rc, -} - -#[derive(Debug, Error)] -pub enum BIGSIError { - #[error("BIGSI doesn't support this method")] - MethodDisabled, -} - -impl BIGSI { - pub fn new(bf_size: usize, ksize: usize) -> BIGSI { - let mut matrix = Vec::with_capacity(bf_size); - for _ in 0..bf_size { - // TODO: figure initial capacity for each row - matrix.push(FixedBitSet::with_capacity(100)); - } - - BIGSI { - matrix, - ksize, - datasets: Vec::new(), - } - } -} - -impl BIGSI { - pub fn add(&mut self, dataset: Signature) { - let mut ng = Nodegraph::new(&[self.matrix.len()], self.ksize); - - // TODO: select correct minhash - if let Sketch::MinHash(mh) = &dataset.signatures[0] { - for h in mh.mins() { - ng.count(h); - } - } else { - // TODO: what if it is not a mh? - unimplemented!() - } - - self.datasets.push(dataset); - let col = self.datasets.len() - 1; - - let bs = ng.into_bitsets(); - for pos in bs[0].ones() { - let bs = &mut self.matrix[pos]; - if bs.len() == col { - bs.grow(col + col / 2); - } - bs.insert(col); - } - } - - pub fn query(&self, hash: HashIntoType) -> impl Iterator + '_ { - let pos = hash as usize % self.matrix.len(); - let bs = &self.matrix[pos]; - bs.ones() - } - - pub fn query_datasets(&self, hash: HashIntoType) -> impl Iterator + '_ { - self.query(hash).map(move |pos| self.datasets[pos].clone()) - } -} - -impl<'a> Index<'a> for BIGSI { - type Item = Signature; - //type SignatureIterator = std::slice::Iter<'a, Self::Item>; - - fn search( - &self, - sig: &Self::Item, - threshold: f64, - containment: bool, - ) -> Result, Error> { - let mut results = Vec::new(); - - //TODO: still assuming one mh in the signature! - if let Sketch::MinHash(hashes) = &sig.signatures[0] { - let mut counter: HashMap = HashMap::with_capacity(hashes.size()); - - for hash in hashes.mins() { - self.query(hash).for_each(|dataset_idx| { - let idx = counter.entry(dataset_idx).or_insert(0); - *idx += 1; - }); - } - - for (idx, count) in counter { - let match_sig = &self.datasets[idx]; - //TODO: still assuming one mh in the signature! - let match_mh = match_sig.signatures[0].size(); - - let score = if containment { - count as f64 / hashes.size() as f64 - } else { - count as f64 / (hashes.size() + match_mh - count) as f64 - }; - - if score >= threshold { - results.push(match_sig) - } - } - - Ok(results) - } else { - // TODO: what if it is not a minhash? 
- unimplemented!() - } - } - - fn insert(&mut self, node: Self::Item) -> Result<(), Error> { - self.add(node); - Ok(()) - } - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - unimplemented!() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - unimplemented!() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.datasets.iter() - } - */ -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::BufReader; - use std::path::PathBuf; - - use super::BIGSI; - - use crate::index::SigStore; - use crate::index::{Index, MHBT}; - use crate::signature::Signature; - - #[test] - fn bigsi_sbt_oracle() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut bigsi = BIGSI::new(10000, 10); - let datasets = sbt.signatures(); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - for l in datasets { - bigsi.insert(l).expect("insertion error!"); - } - - let results_sbt = sbt.search(&leaf, 0.5, false).unwrap(); - assert_eq!(results_sbt.len(), 1); - - let data = leaf.data.get().unwrap(); - let results_bigsi = bigsi.search(data, 0.5, false).unwrap(); - assert_eq!(results_bigsi.len(), 1); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - - let results_sbt = sbt.search(&leaf, 0.1, false).unwrap(); - assert_eq!(results_sbt.len(), 2); - - let data = leaf.data.get().unwrap(); - let results_bigsi = bigsi.search(data, 0.1, false).unwrap(); - assert_eq!(results_bigsi.len(), 2); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - } -} diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 78b2c6f1f5..6ae2916f16 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -6,18 +6,18 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; +use crate::index::{DatasetInfo, Index, SigStore}; use crate::prelude::*; use crate::storage::{FSStorage, InnerStorage, Storage, StorageInfo}; use crate::Error; #[derive(TypedBuilder)] -pub struct LinearIndex { +pub struct LinearIndex { #[builder(default)] storage: Option, #[builder(default)] - datasets: Vec>, + datasets: Vec, } #[derive(Serialize, Deserialize)] @@ -27,15 +27,11 @@ struct LinearInfo { leaves: Vec, } -impl<'a, L> Index<'a> for LinearIndex -where - L: Clone + Comparable + 'a, - SigStore: From, -{ - type Item = L; +impl<'a> Index<'a> for LinearIndex { + type Item = Signature; //type SignatureIterator = std::slice::Iter<'a, Self::Item>; - fn insert(&mut self, node: L) -> Result<(), Error> { + fn insert(&mut self, node: Self::Item) -> Result<(), Error> { self.datasets.push(node.into()); Ok(()) } @@ -76,11 +72,7 @@ where */ } -impl LinearIndex -where - L: ToWriter, - SigStore: ReadData, -{ +impl LinearIndex { pub fn save_file>( &mut self, path: P, @@ -115,7 +107,7 @@ where .iter_mut() .map(|l| { // Trigger data loading - 
let _: &L = (*l).data().unwrap(); + let _: &Signature = (*l).data().unwrap(); // set storage to new one l.storage = Some(storage.clone()); @@ -137,7 +129,7 @@ where Ok(()) } - pub fn from_path>(path: P) -> Result, Error> { + pub fn from_path>(path: P) -> Result { let file = File::open(&path)?; let mut reader = BufReader::new(file); @@ -147,11 +139,11 @@ where basepath.push(path); basepath.canonicalize()?; - let linear = LinearIndex::::from_reader(&mut reader, basepath.parent().unwrap())?; + let linear = LinearIndex::from_reader(&mut reader, basepath.parent().unwrap())?; Ok(linear) } - pub fn from_reader(rdr: R, path: P) -> Result, Error> + pub fn from_reader(rdr: R, path: P) -> Result where R: Read, P: AsRef, @@ -171,7 +163,7 @@ where .leaves .into_iter() .map(|l| { - let mut v: SigStore = l.into(); + let mut v: SigStore = l.into(); v.storage = Some(storage.clone()); v }) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 4e43074ebe..5b0e16f82e 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -3,10 +3,10 @@ //! An index organizes signatures to allow for fast similarity search. //! Some indices also support containment searches. -pub mod bigsi; pub mod linear; + +#[cfg(feature = "mastiff")] pub mod revindex; -pub mod sbt; pub mod search; @@ -17,27 +17,84 @@ use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; +use crate::encodings::HashFunctions; use crate::errors::ReadDataError; -use crate::index::sbt::{Node, SBT}; use crate::index::search::{search_minhashes, search_minhashes_containment}; +use crate::picklist::Picklist; use crate::prelude::*; use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; use crate::sketch::Sketch; use crate::storage::{InnerStorage, Storage}; use crate::Error; -pub type MHBT = SBT, Signature>; +#[derive(Default)] +pub struct Selection { + ksize: Option, + abund: Option, + num: Option, + scaled: Option, + containment: Option, + moltype: Option, + picklist: Option, +} + +impl Selection { + pub fn ksize(&self) -> Option { + self.ksize + } + + pub fn set_ksize(&mut self, ksize: u32) { + self.ksize = Some(ksize); + } + + pub fn abund(&self) -> Option { + self.abund + } + + pub fn set_abund(&mut self, value: bool) { + self.abund = Some(value); + } + + pub fn num(&self) -> Option { + self.num + } + + pub fn set_num(&mut self, num: u32) { + self.num = Some(num); + } + + pub fn scaled(&self) -> Option { + self.scaled + } + + pub fn set_scaled(&mut self, scaled: u32) { + self.scaled = Some(scaled); + } + + pub fn containment(&self) -> Option { + self.containment + } + + pub fn set_containment(&mut self, containment: bool) { + self.containment = Some(containment); + } + + pub fn moltype(&self) -> Option { + self.moltype + } + + pub fn set_moltype(&mut self, value: HashFunctions) { + self.moltype = Some(value); + } + + pub fn picklist(&self) -> Option { + self.picklist.clone() + } -/* FIXME: bring back after MQF works on macOS and Windows -use cfg_if::cfg_if; -cfg_if! 
{ - if #[cfg(not(target_arch = "wasm32"))] { - use mqf::MQF; - pub type MHMT = SBT, Signature>; + pub fn set_picklist(&mut self, value: Picklist) { + self.picklist = Some(value); } } -*/ pub trait Index<'a> { type Item: Comparable; @@ -132,7 +189,7 @@ pub struct DatasetInfo { } #[derive(TypedBuilder, Default, Clone)] -pub struct SigStore { +pub struct SigStore { #[builder(setter(into))] filename: String, @@ -145,16 +202,16 @@ pub struct SigStore { storage: Option, #[builder(setter(into), default)] - data: OnceCell, + data: OnceCell, } -impl SigStore { +impl SigStore { pub fn name(&self) -> String { self.name.clone() } } -impl std::fmt::Debug for SigStore { +impl std::fmt::Debug for SigStore { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, @@ -164,7 +221,7 @@ impl std::fmt::Debug for SigStore { } } -impl ReadData for SigStore { +impl ReadData for SigStore { fn data(&self) -> Result<&Signature, Error> { if let Some(sig) = self.data.get() { Ok(sig) @@ -188,8 +245,25 @@ impl ReadData for SigStore { } } -impl SigStore { - pub fn count_common(&self, other: &SigStore) -> u64 { +impl SigStore { + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) + } else { + unimplemented!() + } + } else { + unimplemented!() + } + } +} + +impl SigStore { + pub fn count_common(&self, other: &SigStore) -> u64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -216,13 +290,13 @@ impl SigStore { } } -impl From> for Signature { - fn from(other: SigStore) -> Signature { +impl From for Signature { + fn from(other: SigStore) -> Signature { other.data.get().unwrap().to_owned() } } -impl Deref for SigStore { +impl Deref for SigStore { type Target = Signature; fn deref(&self) -> &Signature { @@ -230,8 +304,8 @@ impl Deref for SigStore { } } -impl From for SigStore { - fn from(other: Signature) -> SigStore { +impl From for SigStore { + fn from(other: Signature) -> SigStore { let name = other.name(); let filename = other.filename(); @@ -245,8 +319,8 @@ impl From for SigStore { } } -impl Comparable> for SigStore { - fn similarity(&self, other: &SigStore) -> f64 { +impl Comparable for SigStore { + fn similarity(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -269,7 +343,7 @@ impl Comparable> for SigStore { unimplemented!() } - fn containment(&self, other: &SigStore) -> f64 { + fn containment(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -321,8 +395,8 @@ impl Comparable for Signature { } } -impl From for SigStore { - fn from(other: DatasetInfo) -> SigStore { +impl From for SigStore { + fn from(other: DatasetInfo) -> SigStore { SigStore { filename: other.filename, name: other.name, diff --git a/src/core/src/index/revindex.rs b/src/core/src/index/revindex.rs deleted file mode 100644 index 0a1fc25d18..0000000000 --- a/src/core/src/index/revindex.rs +++ /dev/null @@ -1,699 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicUsize, Ordering}; - -use getset::{CopyGetters, Getters, Setters}; -use log::{debug, info}; -use nohash_hasher::BuildNoHashHasher; -use serde::{Deserialize, Serialize}; - -#[cfg(feature = "parallel")] -use rayon::prelude::*; - -use 
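// The Selection type added to index/mod.rs above is a plain bag of optional
// filters. Typical use, as a sketch (not code from this patch):
//
//     let mut sel = Selection::default();
//     sel.set_ksize(31);
//     sel.set_scaled(1000);
//     sel.set_abund(false);
//     // hand &sel to whatever index/collection query honors these filters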
crate::encodings::{Color, Colors, Idx}; -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; -use crate::sketch::Sketch; -use crate::Error; -use crate::HashIntoType; - -type SigCounter = counter::Counter; - -#[derive(Serialize, Deserialize)] -struct HashToColor(HashMap>); - -impl HashToColor { - fn new() -> Self { - HashToColor(HashMap::< - HashIntoType, - Color, - BuildNoHashHasher, - >::with_hasher(BuildNoHashHasher::default())) - } - - fn get(&self, hash: &HashIntoType) -> Option<&Color> { - self.0.get(hash) - } - - fn retain(&mut self, hashes: &HashSet) { - self.0.retain(|hash, _| hashes.contains(hash)) - } - - fn len(&self) -> usize { - self.0.len() - } - - fn is_empty(&self) -> bool { - self.0.is_empty() - } - - fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { - let mut color = None; - - matched_hashes.into_iter().for_each(|hash| { - color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); - self.0.insert(hash, color.unwrap()); - }); - } - - fn reduce_hashes_colors( - a: (HashToColor, Colors), - b: (HashToColor, Colors), - ) -> (HashToColor, Colors) { - let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = - if a.0.len() > b.0.len() { - (b, a) - } else { - (a, b) - }; - - small_hashes.0.into_iter().for_each(|(hash, color)| { - large_hashes - .0 - .entry(hash) - .and_modify(|entry| { - // Hash is already present. - // Update the current color by adding the indices from - // small_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(Some(*entry), ids).unwrap(); - *entry = new_color; - }) - .or_insert_with(|| { - // In this case, the hash was not present yet. - // we need to create the same color from small_colors - // into large_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(None, ids).unwrap(); - assert_eq!(new_color, color); - new_color - }); - }); - - (large_hashes, large_colors) - } -} - -// Use rkyv for serialization? -// https://davidkoloski.me/rkyv/ -#[derive(Serialize, Deserialize)] -pub struct RevIndex { - hash_to_color: HashToColor, - - sig_files: Vec, - - #[serde(skip)] - ref_sigs: Option>, - - template: Sketch, - colors: Colors, - //#[serde(skip)] - //storage: Option, -} - -impl RevIndex { - pub fn load>( - index_path: P, - queries: Option<&[KmerMinHash]>, - ) -> Result> { - let (rdr, _) = niffler::from_path(index_path)?; - let revindex = if let Some(qs) = queries { - // TODO: avoid loading full revindex if query != None - /* - struct PartialRevIndex { - hashes_to_keep: Option>, - marker: PhantomData T>, - } - - impl PartialRevIndex { - pub fn new(hashes_to_keep: HashSet) -> Self { - PartialRevIndex { - hashes_to_keep: Some(hashes_to_keep), - marker: PhantomData, - } - } - } - */ - - let mut hashes: HashSet = HashSet::new(); - for q in qs { - hashes.extend(q.iter_mins()); - } - - //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); - - let mut revindex: RevIndex = serde_json::from_reader(rdr)?; - revindex.hash_to_color.retain(&hashes); - revindex - } else { - // Load the full revindex - serde_json::from_reader(rdr)? 
- }; - - Ok(revindex) - } - - pub fn new( - search_sigs: &[PathBuf], - template: &Sketch, - threshold: usize, - queries: Option<&[KmerMinHash]>, - keep_sigs: bool, - ) -> RevIndex { - // If threshold is zero, let's merge all queries and save time later - let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sig_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sig_iter = search_sigs.iter(); - - let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } - - let search_sig = Signature::from_path(filename) - .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) - .swap_remove(0); - - RevIndex::map_hashes_colors( - dataset_id, - &search_sig, - queries, - &merged_query, - threshold, - template, - ) - }); - - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - // TODO: build this together with hash_to_idx? - let ref_sigs = if keep_sigs { - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - Some( - sigs_iter - .map(|ref_path| { - Signature::from_path(ref_path) - .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path)) - .swap_remove(0) - }) - .collect(), - ) - } else { - None - }; - - RevIndex { - hash_to_color, - sig_files: search_sigs.into(), - ref_sigs, - template: template.clone(), - colors, - // storage: Some(InnerStorage::new(MemStorage::default())), - } - } - - fn merge_queries(qs: &[KmerMinHash], threshold: usize) -> Option { - if threshold == 0 { - let mut merged = qs[0].clone(); - for query in &qs[1..] 
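// In merge_queries: when threshold == 0, every query hash is relevant, so
// all queries are merged into a single sketch up front and later
// intersections run once against the merged query instead of once per query.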
{ - merged.merge(query).unwrap(); - } - Some(merged) - } else { - None - } - } - - pub fn new_with_sigs( - search_sigs: Vec, - template: &Sketch, - threshold: usize, - queries: Option<&[KmerMinHash]>, - ) -> RevIndex { - // If threshold is zero, let's merge all queries and save time later - let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - let filtered_sigs = sigs_iter.enumerate().filter_map(|(dataset_id, sig)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } - - RevIndex::map_hashes_colors( - dataset_id, - sig, - queries, - &merged_query, - threshold, - template, - ) - }); - - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - RevIndex { - hash_to_color, - sig_files: vec![], - ref_sigs: search_sigs.into(), - template: template.clone(), - colors, - //storage: None, - } - } - - fn map_hashes_colors( - dataset_id: usize, - search_sig: &Signature, - queries: Option<&[KmerMinHash]>, - merged_query: &Option, - threshold: usize, - template: &Sketch, - ) -> Option<(HashToColor, Colors)> { - let mut search_mh = None; - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - search_mh = Some(mh); - } - - let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); - let mut hash_to_color = HashToColor::new(); - let mut colors = Colors::default(); - - if let Some(qs) = queries { - if let Some(ref merged) = merged_query { - let (matched_hashes, intersection) = merged.intersection(search_mh).unwrap(); - if !matched_hashes.is_empty() || intersection > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); - } - } else { - for query in qs { - let (matched_hashes, intersection) = query.intersection(search_mh).unwrap(); - if !matched_hashes.is_empty() || intersection > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); - } - } - } - } else { - let matched = search_mh.mins(); - let size = matched.len() as u64; - if !matched.is_empty() || size > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched); - } - }; - - if hash_to_color.is_empty() { - None - } else { - Some((hash_to_color, colors)) - } - } - - pub fn search( - &self, - counter: SigCounter, - similarity: bool, - threshold: usize, - ) -> Result, Box> { - let mut matches = vec![]; - if similarity { - unimplemented!("TODO: threshold correction") - } - - for (dataset_id, size) in counter.most_common() { - if size >= threshold { - matches.push(self.sig_files[dataset_id as usize].to_str().unwrap().into()); - } else { - break; - }; - } - Ok(matches) - } - - pub fn gather( - &self, - mut counter: SigCounter, - threshold: usize, - query: &KmerMinHash, - ) -> Result, Box> { - let mut match_size = usize::max_value(); - let mut matches = vec![]; - - while match_size > threshold && !counter.is_empty() { - let (dataset_id, size) = counter.most_common()[0]; - match_size = if size >= threshold { size } else { break }; - - let p; - let match_path = if 
self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? - &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; - - let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { - match_mh = Some(mh); - } - let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); - - // Calculate stats - let f_orig_query = match_size as f64 / query.size() as f64; - let f_match = match_size as f64 / match_mh.size() as f64; - let filename = match_path.to_str().unwrap().into(); - let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; - let gather_result_rank = matches.len(); - - let (intersect_orig, _) = match_mh.intersection_size(query)?; - let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - - let f_unique_to_query = intersect_orig as f64 / query.size() as f64; - let match_ = match_sig.clone(); - - // TODO: all of these - let f_unique_weighted = 0.; - let average_abund = 0; - let median_abund = 0; - let std_abund = 0; - let md5 = "".into(); - let f_match_orig = 0.; - let remaining_bp = 0; - - let result = GatherResult { - intersect_bp, - f_orig_query, - f_match, - f_unique_to_query, - f_unique_weighted, - average_abund, - median_abund, - std_abund, - filename, - name, - md5, - match_, - f_match_orig, - unique_intersect_bp, - gather_result_rank, - remaining_bp, - }; - matches.push(result); - - // Prepare counter for finding the next match by decrementing - // all hashes found in the current match in other datasets - for hash in match_mh.iter_mins() { - if let Some(color) = self.hash_to_color.get(hash) { - for dataset in self.colors.indices(color) { - counter.entry(*dataset).and_modify(|e| { - if *e > 0 { - *e -= 1 - } - }); - } - } - } - counter.remove(&dataset_id); - } - Ok(matches) - } - - pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { - query - .iter_mins() - .filter_map(|hash| self.hash_to_color.get(hash)) - .flat_map(|color| self.colors.indices(color)) - .cloned() - .collect() - } - - pub fn template(&self) -> Sketch { - self.template.clone() - } - - // TODO: mh should be a sketch, or even a sig... 
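// counter_for_query above is the heart of the inverted index: each query min
// is looked up in hash_to_color, each color expands back into its dataset
// indices, and collecting the result tallies one count per (hash, dataset)
// hit in the SigCounter.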
- pub(crate) fn find_signatures( - &self, - mh: &KmerMinHash, - threshold: f64, - containment: bool, - _ignore_scaled: bool, - ) -> Result, Error> { - /* - let template_mh = None; - if let Sketch::MinHash(mh) = self.template { - template_mh = Some(mh); - }; - // TODO: throw error - let template_mh = template_mh.unwrap(); - - let tmp_mh; - let mh = if template_mh.scaled() > mh.scaled() { - // TODO: proper error here - tmp_mh = mh.downsample_scaled(self.scaled)?; - &tmp_mh - } else { - mh - }; - - if self.scaled < mh.scaled() && !ignore_scaled { - return Err(LcaDBError::ScaledMismatchError { - db: self.scaled, - query: mh.scaled(), - } - .into()); - } - */ - - // TODO: proper threshold calculation - let threshold: usize = (threshold * (mh.size() as f64)) as _; - - let counter = self.counter_for_query(mh); - - debug!( - "number of matching signatures for hashes: {}", - counter.len() - ); - - let mut results = vec![]; - for (dataset_id, size) in counter.most_common() { - let match_size = if size >= threshold { size } else { break }; - - let p; - let match_path = if self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? - &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; - - let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { - match_mh = Some(mh); - } - let match_mh = match_mh.unwrap(); - - if size >= threshold { - let score = if containment { - size as f64 / mh.size() as f64 - } else { - size as f64 / (mh.size() + match_size - size) as f64 - }; - let filename = match_path.to_str().unwrap().into(); - let mut sig = match_sig.clone(); - sig.reset_sketches(); - sig.push(Sketch::MinHash(match_mh.clone())); - results.push((score, sig, filename)); - } else { - break; - }; - } - Ok(results) - } -} - -#[derive(CopyGetters, Getters, Setters, Serialize, Deserialize, Debug)] -pub struct GatherResult { - #[getset(get_copy = "pub")] - intersect_bp: usize, - - #[getset(get_copy = "pub")] - f_orig_query: f64, - - #[getset(get_copy = "pub")] - f_match: f64, - - f_unique_to_query: f64, - f_unique_weighted: f64, - average_abund: usize, - median_abund: usize, - std_abund: usize, - - #[getset(get = "pub")] - filename: String, - - #[getset(get = "pub")] - name: String, - - md5: String, - match_: Signature, - f_match_orig: f64, - unique_intersect_bp: usize, - gather_result_rank: usize, - remaining_bp: usize, -} - -impl GatherResult { - pub fn get_match(&self) -> Signature { - self.match_.clone() - } -} - -impl<'a> Index<'a> for RevIndex { - type Item = Signature; - - fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { - unimplemented!() - } - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn len(&self) -> usize { - if let Some(refs) = &self.ref_sigs { - refs.len() - } else { - self.sig_files.len() - } - } - - fn signatures(&self) -> Vec { - if let Some(ref sigs) = self.ref_sigs { - sigs.to_vec() - } else { - unimplemented!() - } - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - unimplemented!() - } -} - -#[cfg(test)] -mod test { - use super::*; - - use crate::sketch::minhash::max_hash_for_scaled; - - #[test] - fn revindex_new() { - let max_hash = max_hash_for_scaled(10000); - let template = 
Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); - let search_sigs = [ - "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), - ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false); - assert_eq!(index.colors.len(), 3); - } - - #[test] - fn revindex_many() { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); - let search_sigs = [ - "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), - ]; - - let index = RevIndex::new(&search_sigs, &template, 0, None, false); - /* - dbg!(&index.colors.colors); - 0: 86 - 1: 132 - 2: 91 - (0, 1): 53 - (0, 2): 90 - (1, 2): 26 - (0, 1, 2): 261 - union: 739 - */ - //assert_eq!(index.colors.len(), 3); - assert_eq!(index.colors.len(), 7); - } -} diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs new file mode 100644 index 0000000000..2cf1a1f890 --- /dev/null +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -0,0 +1,549 @@ +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use byteorder::{LittleEndian, WriteBytesExt}; +use log::{info, trace}; +use rayon::prelude::*; +use rocksdb::{ColumnFamilyDescriptor, MergeOperands, Options}; + +use crate::index::revindex::mem_revindex::GatherResult; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{KmerMinHash, MinHashOps}; +use crate::sketch::Sketch; + +use crate::index::revindex::prepare_query; +use crate::index::revindex::{ + self as module, sig_save_to_db, stats_for_cf, Color, DatasetID, Datasets, HashToColor, + QueryColors, SigCounter, SignatureData, DB, HASHES, SIGS, +}; + +fn compute_color(idxs: &Datasets) -> Color { + let s = BuildHasherDefault::::default(); + let mut hasher = s.build_hasher(); + /* + // TODO: remove this... + let mut sorted: Vec<_> = idxs.iter().collect(); + sorted.sort(); + */ + idxs.hash(&mut hasher); + hasher.finish() +} + +#[derive(Debug, Clone)] +pub struct RevIndex { + db: Arc, +} + +fn merge_datasets( + _: &[u8], + existing_val: Option<&[u8]>, + operands: &MergeOperands, +) -> Option> { + let mut datasets = existing_val + .and_then(Datasets::from_slice) + .unwrap_or_default(); + + for op in operands { + let new_vals = Datasets::from_slice(op).unwrap(); + datasets.union(new_vals); + } + // TODO: optimization! 
if nothing changed, skip as_bytes() + datasets.as_bytes() +} + +/* TODO: need the repair_cf variant, not available in rocksdb-rust yet +pub fn repair(path: &Path) { + let opts = db_options(); + + DB::repair(&opts, path).unwrap() +} +*/ + +impl RevIndex { + pub fn create(path: &Path) -> module::RevIndex { + let mut opts = module::RevIndex::db_options(); + opts.create_if_missing(true); + opts.create_missing_column_families(true); + + // prepare column family descriptors + let cfs = cf_descriptors(); + + let db = Arc::new(DB::open_cf_descriptors(&opts, path, cfs).unwrap()); + + module::RevIndex::Plain(Self { db }) + } + + pub fn open(path: &Path, read_only: bool) -> module::RevIndex { + let opts = module::RevIndex::db_options(); + + // prepare column family descriptors + let cfs = cf_descriptors(); + + let db = if read_only { + Arc::new(DB::open_cf_descriptors_read_only(&opts, path, cfs, false).unwrap()) + } else { + Arc::new(DB::open_cf_descriptors(&opts, path, cfs).unwrap()) + }; + + module::RevIndex::Plain(Self { db }) + } + + fn map_hashes_colors( + &self, + dataset_id: DatasetID, + filename: &PathBuf, + threshold: f64, + template: &Sketch, + save_paths: bool, + ) { + let search_sig = Signature::from_path(filename) + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + let search_mh = + prepare_query(&search_sig, template).expect("Couldn't find a compatible MinHash"); + + let colors = Datasets::new(&[dataset_id]).as_bytes().unwrap(); + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + let matched = search_mh.mins(); + let size = matched.len() as u64; + if !matched.is_empty() || size > threshold as u64 { + // FIXME threshold is f64 + let mut hash_bytes = [0u8; 8]; + for hash in matched { + (&mut hash_bytes[..]) + .write_u64::(hash) + .expect("error writing bytes"); + self.db + .merge_cf(&cf_hashes, &hash_bytes[..], colors.as_slice()) + .expect("error merging"); + } + } + + sig_save_to_db( + self.db.clone(), + search_sig, + search_mh, + size, + threshold, + save_paths, + filename, + dataset_id, + ); + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + info!("Collecting hashes"); + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + info!("Multi get"); + self.db + .multi_get_cf(hashes_iter) + .into_iter() + .filter_map(|r| r.ok().unwrap_or(None)) + .flat_map(|raw_datasets| { + let new_vals = Datasets::from_slice(&raw_datasets).unwrap(); + new_vals.into_iter() + }) + .collect() + } + + pub fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor) { + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + /* + build a HashToColors for query, + and a QueryColors (Color -> Datasets) mapping. + Loading Datasets from rocksdb for every hash takes too long. 
+ */ + let mut query_colors: QueryColors = Default::default(); + let mut counter: SigCounter = Default::default(); + + info!("Building hash_to_colors and query_colors"); + let hash_to_colors = query + .iter_mins() + .zip(self.db.multi_get_cf(hashes_iter)) + .filter_map(|(k, r)| { + let raw = r.ok().unwrap_or(None); + raw.map(|raw| { + let new_vals = Datasets::from_slice(&raw).unwrap(); + let color = compute_color(&new_vals); + query_colors + .entry(color) + .or_insert_with(|| new_vals.clone()); + counter.update(new_vals); + (*k, color) + }) + }) + .collect(); + + (counter, query_colors, hash_to_colors) + } + + pub fn matches_from_counter( + &self, + counter: SigCounter, + threshold: usize, + ) -> Vec<(String, usize)> { + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + + let matches_iter = counter + .most_common() + .into_iter() + .filter_map(|(dataset_id, size)| { + if size >= threshold { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + Some((&cf_sigs, v, size)) + } else { + None + } + }); + + let matches_sizes = matches_iter.clone().map(|(_, _, v)| v); + + info!("Multi get matches"); + self.db + .multi_get_cf(matches_iter.map(|(k, v, _)| (k, v))) + .into_iter() + .zip(matches_sizes) + .filter_map(|(r, size)| r.ok().unwrap_or(None).map(|v| (v, size))) + .filter_map( + |(sigdata, size)| match SignatureData::from_slice(&sigdata).unwrap() { + SignatureData::Empty => None, + SignatureData::External(p) => Some((p, size)), + SignatureData::Internal(sig) => Some((sig.name(), size)), + }, + ) + .collect() + } + + pub fn gather( + &self, + mut counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + orig_query: &KmerMinHash, + template: &Sketch, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + let mut key_bytes = [0u8; 8]; + //let mut query: KmerMinHashBTree = orig_query.clone().into(); + + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + + while match_size > threshold && !counter.is_empty() { + trace!("counter len: {}", counter.len()); + trace!("match size: {}", match_size); + + let (dataset_id, size) = counter.k_most_common_ordered(1)[0]; + match_size = if size >= threshold { size } else { break }; + + (&mut key_bytes[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + + let match_sig = self + .db + .get_cf(&cf_sigs, &key_bytes[..]) + .ok() + .map( + |sigdata| match SignatureData::from_slice(&(sigdata.unwrap())).unwrap() { + SignatureData::Empty => todo!("throw error, empty sig"), + SignatureData::External(_p) => todo!("Load from external"), + SignatureData::Internal(sig) => sig, + }, + ) + .unwrap_or_else(|| panic!("Unknown dataset {}", dataset_id)); + + let match_mh = + prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / orig_query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize * match_size; + let gather_result_rank = matches.len(); + + let (intersect_orig, _) = match_mh.intersection_size(orig_query)?; + let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; + + let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64; + let match_ = match_sig.clone(); + let md5 = match_sig.md5sum(); + + // TODO: all of these + let filename = "".into(); + let f_unique_weighted = 0.; + let average_abund = 0; + let 
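// Placeholder stats: everything from f_unique_weighted down is hard-coded to
// zero for now (see the TODO above); only the containment-style fractions
// computed earlier in this block are meaningful.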
median_abund = 0; + let std_abund = 0; + let f_match_orig = 0.; + let remaining_bp = 0; + + let result = GatherResult::builder() + .intersect_bp(intersect_bp) + .f_orig_query(f_orig_query) + .f_match(f_match) + .f_unique_to_query(f_unique_to_query) + .f_unique_weighted(f_unique_weighted) + .average_abund(average_abund) + .median_abund(median_abund) + .std_abund(std_abund) + .filename(filename) + .name(name) + .md5(md5) + .match_(match_) + .f_match_orig(f_match_orig) + .unique_intersect_bp(unique_intersect_bp) + .gather_result_rank(gather_result_rank) + .remaining_bp(remaining_bp) + .build(); + matches.push(result); + + trace!("Preparing counter for next round"); + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: not used at the moment, so just skip. + //query.remove_many(match_mh.to_vec().as_slice())?; + + // TODO: Use HashesToColors here instead. If not initialized, + // build it. + match_mh + .iter_mins() + .filter_map(|hash| hash_to_color.get(hash)) + .flat_map(|color| { + // TODO: remove this clone + query_colors.get(color).unwrap().clone().into_iter() + }) + .for_each(|dataset| { + // TODO: collect the flat_map into a Counter, and remove more + // than one at a time... + counter.entry(dataset).and_modify(|e| { + if *e > 0 { + *e -= 1 + } + }); + }); + + counter.remove(&dataset_id); + } + Ok(matches) + } + + pub fn index( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + let processed_sigs = AtomicUsize::new(0); + + index_sigs + .par_iter() + .enumerate() + .for_each(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + self.map_hashes_colors( + dataset_id as DatasetID, + filename, + threshold, + template, + save_paths, + ); + }); + info!("Processed {} reference sigs", processed_sigs.into_inner()); + } + + pub fn update( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + use byteorder::ReadBytesExt; + + if !save_paths { + todo!("only supports with save_paths=True for now"); + } + + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + + info!("Verifying existing sigs"); + // verify data match up to this point + let mut max_dataset_id = 0; + let to_skip = iter + .map(|result| { + let (key, value) = result.unwrap(); + let current_dataset_id = (&key[..]).read_u64::().unwrap(); + + let filename = &index_sigs[current_dataset_id as usize]; + let sig_data = SignatureData::from_slice(&value).unwrap(); + match sig_data { + SignatureData::External(sig) => { + assert_eq!(sig, filename.as_os_str().to_str().unwrap().to_string()) + } + SignatureData::Empty => (), + SignatureData::Internal(_) => { + todo!("only supports with save_paths=True for now") + } + }; + max_dataset_id = max_dataset_id.max(current_dataset_id); + }) + .count(); + + max_dataset_id += 1; + assert_eq!(max_dataset_id as usize, to_skip); + + // process the remainder + let processed_sigs = AtomicUsize::new(0); + + index_sigs + .par_iter() + .skip(to_skip) + .enumerate() + .for_each(|(i, filename)| { + let dataset_id = i + to_skip; + + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + self.map_hashes_colors( + dataset_id as DatasetID, + filename, + threshold, + template, + save_paths, + ); + }); + + info!( + "Processed 
additional {} reference sigs", + processed_sigs.into_inner() + ); + } + + pub fn check(&self, quick: bool) { + stats_for_cf(self.db.clone(), HASHES, true, quick); + info!(""); + stats_for_cf(self.db.clone(), SIGS, false, quick); + } + + pub fn compact(&self) { + for cf_name in [HASHES, SIGS] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.compact_range_cf(&cf, None::<&[u8]>, None::<&[u8]>) + } + } + + pub fn flush(&self) -> Result<(), Box> { + self.db.flush_wal(true)?; + + for cf_name in [HASHES, SIGS] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.flush_cf(&cf)?; + } + + Ok(()) + } + + pub fn convert(&self, _output_db: module::RevIndex) -> Result<(), Box> { + todo!() + /* + if let RevIndex::Color(db) = output_db { + let other_db = db.db; + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + info!("start converting colors"); + let mut color_bytes = [0u8; 8]; + let iter = self + .db + .iterator_cf(&cf_hashes, rocksdb::IteratorMode::Start); + for (key, value) in iter { + let datasets = Datasets::from_slice(&value).unwrap(); + let new_idx: Vec<_> = datasets.into_iter().collect(); + let new_color = Colors::update(other_db.clone(), None, new_idx.as_slice()).unwrap(); + + (&mut color_bytes[..]) + .write_u64::(new_color) + .expect("error writing bytes"); + other_db + .put_cf(&cf_hashes, &key[..], &color_bytes[..]) + .unwrap(); + } + info!("finished converting colors"); + + info!("copying sigs to output"); + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + for (key, value) in iter { + other_db.put_cf(&cf_sigs, &key[..], &value[..]).unwrap(); + } + info!("finished copying sigs to output"); + + Ok(()) + } else { + todo!() + } + */ + } +} + +fn cf_descriptors() -> Vec { + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + cfopts.set_merge_operator_associative("datasets operator", merge_datasets); + cfopts.set_min_write_buffer_number_to_merge(10); + + // Updated default from + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options + cfopts.set_level_compaction_dynamic_level_bytes(true); + + let cf_hashes = ColumnFamilyDescriptor::new(HASHES, cfopts); + + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + // Updated default + cfopts.set_level_compaction_dynamic_level_bytes(true); + //cfopts.set_merge_operator_associative("colors operator", merge_colors); + + let cf_sigs = ColumnFamilyDescriptor::new(SIGS, cfopts); + + vec![cf_hashes, cf_sigs] +} diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs new file mode 100644 index 0000000000..c989ac87b3 --- /dev/null +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -0,0 +1,1118 @@ +use std::collections::{HashMap, HashSet}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use getset::{CopyGetters, Getters, Setters}; +use log::{debug, info}; +use nohash_hasher::BuildNoHashHasher; +use serde::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +use crate::encodings::{Color, Colors, Idx}; +use crate::index::{Index, Selection, SigStore}; +use crate::manifest::Manifest; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{KmerMinHash, MinHashOps}; +use crate::sketch::Sketch; +use crate::storage::{Storage, ZipStorage}; +use crate::Error; +use crate::HashIntoType; + 
+type SigCounter = counter::Counter<u64>;
+
+#[derive(Serialize, Deserialize)]
+struct HashToColor(HashMap<HashIntoType, Color, BuildNoHashHasher<HashIntoType>>);
+
+impl HashToColor {
+    fn new() -> Self {
+        HashToColor(HashMap::<
+            HashIntoType,
+            Color,
+            BuildNoHashHasher<HashIntoType>,
+        >::with_hasher(BuildNoHashHasher::default()))
+    }
+
+    fn get(&self, hash: &HashIntoType) -> Option<&Color> {
+        self.0.get(hash)
+    }
+
+    fn retain(&mut self, hashes: &HashSet<HashIntoType>) {
+        self.0.retain(|hash, _| hashes.contains(hash))
+    }
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec<u64>) {
+        let mut color = None;
+
+        matched_hashes.into_iter().for_each(|hash| {
+            color = Some(colors.update(color, &[dataset_id as Idx]).unwrap());
+            self.0.insert(hash, color.unwrap());
+        });
+    }
+
+    fn reduce_hashes_colors(
+        a: (HashToColor, Colors),
+        b: (HashToColor, Colors),
+    ) -> (HashToColor, Colors) {
+        let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) =
+            if a.0.len() > b.0.len() {
+                (b, a)
+            } else {
+                (a, b)
+            };
+
+        small_hashes.0.into_iter().for_each(|(hash, color)| {
+            large_hashes
+                .0
+                .entry(hash)
+                .and_modify(|entry| {
+                    // Hash is already present.
+                    // Update the current color by adding the indices from
+                    // small_colors.
+                    let ids = small_colors.indices(&color);
+                    let new_color = large_colors.update(Some(*entry), ids).unwrap();
+                    *entry = new_color;
+                })
+                .or_insert_with(|| {
+                    // In this case, the hash was not present yet.
+                    // we need to create the same color from small_colors
+                    // into large_colors.
+                    let ids = small_colors.indices(&color);
+                    let new_color = large_colors.update(None, ids).unwrap();
+                    assert_eq!(new_color, color);
+                    new_color
+                });
+        });
+
+        (large_hashes, large_colors)
+    }
+}
+
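+// How reduce_hashes_colors combines two partial indexes: the smaller map is
+// drained into the larger one, so the merge cost tracks the smaller side.
+// A hypothetical trace, assuming colors c1 = {0} and c2 = {1}:
+//
+//     small: { h1 -> c2 }          large: { h1 -> c1, h2 -> c1 }
+//     merge: h1 is already present, so its color is updated to a new
+//            color c3 = {0, 1}, giving { h1 -> c3, h2 -> c1 }.
+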
+// Use rkyv for serialization?
+// https://davidkoloski.me/rkyv/
+#[derive(Serialize, Deserialize)]
+pub struct RevIndex {
+    linear: LinearRevIndex,
+    hash_to_color: HashToColor,
+    colors: Colors,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct LinearRevIndex {
+    sig_files: Manifest,
+
+    #[serde(skip)]
+    ref_sigs: Option<Vec<SigStore>>,
+
+    template: Sketch,
+
+    #[serde(skip)]
+    storage: Option<Arc<ZipStorage>>,
+}
+
+impl LinearRevIndex {
+    pub fn new(
+        sig_files: Option<Manifest>,
+        template: &Sketch,
+        keep_sigs: bool,
+        ref_sigs: Option<Vec<Signature>>,
+        storage: Option<ZipStorage>,
+    ) -> Self {
+        if ref_sigs.is_none() && sig_files.is_none() {
+            todo!("throw error, at least one of sig_files and ref_sigs needs to be set");
+        }
+
+        let ref_sigs = if let Some(ref_sigs) = ref_sigs {
+            Some(ref_sigs.into_iter().map(|m| m.into()).collect())
+        } else if keep_sigs {
+            let search_sigs: Vec<_> = sig_files
+                .as_ref()
+                .unwrap()
+                .internal_locations()
+                .map(PathBuf::from)
+                .collect();
+
+            #[cfg(feature = "parallel")]
+            let sigs_iter = search_sigs.par_iter();
+
+            #[cfg(not(feature = "parallel"))]
+            let sigs_iter = search_sigs.iter();
+
+            Some(
+                sigs_iter
+                    .map(|ref_path| {
+                        if let Some(storage) = &storage {
+                            let sig_data = storage
+                                .load(ref_path.to_str().unwrap_or_else(|| {
+                                    panic!("error converting path {:?}", ref_path)
+                                }))
+                                .unwrap_or_else(|_| panic!("error loading {:?}", ref_path));
+                            Signature::from_reader(sig_data.as_slice())
+                                .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path))
+                                .swap_remove(0)
+                                .into()
+                        } else {
+                            Signature::from_path(ref_path)
+                                .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path))
+                                .swap_remove(0)
+                                .into()
+                        }
+                    })
+                    .collect(),
+            )
+        } else {
+            None
+        };
+
+        let storage = storage.map(Arc::new);
+
+        let sig_files = sig_files.unwrap_or_else(|| {
+            todo!("generate manifest for ref_sigs");
+        });
+
+        LinearRevIndex {
+            sig_files,
+            template: template.clone(),
+            ref_sigs,
+            storage,
+        }
+    }
+
+    fn index(
+        self,
+        threshold: usize,
+        merged_query: Option<KmerMinHash>,
+        queries: Option<&[KmerMinHash]>,
+    ) -> RevIndex {
+        let processed_sigs = AtomicUsize::new(0);
+
+        let search_sigs: Vec<_> = self
+            .sig_files
+            .internal_locations()
+            .map(PathBuf::from)
+            .collect();
+
+        #[cfg(feature = "parallel")]
+        let sig_iter = search_sigs.par_iter();
+
+        #[cfg(not(feature = "parallel"))]
+        let sig_iter = search_sigs.iter();
+
+        let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| {
+            let i = processed_sigs.fetch_add(1, Ordering::SeqCst);
+            if i % 1000 == 0 {
+                info!("Processed {} reference sigs", i);
+            }
+
+            let search_sig = if let Some(storage) = &self.storage {
+                let sig_data = storage
+                    .load(
+                        filename
+                            .to_str()
+                            .unwrap_or_else(|| panic!("error converting path {:?}", filename)),
+                    )
+                    .unwrap_or_else(|_| panic!("error loading {:?}", filename));
+
+                Signature::from_reader(sig_data.as_slice())
+            } else {
+                Signature::from_path(filename)
+            }
+            .unwrap_or_else(|_| panic!("Error processing {:?}", filename))
+            .swap_remove(0);
+
+            RevIndex::map_hashes_colors(
+                dataset_id,
+                &search_sig,
+                queries,
+                &merged_query,
+                threshold,
+                &self.template,
+            )
+        });
+
+        #[cfg(feature = "parallel")]
+        let (hash_to_color, colors) = filtered_sigs.reduce(
+            || (HashToColor::new(), Colors::default()),
+            HashToColor::reduce_hashes_colors,
+        );
+
+        #[cfg(not(feature = "parallel"))]
+        let (hash_to_color, colors) = filtered_sigs.fold(
+            (HashToColor::new(), Colors::default()),
+            HashToColor::reduce_hashes_colors,
+        );
+
+        RevIndex {
+            hash_to_color,
+            colors,
+            linear: self,
+        }
+    }
+
+    pub fn location(&self) -> Option<String> {
+        if let Some(storage) = &self.storage {
+            storage.path()
+ } else { + None + } + } + + pub fn storage(&self) -> Option> { + self.storage.clone() + } + + pub fn select(mut self, selection: &Selection) -> Result { + let manifest = self.sig_files.select_to_manifest(selection)?; + self.sig_files = manifest; + + Ok(self) + /* + # if we have a manifest, run 'select' on the manifest. + manifest = self.manifest + traverse_yield_all = self.traverse_yield_all + + if manifest is not None: + manifest = manifest.select_to_manifest(**kwargs) + return ZipFileLinearIndex(self.storage, + selection_dict=None, + traverse_yield_all=traverse_yield_all, + manifest=manifest, + use_manifest=True) + else: + # no manifest? just pass along all the selection kwargs to + # the new ZipFileLinearIndex. + + assert manifest is None + if self.selection_dict: + # combine selects... + d = dict(self.selection_dict) + for k, v in kwargs.items(): + if k in d: + if d[k] is not None and d[k] != v: + raise ValueError(f"incompatible select on '{k}'") + d[k] = v + kwargs = d + + return ZipFileLinearIndex(self.storage, + selection_dict=kwargs, + traverse_yield_all=traverse_yield_all, + manifest=None, + use_manifest=False) + */ + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + let processed_sigs = AtomicUsize::new(0); + + // TODO: Some(ref_sigs) case + + let search_sigs: Vec<_> = self + .sig_files + .internal_locations() + .map(PathBuf::from) + .collect(); + + #[cfg(feature = "parallel")] + let sig_iter = search_sigs.par_iter(); + + #[cfg(not(feature = "parallel"))] + let sig_iter = search_sigs.iter(); + + let counters = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + let search_sig = if let Some(storage) = &self.storage { + let sig_data = storage + .load( + filename + .to_str() + .unwrap_or_else(|| panic!("error converting path {:?}", filename)), + ) + .unwrap_or_else(|_| panic!("error loading {:?}", filename)); + + Signature::from_reader(sig_data.as_slice()) + } else { + Signature::from_path(filename) + } + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(&self.template) { + search_mh = Some(mh); + }; + let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); + + let (large_mh, small_mh) = if query.size() > search_mh.size() { + (query, search_mh) + } else { + (search_mh, query) + }; + + let (size, _) = small_mh + .intersection_size(large_mh) + .unwrap_or_else(|_| panic!("error computing intersection for {:?}", filename)); + + if size == 0 { + None + } else { + let mut counter: SigCounter = Default::default(); + counter[&(dataset_id as u64)] += size as usize; + Some(counter) + } + }); + + let reduce_counters = |mut a: SigCounter, b: SigCounter| { + a.extend(&b); + a + }; + + #[cfg(feature = "parallel")] + let counter = counters.reduce(SigCounter::new, reduce_counters); + + #[cfg(not(feature = "parallel"))] + let counter = counters.fold(SigCounter::new(), reduce_counters); + + counter + } + + pub fn search( + &self, + counter: SigCounter, + similarity: bool, + threshold: usize, + ) -> Result, Box> { + let mut matches = vec![]; + if similarity { + unimplemented!("TODO: threshold correction") + } + + for (dataset_id, size) in counter.most_common() { + if size >= threshold { + matches.push( + self.sig_files[dataset_id as usize] + .internal_location() + .to_str() + .unwrap() + .into(), + ); 
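+                // most_common() yields counts in decreasing order, so the
+                // first match below the threshold (the `else` below) means
+                // no later candidate can qualify either.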
+ } else { + break; + }; + } + Ok(matches) + } + + fn gather_round( + &self, + dataset_id: u64, + match_size: usize, + query: &KmerMinHash, + round: usize, + ) -> Result { + let match_path = if self.sig_files.is_empty() { + PathBuf::new() + } else { + self.sig_files[dataset_id as usize].internal_location() + }; + let match_sig = self.sig_for_dataset(dataset_id as usize)?; + let result = self.stats_for_match(&match_sig, query, match_size, match_path, round)?; + Ok(result) + } + + fn sig_for_dataset(&self, dataset_id: usize) -> Result { + let match_path = if self.sig_files.is_empty() { + PathBuf::new() + } else { + self.sig_files[dataset_id].internal_location() + }; + + let match_sig = if let Some(refsigs) = &self.ref_sigs { + refsigs[dataset_id].clone() + } else { + let mut sig = if let Some(storage) = &self.storage { + let sig_data = storage + .load( + match_path + .to_str() + .unwrap_or_else(|| panic!("error converting path {:?}", match_path)), + ) + .unwrap_or_else(|_| panic!("error loading {:?}", match_path)); + Signature::from_reader(sig_data.as_slice())? + } else { + Signature::from_path(&match_path)? + }; + // TODO: remove swap_remove + sig.swap_remove(0).into() + }; + Ok(match_sig) + } + + fn stats_for_match( + &self, + match_sig: &Signature, + query: &KmerMinHash, + match_size: usize, + match_path: PathBuf, + gather_result_rank: usize, + ) -> Result { + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let filename = match_path.to_str().unwrap().into(); + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize * match_size; + + let (intersect_orig, _) = match_mh.intersection_size(query)?; + let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; + + let f_unique_to_query = intersect_orig as f64 / query.size() as f64; + let match_ = match_sig.clone(); + + // TODO: all of these + let f_unique_weighted = 0.; + let average_abund = 0; + let median_abund = 0; + let std_abund = 0; + let md5 = "".into(); + let f_match_orig = 0.; + let remaining_bp = 0; + + Ok(GatherResult { + intersect_bp, + f_orig_query, + f_match, + f_unique_to_query, + f_unique_weighted, + average_abund, + median_abund, + std_abund, + filename, + name, + md5, + match_, + f_match_orig, + unique_intersect_bp, + gather_result_rank, + remaining_bp, + }) + } + + pub fn gather( + &self, + mut counter: SigCounter, + threshold: usize, + query: &KmerMinHash, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + + while match_size > threshold && !counter.is_empty() { + let (dataset_id, size) = counter.most_common()[0]; + if threshold == 0 && size == 0 { + break; + } + + match_size = if size >= threshold { + size + } else { + break; + }; + + let result = self.gather_round(dataset_id, match_size, query, matches.len())?; + + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: maybe par_iter? 
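+        // A hypothetical round, with three datasets and threshold 3:
+        //     counter = {0: 10, 1: 7, 2: 4} -> dataset 0 is the match;
+        //     if the query overlaps the remaining datasets by {1: 5, 2: 4},
+        //     the counter becomes {1: 2, 2: 0} and dataset 0 is removed,
+        //     so the next round stops: nothing reaches the threshold.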
+ let mut to_remove: HashSet = Default::default(); + to_remove.insert(dataset_id); + + for (dataset, value) in counter.iter_mut() { + let dataset_sig = self.sig_for_dataset(*dataset as usize)?; + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = dataset_sig.select_sketch(&self.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + let (intersection, _) = query.intersection_size(match_mh)?; + if intersection as usize > *value { + to_remove.insert(*dataset); + } else { + *value -= intersection as usize; + }; + } + to_remove.iter().for_each(|dataset_id| { + counter.remove(dataset_id); + }); + matches.push(result); + } + Ok(matches) + } + + pub fn manifest(&self) -> Manifest { + self.sig_files.clone() + } + + pub fn set_manifest(&mut self, new_manifest: Manifest) -> Result<(), Error> { + self.sig_files = new_manifest; + Ok(()) + } + + pub fn signatures_iter(&self) -> impl Iterator + '_ { + if let Some(_sigs) = &self.ref_sigs { + //sigs.iter().cloned() + todo!("this works, but need to match return types") + } else { + // FIXME temp solution, must find better one! + (0..self.sig_files.len()) + .map(move |dataset_id| self.sig_for_dataset(dataset_id).expect("error loading sig")) + } + } +} + +impl<'a> Index<'a> for LinearRevIndex { + type Item = SigStore; + + fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { + unimplemented!() + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn len(&self) -> usize { + if let Some(refs) = &self.ref_sigs { + refs.len() + } else { + self.sig_files.len() + } + } + + fn signatures(&self) -> Vec { + if let Some(ref sigs) = self.ref_sigs { + sigs.to_vec() + } else { + unimplemented!() + } + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + unimplemented!() + } +} + +impl RevIndex { + pub fn load>( + index_path: P, + queries: Option<&[KmerMinHash]>, + ) -> Result> { + let (rdr, _) = niffler::from_path(index_path)?; + let revindex = if let Some(qs) = queries { + // TODO: avoid loading full revindex if query != None + /* + struct PartialRevIndex { + hashes_to_keep: Option>, + marker: PhantomData T>, + } + + impl PartialRevIndex { + pub fn new(hashes_to_keep: HashSet) -> Self { + PartialRevIndex { + hashes_to_keep: Some(hashes_to_keep), + marker: PhantomData, + } + } + } + */ + + let mut hashes: HashSet = HashSet::new(); + for q in qs { + hashes.extend(q.iter_mins()); + } + + //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); + + let mut revindex: RevIndex = serde_json::from_reader(rdr)?; + revindex.hash_to_color.retain(&hashes); + revindex + } else { + // Load the full revindex + serde_json::from_reader(rdr)? 
+ }; + + Ok(revindex) + } + + pub fn new( + search_sigs: &[PathBuf], + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + keep_sigs: bool, + ) -> RevIndex { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + let linear = LinearRevIndex::new(Some(search_sigs.into()), template, keep_sigs, None, None); + linear.index(threshold, merged_query, queries) + } + + pub fn from_zipstorage( + storage: ZipStorage, + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + keep_sigs: bool, + ) -> Result { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + // Load manifest from zipstorage + let manifest = Manifest::from_reader(storage.load("SOURMASH-MANIFEST.csv")?.as_slice())?; + let search_sigs: Vec<_> = manifest.internal_locations().map(PathBuf::from).collect(); + + let linear = LinearRevIndex::new( + Some(search_sigs.as_slice().into()), + template, + keep_sigs, + None, + Some(storage), + ); + + Ok(linear.index(threshold, merged_query, queries)) + } + + fn merge_queries(qs: &[KmerMinHash], threshold: usize) -> Option { + if threshold == 0 { + let mut merged = qs[0].clone(); + for query in &qs[1..] { + merged.merge(query).unwrap(); + } + Some(merged) + } else { + None + } + } + + pub fn new_with_sigs( + search_sigs: Vec, + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + ) -> RevIndex { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + let linear = LinearRevIndex::new( + Default::default(), + template, + false, + search_sigs.into(), + None, + ); + + linear.index(threshold, merged_query, queries) + } + + fn map_hashes_colors( + dataset_id: usize, + search_sig: &Signature, + queries: Option<&[KmerMinHash]>, + merged_query: &Option, + threshold: usize, + template: &Sketch, + ) -> Option<(HashToColor, Colors)> { + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh); + } + + let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); + let mut hash_to_color = HashToColor::new(); + let mut colors = Colors::default(); + + if let Some(qs) = queries { + if let Some(ref merged) = merged_query { + let (matched_hashes, intersection) = merged.intersection(search_mh).unwrap(); + if !matched_hashes.is_empty() || intersection > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); + } + } else { + for query in qs { + let (matched_hashes, intersection) = query.intersection(search_mh).unwrap(); + if !matched_hashes.is_empty() || intersection > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); + } + } + } + } else { + let matched = search_mh.mins(); + let size = matched.len() as u64; + if !matched.is_empty() || size > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched); + } + }; + + if hash_to_color.is_empty() { + None + } else { + Some((hash_to_color, colors)) + } + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + query + .iter_mins() + .filter_map(|hash| self.hash_to_color.get(hash)) + .flat_map(|color| self.colors.indices(color)) + .cloned() + .collect() + } + + pub fn search( + &self, + counter: SigCounter, + similarity: bool, + threshold: 
usize, + ) -> Result, Box> { + self.linear.search(counter, similarity, threshold) + } + + pub fn gather( + &self, + mut counter: SigCounter, + threshold: usize, + query: &KmerMinHash, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + + while match_size > threshold && !counter.is_empty() { + let (dataset_id, size) = counter.most_common()[0]; + match_size = if size >= threshold { size } else { break }; + let result = self + .linear + .gather_round(dataset_id, match_size, query, matches.len())?; + if let Some(Sketch::MinHash(match_mh)) = + result.match_.select_sketch(&self.linear.template) + { + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + for hash in match_mh.iter_mins() { + if let Some(color) = self.hash_to_color.get(hash) { + counter.subtract(self.colors.indices(color).cloned()); + } + } + counter.remove(&dataset_id); + matches.push(result); + } else { + unimplemented!() + } + } + Ok(matches) + } + + pub fn template(&self) -> Sketch { + self.linear.template.clone() + } + + // TODO: mh should be a sketch, or even a sig... + pub(crate) fn find_signatures( + &self, + mh: &KmerMinHash, + threshold: f64, + containment: bool, + _ignore_scaled: bool, + ) -> Result, Error> { + /* + let template_mh = None; + if let Sketch::MinHash(mh) = self.template { + template_mh = Some(mh); + }; + // TODO: throw error + let template_mh = template_mh.unwrap(); + + let tmp_mh; + let mh = if template_mh.scaled() > mh.scaled() { + // TODO: proper error here + tmp_mh = mh.downsample_scaled(self.scaled)?; + &tmp_mh + } else { + mh + }; + + if self.scaled < mh.scaled() && !ignore_scaled { + return Err(LcaDBError::ScaledMismatchError { + db: self.scaled, + query: mh.scaled(), + } + .into()); + } + */ + + // TODO: proper threshold calculation + let threshold: usize = (threshold * (mh.size() as f64)) as _; + + let counter = self.counter_for_query(mh); + + debug!( + "number of matching signatures for hashes: {}", + counter.len() + ); + + let mut results = vec![]; + for (dataset_id, size) in counter.most_common() { + let match_size = if size >= threshold { size } else { break }; + + let match_path = if self.linear.sig_files.is_empty() { + PathBuf::new() + } else { + self.linear.sig_files[dataset_id as usize].internal_location() + }; + + let ref_match; + let match_sig = if let Some(refsigs) = &self.linear.ref_sigs { + &refsigs[dataset_id as usize] + } else { + let mut sig = if let Some(storage) = &self.linear.storage { + let sig_data = + storage + .load(match_path.to_str().unwrap_or_else(|| { + panic!("error converting path {:?}", match_path) + })) + .unwrap_or_else(|_| panic!("error loading {:?}", match_path)); + Signature::from_reader(sig_data.as_slice())? + } else { + Signature::from_path(&match_path)? 
+ }; + // TODO: remove swap_remove + ref_match = sig.swap_remove(0); + &ref_match + }; + + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.linear.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.unwrap(); + + if size >= threshold { + let score = if containment { + size as f64 / mh.size() as f64 + } else { + size as f64 / (mh.size() + match_size - size) as f64 + }; + let filename = match_path.to_str().unwrap().into(); + let mut sig = match_sig.clone(); + sig.reset_sketches(); + sig.push(Sketch::MinHash(match_mh.clone())); + results.push((score, sig, filename)); + } else { + break; + }; + } + Ok(results) + } +} + +#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] +pub struct GatherResult { + #[getset(get_copy = "pub")] + intersect_bp: usize, + + #[getset(get_copy = "pub")] + f_orig_query: f64, + + #[getset(get_copy = "pub")] + f_match: f64, + + f_unique_to_query: f64, + f_unique_weighted: f64, + average_abund: usize, + median_abund: usize, + std_abund: usize, + + #[getset(get = "pub")] + filename: String, + + #[getset(get = "pub")] + name: String, + + #[getset(get = "pub")] + md5: String, + match_: Signature, + f_match_orig: f64, + unique_intersect_bp: usize, + gather_result_rank: usize, + remaining_bp: usize, +} + +impl GatherResult { + pub fn get_match(&self) -> Signature { + self.match_.clone() + } +} + +impl<'a> Index<'a> for RevIndex { + type Item = Signature; + + fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { + unimplemented!() + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn len(&self) -> usize { + if let Some(refs) = &self.linear.ref_sigs { + refs.len() + } else { + self.linear.sig_files.len() + } + } + + fn signatures(&self) -> Vec { + if let Some(ref sigs) = self.linear.ref_sigs { + sigs.iter().map(|s| s.clone().into()).collect() + } else { + unimplemented!() + } + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + unimplemented!() + } +} + +#[cfg(test)] +mod test { + use super::*; + + use crate::sketch::minhash::max_hash_for_scaled; + + #[test] + fn revindex_new() { + let max_hash = max_hash_for_scaled(10000); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(31) + .max_hash(max_hash) + .build(), + ); + let search_sigs = [ + "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), + ]; + let index = RevIndex::new(&search_sigs, &template, 0, None, false); + assert_eq!(index.colors.len(), 3); + } + + #[test] + fn revindex_many() { + let max_hash = max_hash_for_scaled(10000); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(31) + .max_hash(max_hash) + .build(), + ); + let search_sigs = [ + "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), + ]; + + let index = RevIndex::new(&search_sigs, &template, 0, None, false); + /* + dbg!(&index.colors.colors); + 0: 86 + 1: 132 + 2: 91 + (0, 1): 53 + (0, 2): 90 + (1, 2): 26 + (0, 1, 2): 261 + union: 739 + */ + //assert_eq!(index.colors.len(), 3); + assert_eq!(index.colors.len(), 7); + } + + #[test] + fn revindex_from_zipstorage() { + let max_hash 
= max_hash_for_scaled(100);
+        let template = Sketch::MinHash(
+            KmerMinHash::builder()
+                .num(0u32)
+                .ksize(57)
+                .hash_function(crate::encodings::HashFunctions::murmur64_protein)
+                .max_hash(max_hash)
+                .build(),
+        );
+        let storage = ZipStorage::from_file("../../tests/test-data/prot/protein.zip")
+            .expect("error loading zipfile");
+        let index = RevIndex::from_zipstorage(storage, &template, 0, None, false)
+            .expect("error building from zipstorage");
+
+        assert_eq!(index.colors.len(), 3);
+
+        let query_sig = Signature::from_path(
+            "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
+        )
+        .expect("Error processing query")
+        .swap_remove(0);
+        let mut query_mh = None;
+        if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) {
+            query_mh = Some(mh);
+        }
+        let query_mh = query_mh.expect("Couldn't find a compatible MinHash");
+
+        let counter_rev = index.counter_for_query(query_mh);
+        let counter_lin = index.linear.counter_for_query(query_mh);
+
+        let results_rev = index.search(counter_rev, false, 0).unwrap();
+        let results_linear = index.linear.search(counter_lin, false, 0).unwrap();
+        assert_eq!(results_rev, results_linear);
+
+        let counter_rev = index.counter_for_query(query_mh);
+        let counter_lin = index.linear.counter_for_query(query_mh);
+
+        let results_rev = index.gather(counter_rev, 0, query_mh).unwrap();
+        let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap();
+        assert_eq!(results_rev.len(), 1);
+        assert_eq!(results_rev, results_linear);
+    }
+}
diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs
new file mode 100644
index 0000000000..f917c71836
--- /dev/null
+++ b/src/core/src/index/revindex/mod.rs
@@ -0,0 +1,509 @@
+pub mod disk_revindex;
+pub mod mem_revindex;
+
+use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use byteorder::{LittleEndian, WriteBytesExt};
+use rkyv::{Archive, Deserialize, Serialize};
+use roaring::RoaringTreemap;
+
+use crate::index::revindex::mem_revindex::GatherResult;
+use crate::signature::{Signature, SigsTrait};
+use crate::sketch::minhash::{max_hash_for_scaled, FracMinHashOps, KmerMinHash, MinHashOps};
+use crate::sketch::Sketch;
+
+use crate::encodings::Color;
+
+//type DB = rocksdb::DBWithThreadMode<rocksdb::SingleThreaded>;
+type DB = rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>;
+
+type DatasetID = u64;
+type SigCounter = counter::Counter<DatasetID>;
+type QueryColors = HashMap<Color, Datasets>;
+type HashToColor = HashMap<u64, Color>;
+
+const HASHES: &str = "hashes";
+const SIGS: &str = "signatures";
+const COLORS: &str = "colors";
+
+pub enum RevIndex {
+    //Color(color_revindex::ColorRevIndex),
+    Plain(disk_revindex::RevIndex),
+}
+
+impl RevIndex {
+    /* TODO: need the repair_cf variant, not available in rocksdb-rust yet
+    pub fn repair(index: &Path, colors: bool) {
+        if colors {
+            color_revindex::repair(index);
+        } else {
+            disk_revindex::repair(index);
+        }
+    }
+    */
+
+    pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter {
+        match self {
+            //Self::Color(db) => db.counter_for_query(query),
+            Self::Plain(db) => db.counter_for_query(query),
+        }
+    }
+
+    pub fn matches_from_counter(
+        &self,
+        counter: SigCounter,
+        threshold: usize,
+    ) -> Vec<(String, usize)> {
+        match self {
+            //Self::Color(db) => todo!(), //db.matches_from_counter(counter, threshold),
+            Self::Plain(db) => db.matches_from_counter(counter, threshold),
+        }
+    }
+
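+    // The triple returned below drives one gather run: the counter ranks
+    // candidate datasets by shared hashes, while query_colors and
+    // hash_to_color carry the color bookkeeping that lets gather decrement
+    // counts between rounds without rescanning the index.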
+    pub fn prepare_gather_counters(
+        &self,
+        query: &KmerMinHash,
+    ) -> (SigCounter, QueryColors, HashToColor) {
+        match self {
+            //Self::Color(_db) => todo!(), //db.prepare_gather_counters(query),
+            Self::Plain(db) => db.prepare_gather_counters(query),
+        }
+    }
+
+    pub fn index(
+        &self,
+        index_sigs: Vec<PathBuf>,
+        template: &Sketch,
+        threshold: f64,
+        save_paths: bool,
+    ) {
+        match self {
+            //Self::Color(db) => db.index(index_sigs, template, threshold, save_paths),
+            Self::Plain(db) => db.index(index_sigs, template, threshold, save_paths),
+        }
+    }
+
+    pub fn update(
+        &self,
+        index_sigs: Vec<PathBuf>,
+        template: &Sketch,
+        threshold: f64,
+        save_paths: bool,
+    ) {
+        match self {
+            //Self::Color(db) => db.update(index_sigs, template, threshold, save_paths),
+            Self::Plain(db) => db.update(index_sigs, template, threshold, save_paths),
+        }
+    }
+
+    pub fn compact(&self) {
+        match self {
+            //Self::Color(db) => db.compact(),
+            Self::Plain(db) => db.compact(),
+        };
+    }
+
+    pub fn flush(&self) -> Result<(), Box<dyn std::error::Error>> {
+        match self {
+            //Self::Color(db) => db.flush(),
+            Self::Plain(db) => db.flush(),
+        }
+    }
+
+    pub fn convert(&self, output_db: RevIndex) -> Result<(), Box<dyn std::error::Error>> {
+        match self {
+            //Self::Color(_db) => todo!(),
+            Self::Plain(db) => db.convert(output_db),
+        }
+    }
+
+    pub fn check(&self, quick: bool) {
+        match self {
+            //Self::Color(db) => db.check(quick),
+            Self::Plain(db) => db.check(quick),
+        }
+    }
+
+    pub fn create(index: &Path, colors: bool) -> Self {
+        if colors {
+            todo!() //color_revindex::ColorRevIndex::create(index)
+        } else {
+            disk_revindex::RevIndex::create(index)
+        }
+    }
+
+    pub fn open(index: &Path, read_only: bool) -> Self {
+        let opts = Self::db_options();
+        let cfs = DB::list_cf(&opts, index).unwrap();
+
+        if cfs.into_iter().any(|c| c == COLORS) {
+            // TODO: ColorRevIndex can't be read-only for now,
+            // due to pending unmerged colors
+            todo!() //color_revindex::ColorRevIndex::open(index, false)
+        } else {
+            disk_revindex::RevIndex::open(index, read_only)
+        }
+    }
+
+    fn db_options() -> rocksdb::Options {
+        let mut opts = rocksdb::Options::default();
+        opts.set_max_open_files(500);
+
+        // Updated defaults from
+        // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options
+        opts.set_bytes_per_sync(1048576);
+        let mut block_opts = rocksdb::BlockBasedOptions::default();
+        block_opts.set_block_size(16 * 1024);
+        block_opts.set_cache_index_and_filter_blocks(true);
+        block_opts.set_pin_l0_filter_and_index_blocks_in_cache(true);
+        block_opts.set_format_version(5);
+        opts.set_block_based_table_factory(&block_opts);
+        // End of updated defaults
+
+        opts
+    }
+
+    pub fn gather(
+        &self,
+        counter: SigCounter,
+        query_colors: QueryColors,
+        hash_to_color: HashToColor,
+        threshold: usize,
+        query: &KmerMinHash,
+        template: &Sketch,
+    ) -> Result<Vec<GatherResult>, Box<dyn std::error::Error>> {
+        match self {
+            //Self::Color(_db) => todo!(),
+            Self::Plain(db) => db.gather(
+                counter,
+                query_colors,
+                hash_to_color,
+                threshold,
+                query,
+                template,
+            ),
+        }
+    }
+}
+
+#[derive(Debug, Default, PartialEq, Clone, Archive, Serialize, Deserialize)]
+enum SignatureData {
+    #[default]
+    Empty,
+    Internal(Signature),
+    External(String),
+}
+
+impl SignatureData {
+    fn from_slice(slice: &[u8]) -> Option<Self> {
+        // TODO: avoid the aligned vec allocation here
+        let mut vec = rkyv::AlignedVec::new();
+        vec.extend_from_slice(slice);
+        let archived_value = unsafe { rkyv::archived_root::<Self>(vec.as_ref()) };
+        let inner = archived_value.deserialize(&mut rkyv::Infallible).unwrap();
+        Some(inner)
+    }
+
+    fn as_bytes(&self) -> Option<Vec<u8>> {
+        let bytes = rkyv::to_bytes::<_, 256>(self).unwrap();
+        Some(bytes.into_vec())
+
+        /*
+        let mut serializer =
DefaultSerializer::default(); + let v = serializer.serialize_value(self).unwrap(); + debug_assert_eq!(v, 0); + let buf = serializer.into_serializer().into_inner(); + debug_assert!(Datasets::from_slice(&buf.to_vec()).is_some()); + Some(buf.to_vec()) + */ + } +} + +fn check_compatible_downsample(me: &KmerMinHash, other: &KmerMinHash) -> Result<(), crate::Error> { + /* + if self.num != other.num { + return Err(Error::MismatchNum { + n1: self.num, + n2: other.num, + } + .into()); + } + */ + use crate::Error; + + if me.ksize() != other.ksize() { + return Err(Error::MismatchKSizes); + } + if me.hash_function() != other.hash_function() { + // TODO: fix this error + return Err(Error::MismatchDNAProt); + } + if me.max_hash() < other.max_hash() { + return Err(Error::MismatchScaled); + } + if me.seed() != other.seed() { + return Err(Error::MismatchSeed); + } + Ok(()) +} + +pub fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option { + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh.clone()); + } else { + // try to find one that can be downsampled + if let Sketch::MinHash(template_mh) = template { + for sketch in search_sig.sketches() { + if let Sketch::MinHash(ref_mh) = sketch { + if check_compatible_downsample(&ref_mh, template_mh).is_ok() { + let max_hash = max_hash_for_scaled(template_mh.scaled()); + let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); + search_mh = Some(mh); + } + } + } + } + } + search_mh +} + +#[derive(Debug, Default, PartialEq, Clone)] +pub enum Datasets { + #[default] + Empty, + Unique(DatasetID), + Many(RoaringTreemap), +} + +impl Hash for Datasets { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + match self { + Self::Empty => todo!(), + Self::Unique(v) => v.hash(state), + Self::Many(_) => todo!(), + } + } +} + +impl IntoIterator for Datasets { + type Item = DatasetID; + type IntoIter = Box>; + + fn into_iter(self) -> Self::IntoIter { + match self { + Self::Empty => Box::new(std::iter::empty()), + Self::Unique(v) => Box::new(std::iter::once(v)), + Self::Many(v) => Box::new(v.into_iter()), + } + } +} + +impl Extend for Datasets { + fn extend(&mut self, iter: T) + where + T: IntoIterator, + { + if let Self::Many(v) = self { + v.extend(iter); + return; + } + + let mut it = iter.into_iter(); + while let Some(value) = it.next() { + match self { + Self::Empty => *self = Datasets::Unique(value), + Self::Unique(v) => { + if *v != value { + *self = Self::Many([*v, value].iter().copied().collect()); + } + } + Self::Many(v) => { + v.extend(it); + return; + } + } + } + } +} + +impl Datasets { + fn new(vals: &[DatasetID]) -> Self { + if vals.is_empty() { + Self::Empty + } else if vals.len() == 1 { + Self::Unique(vals[0]) + } else { + Self::Many(RoaringTreemap::from_sorted_iter(vals.iter().copied()).unwrap()) + } + } + + fn from_slice(slice: &[u8]) -> Option { + use byteorder::ReadBytesExt; + + if slice.len() == 8 { + // Unique + Some(Self::Unique( + (&slice[..]).read_u64::().unwrap(), + )) + } else if slice.len() == 1 { + // Empty + Some(Self::Empty) + } else { + // Many + Some(Self::Many(RoaringTreemap::deserialize_from(slice).unwrap())) + } + } + + fn as_bytes(&self) -> Option> { + use byteorder::WriteBytesExt; + + match self { + Self::Empty => Some(vec![42_u8]), + Self::Unique(v) => { + let mut buf = vec![0u8; 8]; + (&mut buf[..]) + .write_u64::(*v) + .expect("error writing bytes"); + Some(buf) + } + Self::Many(v) => { + let mut buf = vec![]; + v.serialize_into(&mut buf).unwrap(); + 
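+                // Note: the three encodings are distinguished by length on
+                // the read side (from_slice above): exactly 1 byte = Empty,
+                // exactly 8 bytes = Unique (little-endian u64), anything
+                // else is a serialized RoaringTreemap.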
Some(buf) + } + } + } + + fn union(&mut self, other: Datasets) { + match self { + Datasets::Empty => match other { + Datasets::Empty => (), + Datasets::Unique(_) | Datasets::Many(_) => *self = other, + }, + Datasets::Unique(v) => match other { + Datasets::Empty => (), + Datasets::Unique(o) => { + if *v != o { + *self = Datasets::Many([*v, o].iter().copied().collect()) + } + } + Datasets::Many(mut o) => { + o.extend([*v]); + *self = Datasets::Many(o); + } + }, + Datasets::Many(ref mut v) => v.extend(other), + } + } + + fn len(&self) -> usize { + match self { + Self::Empty => 0, + Self::Unique(_) => 1, + Self::Many(ref v) => v.len() as usize, + } + } + + /* + fn contains(&self, value: &DatasetID) -> bool { + match self { + Self::Empty => false, + Self::Unique(v) => v == value, + Self::Many(ref v) => v.contains(*value), + } + } + */ +} + +fn sig_save_to_db( + db: Arc, + mut search_sig: Signature, + search_mh: KmerMinHash, + size: u64, + threshold: f64, + save_paths: bool, + filename: &Path, + dataset_id: u64, +) { + // Save signature to DB + let sig = if search_mh.is_empty() || size < threshold as u64 { + SignatureData::Empty + } else if save_paths { + SignatureData::External(filename.to_str().unwrap().to_string()) + } else { + search_sig.reset_sketches(); + search_sig.push(Sketch::MinHash(search_mh)); + SignatureData::Internal(search_sig) + }; + + let sig_bytes = sig.as_bytes().unwrap(); + let cf_sigs = db.cf_handle(SIGS).unwrap(); + let mut hash_bytes = [0u8; 8]; + (&mut hash_bytes[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + db.put_cf(&cf_sigs, &hash_bytes[..], sig_bytes.as_slice()) + .expect("error saving sig"); +} + +fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { + use byteorder::ReadBytesExt; + use histogram::Histogram; + use log::info; + use numsep::{separate, Locale}; + + let cf = db.cf_handle(cf_name).unwrap(); + + let iter = db.iterator_cf(&cf, rocksdb::IteratorMode::Start); + let mut kcount = 0; + let mut vcount = 0; + let mut vcounts = Histogram::new(); + let mut datasets: Datasets = Default::default(); + + for result in iter { + let (key, value) = result.unwrap(); + let _k = (&key[..]).read_u64::().unwrap(); + kcount += key.len(); + + //println!("Saw {} {:?}", k, Datasets::from_slice(&value)); + vcount += value.len(); + + if !quick && deep_check { + let v = Datasets::from_slice(&value).expect("Error with value"); + vcounts.increment(v.len() as u64).unwrap(); + datasets.union(v); + } + //println!("Saw {} {:?}", k, value); + } + + info!("*** {} ***", cf_name); + use size::Size; + let ksize = Size::from_bytes(kcount); + let vsize = Size::from_bytes(vcount); + if !quick && cf_name == COLORS { + info!( + "total datasets: {}", + separate(datasets.len(), Locale::English) + ); + } + info!("total keys: {}", separate(kcount / 8, Locale::English)); + + info!("k: {}", ksize.to_string()); + info!("v: {}", vsize.to_string()); + + if !quick && kcount > 0 && deep_check { + info!("max v: {}", vcounts.maximum().unwrap()); + info!("mean v: {}", vcounts.mean().unwrap()); + info!("stddev: {}", vcounts.stddev().unwrap()); + info!("median v: {}", vcounts.percentile(50.0).unwrap()); + info!("p25 v: {}", vcounts.percentile(25.0).unwrap()); + info!("p75 v: {}", vcounts.percentile(75.0).unwrap()); + } +} diff --git a/src/core/src/index/sbt/mhbt.rs b/src/core/src/index/sbt/mhbt.rs deleted file mode 100644 index 2d4ceb3fb8..0000000000 --- a/src/core/src/index/sbt/mhbt.rs +++ /dev/null @@ -1,361 +0,0 @@ -use std::collections::HashMap; -use std::io::Write; - 
-use crate::errors::ReadDataError; -use crate::index::sbt::{Factory, FromFactory, Node, SBT}; -use crate::prelude::*; -use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::storage::Storage; -use crate::Error; - -impl ToWriter for Nodegraph { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - self.save_to_writer(writer) - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, name: &str) -> Result, Error> { - match self.factory { - Factory::GraphFactory { args: (k, t, n) } => { - let n = Nodegraph::with_tables(t as usize, n as usize, k as usize); - - Ok(Node::builder() - .filename(name) - .name(name) - .metadata(HashMap::default()) - .storage(self.storage()) - .data(n) - .build()) - } - } - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, parent: &mut Node) -> Result<(), Error> { - // TODO: avoid copy here - let mut parent_data = parent.data()?.clone(); - - if let Sketch::MinHash(sig) = &self.signatures[0] { - for h in sig.mins() { - parent_data.count(h); - } - - let min_n_below = parent - .metadata - .entry("min_n_below".into()) - .or_insert(u64::max_value()); - - *min_n_below = u64::min(sig.size() as u64, *min_n_below); - if *min_n_below == 0 { - *min_n_below = 1 - } - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - - parent.data = parent_data.into(); - - Ok(()) - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.similarity(ong) - } - - fn containment(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.containment(ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? 
- unimplemented!() - } - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&Nodegraph, Error> { - if let Some(storage) = &self.storage { - Ok(self.data.get_or_init(|| { - let raw = storage.load(&self.filename).unwrap(); - Nodegraph::from_reader(&mut &raw[..]).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - - use assert_matches::assert_matches; - - use super::Factory; - - use crate::index::linear::LinearIndex; - use crate::index::sbt::scaffold; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::{Index, SigStore, MHBT}; - use crate::prelude::*; - - #[test] - fn save_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let mut sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - sbt.save_file(tmpfile.path(), None).unwrap(); - - tmpfile.seek(SeekFrom::Start(0)).unwrap(); - } - - #[test] - fn load_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let leaf = sigs[0].clone(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - let datasets = linear.signatures(); - println!("linear leaves {:?} {:?}", datasets.len(), datasets); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } 
- - #[test] - #[ignore] - fn roundtrip_sbt() -> Result<(), Box> { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename)?; - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename)?); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - )?; - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - let results = sbt.find(search_minhashes, &leaf, 0.5)?; - assert_eq!(results.len(), 1); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1)?; - assert_eq!(results.len(), 2); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - println!("sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut new_sbt: MHBT = MHBT::builder().storage(None).build(); - let datasets = sbt.signatures(); - for l in datasets { - new_sbt.insert(l)?; - } - - for (i, node) in &sbt.nodes { - assert_eq!(node.data().unwrap(), new_sbt.nodes[i].data().unwrap()); - } - - assert_eq!(new_sbt.signature_refs().len(), 7); - println!("new_sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("new_sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let results = new_sbt.find(search_minhashes, &leaf, 0.5)?; - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 1); - - let results = new_sbt.find(search_minhashes, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = new_sbt.find(search_minhashes_containment, &leaf, 0.5)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = new_sbt.find(search_minhashes_containment, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 4); - - Ok(()) - } - - #[test] - fn scaffold_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let new_sbt: MHBT = scaffold(sbt.leaves(), sbt.storage()); - - assert_eq!(new_sbt.signatures().len(), 7); - } - - #[test] - fn load_v4() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v4.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } - - #[test] - fn load_v5() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } -} diff --git a/src/core/src/index/sbt/mhmt.rs b/src/core/src/index/sbt/mhmt.rs deleted file mode 100644 index 5eeb8a09b3..0000000000 --- a/src/core/src/index/sbt/mhmt.rs +++ /dev/null @@ -1,227 
+0,0 @@ -use std::io::{Read, Write}; - -use mqf::MQF; - -use crate::Error; -use crate::index::sbt::{FromFactory, Node, Update, SBT}; -use crate::index::storage::{ReadData, ReadDataError, ToWriter}; -use crate::index::Comparable; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::Sketch; - -impl ToWriter for MQF { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new()?; - self.serialize(tmpfile.path()).unwrap(); // TODO: convert this to a proper error - - let mut buffer = Vec::new(); - tmpfile.read_to_end(&mut buffer)?; - writer.write_all(&buffer)?; - - Ok(()) - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&MQF, Error> { - if let Some(storage) = &self.storage { - Ok(self.data.get_or_create(|| { - let raw = storage.load(&self.filename).unwrap(); - - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - tmpfile.write_all(&raw[..]).unwrap(); - - MQF::deserialize(tmpfile.path()).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, _name: &str) -> Result, Error> { - unimplemented!() - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.similarity(&ong) - } - - fn containment(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.containment(&ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| dbg!(ng.count_key(**h % u64::pow(2, 26))) > 0) - //.filter(|h| dbg!(ng.count_key(**h)) > 0) - .count(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| ng.count_key(**h % u64::pow(2, 26)) > 0) - //.filter(|h| ng.count_key(**h) > 0) - .count(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? 
- unimplemented!() - } - } -} - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(test)] -mod test { - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - use std::rc::Rc; - use tempfile; - - use assert_matches::assert_matches; - use lazy_init::Lazy; - - use super::{scaffold, Factory}; - - use crate::index::linear::LinearIndex; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::storage::ReadData; - use crate::index::{Index, SigStore, MHBT}; - use crate::signature::Signature; - - #[cfg(not(target_arch = "wasm32"))] - #[test] - fn load_mhmt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5_mhmt.sbt.json"); - - let mut sbt = crate::index::MHMT::from_path(filename).expect("Loading error"); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); - let sig_data = sigs[0].clone(); - - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = SigStore::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - //assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - println!( - "linear leaves {:?} {:?}", - linear.datasets.len(), - linear.datasets - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - */ -} diff --git a/src/core/src/index/sbt/mod.rs b/src/core/src/index/sbt/mod.rs deleted file mode 100644 index 5245defe1f..0000000000 --- a/src/core/src/index/sbt/mod.rs +++ /dev/null @@ -1,878 +0,0 @@ -pub mod mhbt; - -/* FIXME: bring back after boomphf changes -pub mod ukhs; -*/ - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(not(target_arch = "wasm32"))] -pub mod mhmt; -*/ - -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::fmt::Debug; -use std::fs::File; -use std::hash::BuildHasherDefault; -use std::io::{BufReader, Read}; -use std::path::{Path, PathBuf}; - -use log::info; -use nohash_hasher::NoHashHasher; -use once_cell::sync::OnceCell; -use serde::{Deserialize, Serialize}; -use typed_builder::TypedBuilder; - -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; 
-use crate::prelude::*; -use crate::storage::{FSStorage, InnerStorage, StorageInfo}; -use crate::Error; - -#[derive(TypedBuilder)] -pub struct SBT { - #[builder(default = 2)] - d: u32, - - #[builder(default, setter(into))] - storage: Option, - - #[builder(default = Factory::GraphFactory { args: (1, 100000.0, 4) })] - factory: Factory, - - #[builder(default = HashMap::default())] - nodes: HashMap, - - #[builder(default = HashMap::default())] - leaves: HashMap>, -} - -const fn parent(pos: u64, d: u64) -> u64 { - (pos - 1) / d -} - -const fn child(parent: u64, pos: u64, d: u64) -> u64 { - d * parent + pos + 1 -} - -impl SBT -where - L: std::clone::Clone + Default, - N: Default, -{ - #[inline(always)] - fn parent(&self, pos: u64) -> Option { - if pos == 0 { - None - } else { - Some(parent(pos, u64::from(self.d))) - } - } - - #[inline(always)] - fn child(&self, parent: u64, pos: u64) -> u64 { - child(parent, pos, u64::from(self.d)) - } - - #[inline(always)] - fn children(&self, pos: u64) -> Vec { - (0..u64::from(self.d)).map(|c| self.child(pos, c)).collect() - } - - pub fn storage(&self) -> Option { - self.storage.clone() - } - - /* - fn fill_up(&mut self) -> Result<(), Error> { - let mut visited = HashSet::new(); - let mut queue: Vec<_> = self.leaves.keys().collect(); - - while !queue.is_empty() { - let pos = queue.pop().unwrap(); - - if !visited.contains(&pos) { - visited.insert(pos); - } - } - - Ok(()) - } - */ - - // combine -} - -impl SBT, T> -where - T: ToWriter + Clone, - U: ToWriter, - Node: ReadData, - SigStore: ReadData, -{ - fn parse_v4(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV4 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V4(sinfo)) - } - - fn parse_v5(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV5 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V5(sinfo)) - } - - pub fn from_reader(mut rdr: R, path: P) -> Result, T>, Error> - where - R: Read, - P: AsRef, - { - // TODO: I would love to do this, but I get an untagged enum error with - // SBTInfo... 
- //let sinfo: SBTInfo = serde_json::from_reader(rdr)?; - - let mut s = String::new(); - rdr.read_to_string(&mut s)?; - - let sinfo = - Self::parse_v5(&mut s.as_bytes()).or_else(|_| Self::parse_v4(&mut s.as_bytes()))?; - - // TODO: support other storages - let mut st: FSStorage = match sinfo { - SBTInfo::V4(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V5(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V6(ref sbt) => (&sbt.storage.args).into(), - }; - st.set_base(path.as_ref().to_str().unwrap()); - let storage = InnerStorage::new(st); - - let d = match sinfo { - SBTInfo::V4(ref sbt) => sbt.d, - SBTInfo::V5(ref sbt) => sbt.d, - SBTInfo::V6(ref sbt) => sbt.d, - }; - - let factory = match sinfo { - SBTInfo::V4(ref sbt) => sbt.factory.clone(), - SBTInfo::V5(ref sbt) => sbt.factory.clone(), - SBTInfo::V6(ref sbt) => sbt.factory.clone(), - }; - - let (nodes, leaves) = match sinfo { - SBTInfo::V6(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .signatures - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V5(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .leaves - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V4(sbt) => { - let nodes = sbt - .nodes - .iter() - .filter_map(|(n, x)| match x { - NodeInfoV4::Node(l) => Some(( - *n, - Node::builder() - .filename(l.filename.clone()) - .name(l.name.clone()) - .metadata(l.metadata.clone()) - .storage(Some(storage.clone())) - .build(), - )), - NodeInfoV4::Leaf(_) => None, - }) - .collect(); - - let leaves = sbt - .nodes - .into_iter() - .filter_map(|(n, x)| match x { - NodeInfoV4::Node(_) => None, - NodeInfoV4::Leaf(l) => Some(( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - )), - }) - .collect(); - - (nodes, leaves) - } - }; - - Ok(SBT { - d, - factory, - storage: Some(storage), - nodes, - leaves, - }) - } - - pub fn from_path>(path: P) -> Result, T>, Error> { - let file = File::open(&path)?; - let mut reader = BufReader::new(file); - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - // TODO: canonicalize doesn't work on wasm32-wasi - //basepath.canonicalize()?; - - let sbt = SBT::, T>::from_reader(&mut reader, basepath.parent().unwrap())?; - Ok(sbt) - } - - pub fn save_file>( - &mut self, - path: P, - storage: Option, - ) -> Result<(), Error> { - let ref_path = path.as_ref(); - let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); - if basename.ends_with(".sbt.json") { - basename = basename.replace(".sbt.json", ""); - } - let location = ref_path.parent().unwrap(); - - let storage = match storage { - Some(s) => s, - None => { - let subdir = 
format!(".sbt.{}", basename); - InnerStorage::new(FSStorage::new(location.to_str().unwrap(), &subdir)) - } - }; - - let args = storage.args(); - let storage_info = StorageInfo { - backend: "FSStorage".into(), - args, - }; - - let info: SBTInfoV5 = SBTInfoV5 { - d: self.d, - factory: self.factory.clone(), - storage: storage_info, - version: 5, - nodes: self - .nodes - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &U = (*l).data().expect("Couldn't load data"); - - // set storage to new one - l.storage = Some(storage.clone()); - - let filename = (*l).save(&l.filename).unwrap(); - let new_node = NodeInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - leaves: self - .leaves - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &T = (*l).data().unwrap(); - - // set storage to new one - l.storage = Some(storage.clone()); - - // TODO: this should be l.md5sum(), not l.filename - let filename = (*l).save(&l.filename).unwrap(); - let new_node = DatasetInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - }; - - let file = File::create(path)?; - serde_json::to_writer(file, &info)?; - - Ok(()) - } - - pub fn leaves(&self) -> Vec> { - self.leaves.values().cloned().collect() - } -} - -impl<'a, N, L> Index<'a> for SBT -where - N: Comparable + Comparable + Update + Debug + Default, - L: Comparable + Update + Clone + Debug + Default, - SBT: FromFactory, - SigStore: From + ReadData, -{ - type Item = L; - - fn find(&self, search_fn: F, sig: &L, threshold: f64) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - let mut matches = Vec::new(); - let mut visited = HashSet::new(); - let mut queue = vec![0u64]; - - while let Some(pos) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - if let Some(node) = self.nodes.get(&pos) { - if search_fn(&node, sig, threshold) { - for c in self.children(pos) { - queue.push(c); - } - } - } else if let Some(leaf) = self.leaves.get(&pos) { - let data = leaf.data().expect("Error reading data"); - if search_fn(data, sig, threshold) { - matches.push(data); - } - } - } - } - - Ok(matches) - } - - fn insert(&mut self, dataset: L) -> Result<(), Error> { - if self.leaves.is_empty() { - // in this case the tree is empty, - // just add the dataset to the first available leaf - self.leaves.entry(0).or_insert_with(|| dataset.into()); - return Ok(()); - } - - // we can unwrap here because the root node case - // only happens on an empty tree, and if we got - // to this point we have at least one leaf already. 
- // TODO: find position by similarity search - let pos = self.leaves.keys().max().unwrap() + 1; - let parent_pos = self.parent(pos).unwrap(); - let final_pos; - - if let Entry::Occupied(pnode) = self.leaves.entry(parent_pos) { - // Case 1: parent is a Leaf - // create a new internal node, add it to self.nodes[parent_pos] - - let (_, leaf) = pnode.remove_entry(); - - let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - - // for each children update the parent node - // TODO: write the update method - leaf.data.get().unwrap().update(&mut new_node)?; - dataset.update(&mut new_node)?; - - // node and parent are children of new internal node - let mut c_pos = self.children(parent_pos).into_iter().take(2); - let c1_pos = c_pos.next().unwrap(); - let c2_pos = c_pos.next().unwrap(); - - self.leaves.entry(c1_pos).or_insert(leaf); - self.leaves.entry(c2_pos).or_insert_with(|| dataset.into()); - final_pos = c2_pos; - - // add the new internal node to self.nodes[parent_pos) - // TODO check if it is really empty? - self.nodes.entry(parent_pos).or_insert(new_node); - } else { - // TODO: moved these two lines here to avoid borrow checker - // error E0502 in the Vacant case, but would love to avoid it! - let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - let c_pos = self.children(parent_pos)[0]; - - match self.nodes.entry(parent_pos) { - // Case 2: parent is a node and has an empty child spot available - // (if there isn't an empty spot, it was already covered by case 1) - Entry::Occupied(mut pnode) => { - dataset.update(pnode.get_mut())?; - self.leaves.entry(pos).or_insert_with(|| dataset.into()); - final_pos = pos; - } - - // Case 3: parent is None/empty - // this can happen with d != 2, need to create parent node - Entry::Vacant(pnode) => { - dataset.update(&mut new_node)?; - self.leaves.entry(c_pos).or_insert_with(|| dataset.into()); - final_pos = c_pos; - pnode.insert(new_node); - } - } - } - - let entry = &self.leaves[&final_pos]; - let data = entry.data.get().unwrap(); - - let mut parent_pos = parent_pos; - while let Some(ppos) = self.parent(parent_pos) { - if let Entry::Occupied(mut pnode) = self.nodes.entry(parent_pos) { - //TODO: use children for this node to update, instead of dragging - // dataset up to the root? It would be more generic, but this - // works for minhash, draff signatures and nodegraphs... 
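The surrounding update loop drags the inserted dataset up through every ancestor so each internal filter stays a superset of its subtree; the ancestor chain is just the parent arithmetic applied repeatedly, as in this sketch (positions illustrative):

    fn ancestors(mut pos: u64, d: u64) -> Vec<u64> {
        let mut out = Vec::new();
        while pos != 0 {
            pos = (pos - 1) / d;
            out.push(pos);
        }
        out
    }

    fn main() {
        // a leaf at position 6 of a binary tree updates node 2, then the root
        assert_eq!(ancestors(6, 2), vec![2, 0]);
    }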
- data.update(pnode.get_mut())?; - } - parent_pos = ppos; - } - - Ok(()) - } - - /* - fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { - self = scaffold(nodes, self.storage()); - Ok(()) - } - */ - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!(); - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - self.leaves - .values() - .map(|x| x.data().unwrap().clone()) - .collect() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - self.leaves.values().map(|x| x.data().unwrap()).collect() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.leaves.values() - } - */ -} - -/* -#[derive(TypedBuilder, Clone, Default, Serialize, Deserialize)] -pub struct Factory { - class: String, - args: Vec, -} -*/ - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "class")] -pub enum Factory { - GraphFactory { args: (u64, f64, u64) }, -} - -#[derive(TypedBuilder, Default, Clone)] -pub struct Node { - #[builder(setter(into))] - filename: String, - - #[builder(setter(into))] - name: String, - - metadata: HashMap, - - #[builder(default)] - storage: Option, - - #[builder(setter(into), default)] - data: OnceCell, -} - -impl Node -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) - } else { - // TODO throw error, data was not initialized - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl PartialEq for Node -where - T: PartialEq, - Node: ReadData, -{ - fn eq(&self, other: &Node) -> bool { - self.data().unwrap() == other.data().unwrap() - } -} - -impl SigStore -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) 
- } else { - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl std::fmt::Debug for Node -where - T: Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Node [name={}, filename={}, metadata: {:?}, data: {:?}]", - self.name, - self.filename, - self.metadata, - self.data.get().is_some() - ) - } -} - -#[derive(Serialize, Deserialize, Debug)] -struct NodeInfo { - filename: String, - name: String, - metadata: HashMap, -} - -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum NodeInfoV4 { - Node(NodeInfo), - Leaf(DatasetInfo), -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV4 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV5 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - leaves: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV6 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - signatures: HashMap, -} - -#[derive(Deserialize)] -#[serde(untagged)] -enum SBTInfo { - V6(SBTInfoV6), - V5(SBTInfoV5), - V4(SBTInfoV4), -} - -enum BinaryTree { - Empty, - Internal(Box>>>>), - Leaf(Box>>), -} - -struct TreeNode { - element: T, - left: BinaryTree, - right: BinaryTree, -} - -pub fn scaffold( - mut datasets: Vec>, - storage: Option, -) -> SBT, Signature> -where - N: Clone + Default, -{ - let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); - - let mut next_round = Vec::new(); - - // generate two bottom levels: - // - datasets - // - first level of internal nodes - info!("Start processing leaves"); - while let Some(next_leaf) = datasets.pop() { - let (simleaf_tree, in_common) = if datasets.is_empty() { - (BinaryTree::Empty, next_leaf.mins().into_iter().collect()) - } else { - let mut similar_leaf_pos = 0; - let mut current_max = 0; - for (pos, leaf) in datasets.iter().enumerate() { - let common = next_leaf.count_common(leaf); - if common > current_max { - current_max = common; - similar_leaf_pos = pos; - } - } - - let similar_leaf = datasets.remove(similar_leaf_pos); - - let in_common = next_leaf - .mins() - .into_iter() - .collect::>>>() - .union(&similar_leaf.mins().into_iter().collect()) - .cloned() - .collect(); - - let simleaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: similar_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - (simleaf_tree, in_common) - }; - - let leaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: next_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - - let tree = BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left: leaf_tree, - right: simleaf_tree, - })); - - next_round.push(tree); - - if next_round.len() % 100 == 0 { - info!("Processed {} leaves", next_round.len() * 2); - } - } - info!("Finished processing leaves"); - - // while we don't get to the root, generate intermediary levels - while next_round.len() != 1 { - next_round = BinaryTree::process_internal_level(next_round); - info!("Finished processing round {}", next_round.len()); - } - - // Convert from binary tree to nodes/leaves - let root = next_round.pop().unwrap(); - let mut visited = HashSet::new(); - let mut queue = vec![(0u64, root)]; - - while let Some((pos, cnode)) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - match cnode { - BinaryTree::Leaf(leaf) => { - leaves.insert(pos, leaf.element); - } - BinaryTree::Internal(mut 
node) => { - let left = std::mem::replace(&mut node.left, BinaryTree::Empty); - let right = std::mem::replace(&mut node.right, BinaryTree::Empty); - queue.push((2 * pos + 1, left)); - queue.push((2 * pos + 2, right)); - } - BinaryTree::Empty => (), - } - } - } - - SBT::builder() - .storage(storage) - .nodes(HashMap::default()) - .leaves(leaves) - .build() -} - -impl BinaryTree { - fn process_internal_level(mut current_round: Vec) -> Vec { - let mut next_round = Vec::with_capacity(current_round.len() + 1); - - while let Some(next_node) = current_round.pop() { - let similar_node = if current_round.is_empty() { - BinaryTree::Empty - } else { - let mut similar_node_pos = 0; - let mut current_max = 0; - for (pos, cmpe) in current_round.iter().enumerate() { - let common = BinaryTree::intersection_size(&next_node, cmpe); - if common > current_max { - current_max = common; - similar_node_pos = pos; - } - } - current_round.remove(similar_node_pos) - }; - - let tree = BinaryTree::new_tree(next_node, similar_node); - - next_round.push(tree); - } - next_round - } - - // Remove this when MSRV is >= 1.40 - #[allow(clippy::mem_replace_with_default)] - fn new_tree(mut left: BinaryTree, mut right: BinaryTree) -> BinaryTree { - let in_common = if let BinaryTree::Internal(ref mut el1) = left { - match right { - BinaryTree::Internal(ref mut el2) => { - let c1 = std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ); - let c2 = std::mem::replace( - &mut el2.element, - HashSet::>>::default(), - ); - c1.union(&c2).cloned().collect() - } - BinaryTree::Empty => std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ), - _ => panic!("Should not see a Leaf at this level"), - } - } else { - HashSet::>>::default() - }; - - BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left, - right, - })) - } - - fn intersection_size(n1: &BinaryTree, n2: &BinaryTree) -> usize { - if let BinaryTree::Internal(ref el1) = n1 { - if let BinaryTree::Internal(ref el2) = n2 { - return el1.element.intersection(&el2.element).count(); - } - }; - 0 - } -} - -/* -impl From> for SBT, Signature> -where - U: Default + Clone, -{ - fn from(other: LinearIndex) -> Self { - let storage = other.storage(); - scaffold(other.datasets, storage) - } -} -*/ diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index 66de82e6a0..5eebef3b0a 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -26,6 +26,8 @@ pub mod prelude; pub mod cmd; +pub mod manifest; +pub mod picklist; pub mod signature; pub mod sketch; pub mod storage; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs new file mode 100644 index 0000000000..ce740c638b --- /dev/null +++ b/src/core/src/manifest.rs @@ -0,0 +1,186 @@ +use std::convert::TryInto; +use std::io::Read; +use std::ops::Deref; +use std::path::PathBuf; + +use serde::de; +use serde::{Deserialize, Serialize}; + +use crate::encodings::HashFunctions; +use crate::index::Selection; +use crate::Error; + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct Record { + internal_location: String, + ksize: u32, + + #[serde(deserialize_with = "to_bool")] + with_abundance: bool, + + md5: String, + name: String, + moltype: String, + /* + md5short: String, + num: String, + scaled: String, + n_hashes: String, + filename: String, + */ +} + +fn to_bool<'de, D>(deserializer: D) -> Result +where + D: de::Deserializer<'de>, +{ + match String::deserialize(deserializer)? 
+ .to_ascii_lowercase() + .as_ref() + { + "0" | "false" => Ok(false), + "1" | "true" => Ok(true), + other => Err(de::Error::invalid_value( + de::Unexpected::Str(other), + &"0/1 or true/false are the only supported values", + )), + } +} + +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +pub struct Manifest { + records: Vec, +} + +impl Record { + pub fn internal_location(&self) -> PathBuf { + self.internal_location.clone().into() + } + + pub fn ksize(&self) -> u32 { + self.ksize + } + + pub fn with_abundance(&self) -> bool { + self.with_abundance + } + + pub fn md5(&self) -> &str { + self.md5.as_ref() + } + + pub fn name(&self) -> &str { + self.name.as_ref() + } + + pub fn moltype(&self) -> HashFunctions { + self.moltype.as_str().try_into().unwrap() + } +} + +impl Manifest { + pub fn from_reader(rdr: R) -> Result { + let mut records = vec![]; + + let mut rdr = csv::ReaderBuilder::new() + .comment(Some(b'#')) + .from_reader(rdr); + for result in rdr.deserialize() { + let record: Record = result?; + records.push(record); + } + Ok(Manifest { records }) + } + + pub fn internal_locations(&self) -> impl Iterator { + self.records.iter().map(|r| r.internal_location.as_str()) + } + + pub fn iter(&self) -> impl Iterator { + self.records.iter() + } + + pub fn select_to_manifest(&self, selection: &Selection) -> Result { + let rows = self.records.iter().filter(|row| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + row.ksize == ksize + } else { + valid + }; + valid = if let Some(abund) = selection.abund() { + valid && row.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && row.moltype() == moltype + } else { + valid + }; + valid + }); + + Ok(Manifest { + records: rows.cloned().collect(), + }) + + /* + matching_rows = self.rows + if ksize: + matching_rows = ( row for row in matching_rows + if row['ksize'] == ksize ) + if moltype: + matching_rows = ( row for row in matching_rows + if row['moltype'] == moltype ) + if scaled or containment: + if containment and not scaled: + raise ValueError("'containment' requires 'scaled' in Index.select'") + + matching_rows = ( row for row in matching_rows + if row['scaled'] and not row['num'] ) + if num: + matching_rows = ( row for row in matching_rows + if row['num'] and not row['scaled'] ) + + if abund: + # only need to concern ourselves if abundance is _required_ + matching_rows = ( row for row in matching_rows + if row['with_abundance'] ) + + if picklist: + matching_rows = ( row for row in matching_rows + if picklist.matches_manifest_row(row) ) + + # return only the internal filenames! 
+ for row in matching_rows: + yield row + */ + } +} + +impl From<&[PathBuf]> for Manifest { + fn from(v: &[PathBuf]) -> Self { + Manifest { + records: v + .iter() + .map(|p| Record { + internal_location: p.to_str().unwrap().into(), + ksize: 0, // FIXME + with_abundance: false, // FIXME + md5: "".into(), // FIXME + name: "".into(), // FIXME + moltype: "".into(), // FIXME + }) + .collect(), + } + } +} + +impl Deref for Manifest { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.records + } +} diff --git a/src/core/src/picklist.rs b/src/core/src/picklist.rs new file mode 100644 index 0000000000..ddb3183d14 --- /dev/null +++ b/src/core/src/picklist.rs @@ -0,0 +1,29 @@ +use getset::{CopyGetters, Getters, Setters}; +use typed_builder::TypedBuilder; + +#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone)] +pub struct Picklist { + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + coltype: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + pickfile: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + column_name: String, + + #[getset(get = "pub", set = "pub")] + #[builder] + pickstyle: PickStyle, +} + +#[derive(Clone, Default)] +#[repr(u32)] +pub enum PickStyle { + #[default] + Include = 1, + Exclude = 2, +} diff --git a/src/core/src/prelude.rs b/src/core/src/prelude.rs index ef7d4aa27b..eb265d42ee 100644 --- a/src/core/src/prelude.rs +++ b/src/core/src/prelude.rs @@ -5,6 +5,8 @@ use crate::Error; pub use crate::signature::Signature; pub use crate::storage::Storage; +pub use crate::sketch::minhash::{AbundMinHashOps, FracMinHashOps, MinHashOps}; + pub trait ToWriter { fn to_writer(&self, writer: &mut W) -> Result<(), Error> where diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index db2a85ea05..0fef51b166 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -2,6 +2,8 @@ //! //! A signature is a collection of sketches for a genomic dataset. +use core::iter::FusedIterator; + use std::fs::File; use std::io; use std::iter::Iterator; @@ -20,6 +22,8 @@ use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; +// TODO: this is the behavior expected from Sketch, but that name is already +// used. Sketchable? 
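A round-trip sketch for the new Manifest::from_reader, assuming the exact column set declared on Record above and the pub mod manifest export added to lib.rs; the md5, name, and location values are made up for illustration:

    use sourmash::manifest::Manifest;

    fn main() {
        let csv = "internal_location,ksize,with_abundance,md5,name,moltype\n\
                   sig1.sig,31,1,abc123,example,DNA\n";
        let manifest = Manifest::from_reader(csv.as_bytes()).unwrap();
        assert_eq!(manifest.len(), 1); // len() comes through the Deref to Vec<Record>
        let record = manifest.iter().next().unwrap();
        assert_eq!(record.ksize(), 31);
        assert!(record.with_abundance()); // "1" accepted by the to_bool helper
    }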
pub trait SigsTrait { fn size(&self) -> usize; fn to_vec(&self) -> Vec; @@ -28,6 +32,16 @@ pub trait SigsTrait { fn seed(&self) -> u64; fn hash_function(&self) -> HashFunctions; + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error>; + fn is_protein(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_protein + } + fn dayhoff(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_dayhoff + } + fn hp(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_hp + } fn add_hash(&mut self, hash: HashIntoType); @@ -117,6 +131,14 @@ impl SigsTrait for Sketch { } } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + match *self { + Sketch::MinHash(ref mut mh) => mh.set_hash_function(h), + Sketch::LargeMinHash(ref mut mh) => mh.set_hash_function(h), + Sketch::HyperLogLog(ref mut hll) => hll.set_hash_function(h), + } + } + fn add_hash(&mut self, hash: HashIntoType) { match *self { Sketch::MinHash(ref mut mh) => mh.add_hash(hash), @@ -395,6 +417,10 @@ impl Iterator for SeqToHashes { } #[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct Signature { #[serde(default = "default_class")] #[builder(default = default_class())] @@ -654,6 +680,92 @@ impl Signature { Ok(()) } + + pub fn iter_mut(&mut self) -> IterMut<'_> { + let length = self.signatures.len(); + IterMut { + iter: self.signatures.iter_mut(), + length, + } + } + + pub fn iter(&mut self) -> Iter<'_> { + let length = self.signatures.len(); + Iter { + iter: self.signatures.iter(), + length, + } + } +} + +pub struct IterMut<'a> { + iter: std::slice::IterMut<'a, Sketch>, + length: usize, +} + +impl<'a> IntoIterator for &'a mut Signature { + type Item = &'a mut Sketch; + type IntoIter = IterMut<'a>; + + fn into_iter(self) -> IterMut<'a> { + self.iter_mut() + } +} + +impl<'a> Iterator for IterMut<'a> { + type Item = &'a mut Sketch; + + fn next(&mut self) -> Option<&'a mut Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +pub struct Iter<'a> { + iter: std::slice::Iter<'a, Sketch>, + length: usize, +} + +impl<'a> Iterator for Iter<'a> { + type Item = &'a Sketch; + + fn next(&mut self) -> Option<&'a Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +impl FusedIterator for Iter<'_> {} + +impl ExactSizeIterator for Iter<'_> { + fn len(&self) -> usize { + self.length + } +} + +impl Clone for Iter<'_> { + fn clone(&self) -> Self { + Iter { + iter: self.iter.clone(), + length: self.length, + } + } } impl ToWriter for Signature { @@ -683,6 +795,8 @@ impl Default for Signature { impl PartialEq for Signature { fn eq(&self, other: &Signature) -> bool { + use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; + let metadata = self.class == other.class && self.email == other.email && self.hash_function == other.hash_function @@ -691,14 +805,25 @@ impl PartialEq for Signature { // TODO: find the right signature // as long as we have a matching - if let Sketch::MinHash(mh) = &self.signatures[0] { - if let Sketch::MinHash(other_mh) = &other.signatures[0] { - return metadata && (mh == other_mh); - } - } else { - unimplemented!() + match &self.signatures[0] { + Sketch::MinHash(mh) => match 
&other.signatures[0] { + Sketch::MinHash(other_mh) => metadata && (mh == other_mh), + Sketch::LargeMinHash(other_mh) => { + // TODO: avoid clone + metadata && (mh == &Into::::into(other_mh.clone())) + } + Sketch::HyperLogLog(_) => todo!(), + }, + Sketch::LargeMinHash(mh) => match &other.signatures[0] { + Sketch::LargeMinHash(other_mh) => metadata && (mh == other_mh), + Sketch::MinHash(other_mh) => { + // TODO: avoid clone + metadata && (mh == &Into::::into(other_mh.clone())) + } + Sketch::HyperLogLog(_) => todo!(), + }, + Sketch::HyperLogLog(_) => todo!(), } - metadata } } diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index 409d2a2c44..85436ff52f 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -18,7 +18,7 @@ use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; use crate::prelude::*; use crate::signature::SigsTrait; -use crate::sketch::KmerMinHash; +use crate::sketch::{KmerMinHash, KmerMinHashBTree}; use crate::Error; use crate::HashIntoType; @@ -26,6 +26,10 @@ pub mod estimators; use estimators::CounterType; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct HyperLogLog { registers: Vec, p: usize, @@ -183,6 +187,16 @@ impl SigsTrait for HyperLogLog { HashFunctions::murmur64_DNA } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + //TODO support other hash functions + if h != HashFunctions::murmur64_DNA { + return Err(Error::InvalidHashFunction { + function: h.to_string(), + }); + } + Ok(()) + } + fn add_hash(&mut self, hash: HashIntoType) { let value = hash >> self.p; let index = (hash - (value << self.p)) as usize; @@ -208,6 +222,15 @@ impl SigsTrait for HyperLogLog { } } +impl Update for KmerMinHashBTree { + fn update(&self, other: &mut HyperLogLog) -> Result<(), Error> { + for h in self.mins() { + other.add_hash(h); + } + Ok(()) + } +} + impl Update for KmerMinHash { fn update(&self, other: &mut HyperLogLog) -> Result<(), Error> { for h in self.mins() { diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 5c5f1114f8..e816b00a82 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -11,13 +11,13 @@ use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::_hash_murmur; use crate::encodings::HashFunctions; use crate::signature::SigsTrait; use crate::sketch::hyperloglog::HyperLogLog; use crate::Error; +use crate::{HashIntoType, _hash_murmur}; -pub fn max_hash_for_scaled(scaled: u64) -> u64 { +pub fn max_hash_for_scaled(scaled: u64) -> HashIntoType { match scaled { 0 => 0, 1 => u64::max_value(), @@ -25,14 +25,158 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 { } } -pub fn scaled_for_max_hash(max_hash: u64) -> u64 { +pub fn scaled_for_max_hash(max_hash: HashIntoType) -> u64 { match max_hash { 0 => 0, _ => (u64::max_value() as f64 / max_hash as f64) as u64, } } +pub trait MinHashOps: SigsTrait { + fn clear(&mut self); + fn is_empty(&self) -> bool; + fn reset_md5sum(&self); + fn md5sum(&self) -> String; + + fn mins(&self) -> Vec; + + fn add_word(&mut self, word: &[u8]) { + let hash = _hash_murmur(word, self.seed()); + self.add_hash(hash); + } + + fn remove_hash(&mut self, hash: HashIntoType); + + fn remove_many(&mut self, hashes: &[HashIntoType]) -> Result<(), Error> { + for min in hashes { 
+ self.remove_hash(*min); + } + Ok(()) + } + + fn add_many(&mut self, hashes: &[HashIntoType]) -> Result<(), Error> { + for min in hashes { + self.add_hash(*min); + } + Ok(()) + } + + /* TODO(lirber): these need to avoid KmerMinHash and be more generic + + // TODO: use iterator + fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn add_from(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn count_common(&self, other: &KmerMinHash, downsample: bool) -> Result; + fn intersection(&self, other: &KmerMinHash) -> Result<(Vec, u64), Error>; + + // FIXME: intersection_size and count_common should be the same? + // (for scaled minhashes) + fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error>; + + // calculate Jaccard similarity, ignoring abundance. + fn jaccard(&self, other: &KmerMinHash) -> Result; + + fn similarity( + &self, + other: &KmerMinHash, + ignore_abundance: bool, + downsample: bool, + ) -> Result; + */ + + fn as_hll(&self) -> HyperLogLog; +} + +pub trait AbundMinHashOps: MinHashOps { + fn track_abundance(&self) -> bool; + fn enable_abundance(&mut self) -> Result<(), Error>; + fn disable_abundance(&mut self); + fn add_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64); + fn set_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64); + fn add_many_with_abund(&mut self, hashes: &[(HashIntoType, u64)]) -> Result<(), Error> { + for item in hashes { + self.add_hash_with_abundance(item.0, item.1); + } + Ok(()) + } + + fn abunds(&self) -> Option>; + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)>; + + // compare two minhashes, with abundance; + // calculate their angular similarity. + fn angular_similarity(&self, other: &A) -> Result { + // TODO(lirber): bring back compat check once method sig changes + //self.check_compatible(other)?; + + if !self.track_abundance() || !other.track_abundance() { + return Err(Error::NeedsAbundanceTracking); + } + + // TODO: check which one is smaller, swap around if needed + // TODO(lirber): use iters here, instead of allocating new vecs! + let abunds = self.to_vec_abunds(); + let other_abunds = other.to_vec_abunds(); + + let mut prod = 0; + let mut other_iter = other_abunds.iter(); + let mut next_hash = other_iter.next(); + let a_sq: u64 = abunds.iter().map(|(_hash, abund)| (abund * abund)).sum(); + let b_sq: u64 = other_abunds + .iter() + .map(|(_hash, abund)| (abund * abund)) + .sum(); + + for (hash, abund) in abunds { + while let Some((k, other_abund)) = next_hash { + match k.cmp(&hash) { + Ordering::Less => next_hash = other_iter.next(), + Ordering::Equal => { + prod += abund * other_abund; + break; + } + Ordering::Greater => break, + } + } + } + + let norm_a = (a_sq as f64).sqrt(); + let norm_b = (b_sq as f64).sqrt(); + + if norm_a == 0. || norm_b == 0. { + return Ok(0.0); + } + let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); + let distance = 2. * prod.acos() / PI; + Ok(1. 
- distance) + } +} + +pub trait FracMinHashOps: MinHashOps { + fn max_hash(&self) -> HashIntoType; + fn scaled(&self) -> u64 { + scaled_for_max_hash(self.max_hash()) + } + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result + where + Self: Sized; + + // create a downsampled copy of self + fn downsample_scaled(&self, scaled: u64) -> Result + where + Self: Sized, + { + let max_hash = max_hash_for_scaled(scaled); + self.downsample_max_hash(max_hash) + } +} + #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHash { num: u32, ksize: u32, @@ -53,6 +197,8 @@ pub struct KmerMinHash { abunds: Option>, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } @@ -215,208 +361,16 @@ impl KmerMinHash { self.num } - pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein - } - - pub fn max_hash(&self) -> u64 { - self.max_hash - } - pub fn scaled(&self) -> u64 { - scaled_for_max_hash(self.max_hash) - } - - pub fn clear(&mut self) { - self.mins.clear(); - if let Some(ref mut abunds) = self.abunds { - abunds.clear(); - } - } - - pub fn is_empty(&self) -> bool { - self.mins.is_empty() - } - - pub fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { - if self.hash_function == h { - return Ok(()); - } - - if !self.is_empty() { - return Err(Error::NonEmptyMinHash { - message: "hash_function".into(), - }); - } - - self.hash_function = h; - Ok(()) - } - - pub fn track_abundance(&self) -> bool { - self.abunds.is_some() - } - - pub fn enable_abundance(&mut self) -> Result<(), Error> { - if !self.mins.is_empty() { - return Err(Error::NonEmptyMinHash { - message: "track_abundance=True".into(), - }); - } - - self.abunds = Some(vec![]); - - Ok(()) - } - - pub fn disable_abundance(&mut self) { - self.abunds = None; - } - - fn reset_md5sum(&self) { - let mut data = self.md5sum.lock().unwrap(); - if data.is_some() { - *data = None; - } - } - - pub fn md5sum(&self) -> String { - let mut data = self.md5sum.lock().unwrap(); - if data.is_none() { - let mut buffer = String::with_capacity(20); - - let mut md5_ctx = md5::Context::new(); - write!(&mut buffer, "{}", self.ksize()).unwrap(); - md5_ctx.consume(&buffer); - buffer.clear(); - for x in &self.mins { - write!(&mut buffer, "{}", x).unwrap(); - md5_ctx.consume(&buffer); - buffer.clear(); - } - *data = Some(format!("{:x}", md5_ctx.compute())); - } - data.clone().unwrap() - } - - pub fn add_hash(&mut self, hash: u64) { - self.add_hash_with_abundance(hash, 1); - } - - pub fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { - let current_max = match self.mins.last() { - Some(&x) => x, - None => u64::max_value(), - }; - - if hash > self.max_hash && self.max_hash != 0 { - // This is a scaled minhash, and we don't need to add the new hash - return; - } - - if self.num == 0 && self.max_hash == 0 { - // why did you create this minhash? it will always be empty... - return; - } - - if abundance == 0 { - self.remove_hash(hash); - return; - } - - // From this point on, hash is within scaled (or no scaled specified). - - // empty mins? add it. 
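Worked numbers for the max_hash/scaled pair that FracMinHashOps is built on: a scaled value of s keeps only hashes below roughly u64::MAX / s, i.e. about 1/s of the hash space, which is why downsample_scaled can only shrink a sketch. A sketch restating the two conversions from the top of this diff:

    fn max_hash_for_scaled(scaled: u64) -> u64 {
        match scaled {
            0 => 0,
            1 => u64::MAX,
            _ => (u64::MAX as f64 / scaled as f64) as u64,
        }
    }

    fn scaled_for_max_hash(max_hash: u64) -> u64 {
        match max_hash {
            0 => 0,
            _ => (u64::MAX as f64 / max_hash as f64) as u64,
        }
    }

    fn main() {
        let mh = max_hash_for_scaled(1000);
        // about one in a thousand 64-bit hash values falls below mh
        assert!(mh > u64::MAX / 1001 && mh < u64::MAX / 999);
        // the two conversions round-trip (up to float rounding)
        assert_eq!(scaled_for_max_hash(mh), 1000);
    }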
- if self.mins.is_empty() { - self.mins.push(hash); - if let Some(ref mut abunds) = self.abunds { - abunds.push(abundance); - self.reset_md5sum(); - } - return; - } - - if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num { - // "good" hash - within range, smaller than current entry, or - // still have space available - let pos = match self.mins.binary_search(&hash) { - Ok(p) => p, - Err(p) => p, - }; - - if pos == self.mins.len() { - // at end - must still be growing, we know the list won't - // get too long - self.mins.push(hash); - self.reset_md5sum(); - if let Some(ref mut abunds) = self.abunds { - abunds.push(abundance); - } - } else if self.mins[pos] != hash { - // didn't find hash in mins, so inserting somewhere - // in the middle; shrink list if needed. - self.mins.insert(pos, hash); - if let Some(ref mut abunds) = self.abunds { - abunds.insert(pos, abundance); - } - - // is it too big now? - if self.num != 0 && self.mins.len() > (self.num as usize) { - self.mins.pop(); - if let Some(ref mut abunds) = self.abunds { - abunds.pop(); - } - } - self.reset_md5sum(); - } else if let Some(ref mut abunds) = self.abunds { - // pos == hash: hash value already in mins, inc count by abundance - abunds[pos] += abundance; - } - } - } - - pub fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { - let mut found = false; - if let Ok(pos) = self.mins.binary_search(&hash) { - if self.mins[pos] == hash { - found = true; - if let Some(ref mut abunds) = self.abunds { - abunds[pos] = abundance; - } - } - } - - if !found { - self.add_hash_with_abundance(hash, abundance); - } - } - - pub fn add_word(&mut self, word: &[u8]) { - let hash = _hash_murmur(word, self.seed); - self.add_hash(hash); - } - - pub fn remove_hash(&mut self, hash: u64) { - if let Ok(pos) = self.mins.binary_search(&hash) { - if self.mins[pos] == hash { - self.mins.remove(pos); - self.reset_md5sum(); - if let Some(ref mut abunds) = self.abunds { - abunds.remove(pos); - } - } - }; + scaled_for_max_hash(self.max_hash) } - pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { - for min in &other.mins { - self.remove_hash(*min); - } - Ok(()) + pub fn iter_mins(&self) -> impl Iterator { + self.mins.iter() } - pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { + pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { + for min in &other.mins { self.remove_hash(*min); } Ok(()) @@ -530,20 +484,6 @@ impl KmerMinHash { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { - for item in hashes { - self.add_hash_with_abundance(item.0, item.1); - } - Ok(()) - } - pub fn count_common(&self, other: &KmerMinHash, downsample: bool) -> Result { if downsample && self.max_hash != other.max_hash { let (first, second) = if self.max_hash < other.max_hash { @@ -638,55 +578,6 @@ impl KmerMinHash { } } - // compare two minhashes, with abundance; - // calculate their angular similarity. 
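The angular_similarity moved into AbundMinHashOps above (and deleted from KmerMinHash below) is cosine similarity over the shared abundance vectors, mapped through 1 - 2*acos(cos)/pi so identical vectors score 1 and disjoint ones 0. A standalone restatement over (hash, abundance) pairs; the quadratic intersection here is for clarity, where the real code merges two sorted iterators:

    use std::f64::consts::PI;

    fn angular(a: &[(u64, u64)], b: &[(u64, u64)]) -> f64 {
        let dot: u64 = a
            .iter()
            .filter_map(|(h, x)| b.iter().find(|(k, _)| k == h).map(|(_, y)| x * y))
            .sum();
        let norm_a = (a.iter().map(|(_, x)| x * x).sum::<u64>() as f64).sqrt();
        let norm_b = (b.iter().map(|(_, y)| y * y).sum::<u64>() as f64).sqrt();
        if norm_a == 0. || norm_b == 0. {
            return 0.;
        }
        let cos = f64::min(dot as f64 / (norm_a * norm_b), 1.);
        1. - 2. * cos.acos() / PI
    }

    fn main() {
        let a: [(u64, u64); 2] = [(1, 2), (2, 3)];
        assert!((angular(&a, &a) - 1.0).abs() < 1e-9); // identical => 1
        assert!(angular(&a, &[(9, 5)]).abs() < 1e-9);  // disjoint  => 0
    }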
- pub fn angular_similarity(&self, other: &KmerMinHash) -> Result { - self.check_compatible(other)?; - - if self.abunds.is_none() || other.abunds.is_none() { - return Err(Error::NeedsAbundanceTracking); - } - - // TODO: check which one is smaller, swap around if needed - - let abunds = self.abunds.as_ref().unwrap(); - let other_abunds = other.abunds.as_ref().unwrap(); - - let mut prod = 0; - let mut other_iter = other.mins.iter().enumerate(); - let mut next_hash = other_iter.next(); - let a_sq: u64 = abunds.iter().map(|a| (a * a)).sum(); - let b_sq: u64 = other_abunds.iter().map(|a| (a * a)).sum(); - - for (i, hash) in self.mins.iter().enumerate() { - while let Some((j, k)) = next_hash { - match k.cmp(hash) { - Ordering::Less => next_hash = other_iter.next(), - Ordering::Equal => { - // Calling `get_unchecked` here is safe since - // both `i` and `j` are valid indices - // (`i` and `j` came from valid iterator calls) - unsafe { - prod += abunds.get_unchecked(i) * other_abunds.get_unchecked(j); - } - break; - } - Ordering::Greater => break, - } - } - } - - let norm_a = (a_sq as f64).sqrt(); - let norm_b = (b_sq as f64).sqrt(); - - if norm_a == 0. || norm_b == 0. { - return Ok(0.0); - } - let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); - let distance = 2. * prod.acos() / PI; - Ok(1. - distance) - } - pub fn similarity( &self, other: &KmerMinHash, @@ -700,36 +591,25 @@ impl KmerMinHash { (other, self) }; let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.check_compatible(&downsampled_mh)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { + self.check_compatible(other)?; self.jaccard(other) } else { + self.check_compatible(other)?; self.angular_similarity(other) } } +} - pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff - } - - pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp - } - - pub fn mins(&self) -> Vec { - self.mins.clone() - } - - pub fn iter_mins(&self) -> impl Iterator { - self.mins.iter() - } - - pub fn abunds(&self) -> Option> { - self.abunds.clone() +impl FracMinHashOps for KmerMinHash { + fn max_hash(&self) -> u64 { + self.max_hash } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { let scaled = scaled_for_max_hash(max_hash); let mut new_mh = KmerMinHash::new( @@ -747,37 +627,205 @@ impl KmerMinHash { } Ok(new_mh) } +} - pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { - if let Some(abunds) = &self.abunds { - self.mins - .iter() - .cloned() - .zip(abunds.iter().cloned()) - .collect() - } else { - self.mins - .iter() - .cloned() - .zip(std::iter::repeat(1)) - .collect() +impl MinHashOps for KmerMinHash { + fn clear(&mut self) { + self.mins.clear(); + if let Some(ref mut abunds) = self.abunds { + abunds.clear(); + } + } + + fn is_empty(&self) -> bool { + self.mins.is_empty() + } + + fn reset_md5sum(&self) { + let mut data = self.md5sum.lock().unwrap(); + if data.is_some() { + *data = None; + } + } + + fn md5sum(&self) -> String { + let mut data = self.md5sum.lock().unwrap(); + if data.is_none() { + let mut buffer = String::with_capacity(20); + + let mut md5_ctx = md5::Context::new(); + write!(&mut buffer, "{}", self.ksize()).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + for x in &self.mins { + write!(&mut buffer, "{}", x).unwrap(); + md5_ctx.consume(&buffer); + 
buffer.clear(); + } + *data = Some(format!("{:x}", md5_ctx.compute())); } + data.clone().unwrap() + } + + fn remove_hash(&mut self, hash: u64) { + if let Ok(pos) = self.mins.binary_search(&hash) { + if self.mins[pos] == hash { + self.mins.remove(pos); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(pos); + } + } + }; + } + + fn mins(&self) -> Vec { + self.mins.clone() } - pub fn as_hll(&self) -> HyperLogLog { + fn as_hll(&self) -> HyperLogLog { let mut hll = HyperLogLog::with_error_rate(0.01, self.ksize()).unwrap(); for h in &self.mins { hll.add_hash(*h) } - - hll - } - - // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) + + hll + } +} + +impl AbundMinHashOps for KmerMinHash { + fn track_abundance(&self) -> bool { + self.abunds.is_some() + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + if !self.mins.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "track_abundance=True".into(), + }); + } + + self.abunds = Some(vec![]); + + Ok(()) + } + + fn disable_abundance(&mut self) { + self.abunds = None; + } + + fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + let current_max = match self.mins.last() { + Some(&x) => x, + None => u64::max_value(), + }; + + if hash > self.max_hash && self.max_hash != 0 { + // This is a scaled minhash, and we don't need to add the new hash + return; + } + + if self.num == 0 && self.max_hash == 0 { + // why did you create this minhash? it will always be empty... + return; + } + + if abundance == 0 { + self.remove_hash(hash); + return; + } + + // From this point on, hash is within scaled (or no scaled specified). + + // empty mins? add it. + if self.mins.is_empty() { + self.mins.push(hash); + if let Some(ref mut abunds) = self.abunds { + abunds.push(abundance); + self.reset_md5sum(); + } + return; + } + + if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num { + // "good" hash - within range, smaller than current entry, or + // still have space available + let pos = match self.mins.binary_search(&hash) { + Ok(p) => p, + Err(p) => p, + }; + + if pos == self.mins.len() { + // at end - must still be growing, we know the list won't + // get too long + self.mins.push(hash); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.push(abundance); + } + } else if self.mins[pos] != hash { + // didn't find hash in mins, so inserting somewhere + // in the middle; shrink list if needed. + self.mins.insert(pos, hash); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(pos, abundance); + } + + // is it too big now? 
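Note that md5sum above digests decimal renderings rather than raw bytes: the ksize and then every min are each formatted as a decimal string and fed to the digest, and the result is cached in the Mutex<Option<String>> until reset_md5sum clears it after a mutation. A standalone restatement of the digest, using the same md5 crate calls as the code above:

    use std::fmt::Write;

    fn minhash_md5(ksize: u32, mins: &[u64]) -> String {
        let mut md5_ctx = md5::Context::new();
        let mut buffer = String::new();
        write!(&mut buffer, "{}", ksize).unwrap();
        md5_ctx.consume(&buffer);
        for x in mins {
            buffer.clear();
            write!(&mut buffer, "{}", x).unwrap();
            md5_ctx.consume(&buffer);
        }
        format!("{:x}", md5_ctx.compute())
    }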
+ if self.num != 0 && self.mins.len() > (self.num as usize) { + self.mins.pop(); + if let Some(ref mut abunds) = self.abunds { + abunds.pop(); + } + } + self.reset_md5sum(); + } else if let Some(ref mut abunds) = self.abunds { + // pos == hash: hash value already in mins, inc count by abundance + abunds[pos] += abundance; + } + } + } + + fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if abundance == 0 { + self.remove_hash(hash); + return; + } + + let mut found = false; + if let Ok(pos) = self.mins.binary_search(&hash) { + if self.mins[pos] == hash { + found = true; + if let Some(ref mut abunds) = self.abunds { + abunds[pos] = abundance; + } + } + } + + if !found { + self.add_hash_with_abundance(hash, abundance); + } + } + + fn abunds(&self) -> Option> { + self.abunds.clone() + } + + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)> { + if let Some(abunds) = &self.abunds { + self.mins + .iter() + .cloned() + .zip(abunds.iter().cloned()) + .collect() + } else { + self.mins + .iter() + .cloned() + .zip(std::iter::repeat(1)) + .collect() + } } } @@ -802,6 +850,21 @@ impl SigsTrait for KmerMinHash { self.hash_function } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + + if !self.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "hash_function".into(), + }); + } + + self.hash_function = h; + Ok(()) + } + fn add_hash(&mut self, hash: u64) { self.add_hash_with_abundance(hash, 1); } @@ -927,6 +990,10 @@ mod test { // A MinHash implementation for low scaled or large cardinalities #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHashBTree { num: u32, ksize: u32, @@ -950,6 +1017,8 @@ pub struct KmerMinHashBTree { current_max: u64, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } @@ -1114,8 +1183,8 @@ impl KmerMinHashBTree { self.num } - pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein + pub fn iter_mins(&self) -> impl Iterator { + self.mins.iter() } pub fn max_hash(&self) -> u64 { @@ -1279,6 +1348,13 @@ impl KmerMinHashBTree { Ok(()) } + pub fn remove_from(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { + for min in &other.mins { + self.remove_hash(*min); + } + Ok(()) + } + pub fn merge(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { self.check_compatible(other)?; let union = self.mins.union(&other.mins); @@ -1316,20 +1392,6 @@ impl KmerMinHashBTree { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { - for item in hashes { - self.add_hash_with_abundance(item.0, item.1); - } - Ok(()) - } - pub fn count_common(&self, other: &KmerMinHashBTree, downsample: bool) -> Result { if downsample && self.max_hash != other.max_hash { let (first, second) = if self.max_hash < other.max_hash { @@ -1350,7 +1412,6 @@ impl KmerMinHashBTree { Ok(iter.count() as u64) } } - pub fn intersection(&self, other: &KmerMinHashBTree) -> Result<(Vec, u64), Error> { self.check_compatible(other)?; @@ -1423,39 +1484,6 @@ impl KmerMinHashBTree { } } - // compare two minhashes, with abundance; - // calculate their angular similarity. 
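Behind add_hash_with_abundance in the Vec-backed KmerMinHash above, the num-MinHash invariant is: keep mins sorted, insert through binary search, and evict the largest entry once the sketch grows past num. A hypothetical standalone reduction (abundances and the scaled path omitted):

    fn add_hash(mins: &mut Vec<u64>, hash: u64, num: usize) {
        if let Err(pos) = mins.binary_search(&hash) {
            if pos < num || mins.len() < num {
                mins.insert(pos, hash);
                mins.truncate(num); // evict the largest if we grew past num
            }
        }
    }

    fn main() {
        let mut mins = Vec::new();
        for h in [50, 10, 40, 20, 30, 60] {
            add_hash(&mut mins, h, 3);
        }
        assert_eq!(mins, vec![10, 20, 30]); // only the 3 smallest survive
    }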
- pub fn angular_similarity(&self, other: &KmerMinHashBTree) -> Result { - self.check_compatible(other)?; - - if self.abunds.is_none() || other.abunds.is_none() { - return Err(Error::NeedsAbundanceTracking); - } - - let abunds = self.abunds.as_ref().unwrap(); - let other_abunds = other.abunds.as_ref().unwrap(); - - let mut prod = 0; - let a_sq: u64 = abunds.values().map(|a| (a * a)).sum(); - let b_sq: u64 = other_abunds.values().map(|a| (a * a)).sum(); - - for (hash, value) in abunds.iter() { - if let Some(oa) = other_abunds.get(hash) { - prod += value * oa - } - } - - let norm_a = (a_sq as f64).sqrt(); - let norm_b = (b_sq as f64).sqrt(); - - if norm_a == 0. || norm_b == 0. { - return Ok(0.0); - } - let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); - let distance = 2. * prod.acos() / PI; - Ok(1. - distance) - } - pub fn similarity( &self, other: &KmerMinHashBTree, @@ -1469,42 +1497,25 @@ impl KmerMinHashBTree { (other, self) }; let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.check_compatible(&downsampled_mh)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { + self.check_compatible(other)?; self.jaccard(other) } else { + self.check_compatible(other)?; self.angular_similarity(other) } } +} - pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff - } - - pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp - } - - pub fn hash_function(&self) -> HashFunctions { - self.hash_function - } - - pub fn mins(&self) -> Vec { - self.mins.iter().cloned().collect() - } - - pub fn iter_mins(&self) -> impl Iterator { - self.mins.iter() - } - - pub fn abunds(&self) -> Option> { - self.abunds - .as_ref() - .map(|abunds| abunds.values().cloned().collect()) +impl FracMinHashOps for KmerMinHashBTree { + fn max_hash(&self) -> u64 { + self.max_hash } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { let scaled = scaled_for_max_hash(max_hash); let mut new_mh = KmerMinHashBTree::new( @@ -1522,14 +1533,176 @@ impl KmerMinHashBTree { } Ok(new_mh) } +} - // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) +impl MinHashOps for KmerMinHashBTree { + fn clear(&mut self) { + self.mins.clear(); + if let Some(ref mut abunds) = self.abunds { + abunds.clear(); + } + self.current_max = 0; + } + + fn is_empty(&self) -> bool { + self.mins.is_empty() + } + + fn reset_md5sum(&self) { + let mut data = self.md5sum.lock().unwrap(); + if data.is_some() { + *data = None; + } + } + + fn md5sum(&self) -> String { + let mut data = self.md5sum.lock().unwrap(); + if data.is_none() { + let mut buffer = String::with_capacity(20); + + let mut md5_ctx = md5::Context::new(); + write!(&mut buffer, "{}", self.ksize()).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + for x in &self.mins { + write!(&mut buffer, "{}", x).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + } + *data = Some(format!("{:x}", md5_ctx.compute())); + } + data.clone().unwrap() + } + + fn remove_hash(&mut self, hash: u64) { + if self.mins.remove(&hash) { + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&hash); + } + } + if hash == self.current_max { + self.current_max = 
*self.mins.iter().next_back().unwrap_or(&0); + } + } + + fn mins(&self) -> Vec { + self.mins.iter().cloned().collect() + } + + fn as_hll(&self) -> HyperLogLog { + let mut hll = HyperLogLog::with_error_rate(0.01, self.ksize()).unwrap(); + + for h in &self.mins { + hll.add_hash(*h) + } + + hll + } +} + +impl AbundMinHashOps for KmerMinHashBTree { + fn track_abundance(&self) -> bool { + self.abunds.is_some() + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + if !self.mins.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "track_abundance=True".into(), + }); + } + + self.abunds = Some(Default::default()); + + Ok(()) + } + + fn disable_abundance(&mut self) { + self.abunds = None; + } + + fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if hash > self.max_hash && self.max_hash != 0 { + // This is a scaled minhash, and we don't need to add the new hash + return; + } + + if self.num == 0 && self.max_hash == 0 { + // why did you create this minhash? it will always be empty... + return; + } + + if abundance == 0 { + self.remove_hash(hash); + return; + } + + // From this point on, hash is within scaled (or no scaled specified). + + // empty mins? add it. + if self.mins.is_empty() { + self.mins.insert(hash); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(hash, abundance); + } + self.current_max = hash; + return; + } + + if hash <= self.max_hash || hash <= self.current_max || (self.mins.len() as u32) < self.num + { + // "good" hash - within range, smaller than current entry, or + // still have space available + if self.mins.insert(hash) { + self.reset_md5sum(); + if hash > self.current_max { + self.current_max = hash; + } + } + if let Some(ref mut abunds) = self.abunds { + *abunds.entry(hash).or_insert(0) += abundance; + } + + // is it too big now? 
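KmerMinHashBTree keeps a current_max beside the BTreeSet so add_hash_with_abundance can compare an incoming hash against the largest retained one without an extra tree lookup; when that largest element is removed, remove_hash above re-derives it from the back of the ordered set. In sketch form:

    use std::collections::BTreeSet;

    fn main() {
        let mut mins: BTreeSet<u64> = [10, 20, 30].into_iter().collect();
        let mut current_max = *mins.iter().next_back().unwrap_or(&0);
        assert_eq!(current_max, 30);

        // removing the maximum forces a re-derivation from the set's back
        mins.remove(&30);
        current_max = *mins.iter().next_back().unwrap_or(&0);
        assert_eq!(current_max, 20);
    }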
+ if self.num != 0 && self.mins.len() > (self.num as usize) { + let last = *self.mins.iter().next_back().unwrap(); + self.mins.remove(&last); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&last); + } + self.current_max = *self.mins.iter().next_back().unwrap(); + } + } + } + + fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if abundance == 0 { + self.remove_hash(hash); + return; + } + + if self.mins.contains(&hash) { + if let Some(ref mut abunds) = self.abunds { + abunds + .entry(hash) + .and_modify(|v| *v = abundance) + .or_insert_with(|| abundance); + } + } else { + self.add_hash_with_abundance(hash, abundance); + } + } + + fn abunds(&self) -> Option> { + self.abunds + .as_ref() + .map(|abunds| abunds.values().cloned().collect()) } - pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { + fn to_vec_abunds(&self) -> Vec<(u64, u64)> { if let Some(abunds) = &self.abunds { abunds.iter().map(|(a, b)| (*a, *b)).collect() } else { @@ -1563,6 +1736,21 @@ impl SigsTrait for KmerMinHashBTree { self.hash_function } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + + if !self.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "hash_function".into(), + }); + } + + self.hash_function = h; + Ok(()) + } + fn add_hash(&mut self, hash: u64) { self.add_hash_with_abundance(hash, 1); } diff --git a/src/core/src/sketch/mod.rs b/src/core/src/sketch/mod.rs index 09bd51085c..3ef04e43df 100644 --- a/src/core/src/sketch/mod.rs +++ b/src/core/src/sketch/mod.rs @@ -10,6 +10,10 @@ use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub enum Sketch { MinHash(KmerMinHash), LargeMinHash(KmerMinHashBTree), diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index cbca8915ba..bbfef5cd0d 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -7,7 +7,7 @@ use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt} use fixedbitset::FixedBitSet; use crate::prelude::*; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; use crate::Error; use crate::HashIntoType; @@ -58,6 +58,15 @@ impl Update for KmerMinHash { } } +impl Update for KmerMinHashBTree { + fn update(&self, other: &mut Nodegraph) -> Result<(), Error> { + for h in self.mins() { + other.count(h); + } + Ok(()) + } +} + impl Nodegraph { pub fn new(tablesizes: &[usize], ksize: usize) -> Nodegraph { let mut bs = Vec::with_capacity(tablesizes.len()); diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index f4f942d330..08a990d687 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -3,8 +3,7 @@ use std::ffi::OsStr; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; use std::path::{Path, PathBuf}; -use std::rc::Rc; -use std::sync::RwLock; +use std::sync::{Arc, RwLock}; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -25,11 +24,11 @@ pub trait Storage { } #[derive(Clone)] -pub struct InnerStorage(Rc>); +pub struct InnerStorage(Arc>); impl InnerStorage { - pub fn new(inner: impl Storage + 'static) -> InnerStorage { - InnerStorage(Rc::new(RwLock::new(inner))) + pub fn new(inner: impl Storage + Send + Sync + 'static) -> InnerStorage { + 
+        InnerStorage(Arc::new(RwLock::new(inner)))
     }
 }
diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs
index bcb3fdb4fa..50c03870e0 100644
--- a/src/core/tests/minhash.rs
+++ b/src/core/tests/minhash.rs
@@ -6,6 +6,7 @@ use proptest::collection::vec;
 use proptest::num::u64;
 use proptest::proptest;
 use sourmash::encodings::HashFunctions;
+use sourmash::prelude::*;
 use sourmash::signature::SeqToHashes;
 use sourmash::signature::{Signature, SigsTrait};
 use sourmash::sketch::minhash::{
diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs
index 5a60e02fcc..a27fa27b14 100644
--- a/src/core/tests/storage.rs
+++ b/src/core/tests/storage.rs
@@ -42,3 +42,41 @@ fn zipstorage_list_sbts() -> Result<(), Box<dyn std::error::Error>> {
 
     Ok(())
 }
+
+#[cfg(feature = "parallel")]
+#[test]
+fn zipstorage_parallel_access() -> Result<(), Box<dyn std::error::Error>> {
+    use std::io::BufReader;
+
+    use rayon::prelude::*;
+    use sourmash::signature::{Signature, SigsTrait};
+    use sourmash::sketch::minhash::KmerMinHash;
+
+    let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    filename.push("../../tests/test-data/v6.sbt.zip");
+
+    let zs = ZipStorage::from_file(filename.to_str().unwrap())?;
+
+    let total_hashes: usize = [
+        ".sbt.v3/f71e78178af9e45e6f1d87a0c53c465c",
+        ".sbt.v3/f0c834bc306651d2b9321fb21d3e8d8f",
+        ".sbt.v3/4e94e60265e04f0763142e20b52c0da1",
+        ".sbt.v3/6d6e87e1154e95b279e5e7db414bc37b",
+        ".sbt.v3/0107d767a345eff67ecdaed2ee5cd7ba",
+        ".sbt.v3/b59473c94ff2889eca5d7165936e64b3",
+        ".sbt.v3/60f7e23c24a8d94791cc7a8680c493f9",
+    ]
+    .par_iter()
+    .map(|path| {
+        let data = zs.load(path).unwrap();
+        let sigs: Vec<Signature> = serde_json::from_reader(&data[..]).expect("Loading error");
+        sigs.iter()
+            .map(|v| v.sketches().iter().map(|mh| mh.size()).sum::<usize>())
+            .sum::<usize>()
+    })
+    .sum();
+
+    assert_eq!(total_hashes, 3500);
+
+    Ok(())
+}
diff --git a/src/sourmash/hll.py b/src/sourmash/hll.py
index c98ded5e8b..8e593e0f51 100644
--- a/src/sourmash/hll.py
+++ b/src/sourmash/hll.py
@@ -43,8 +43,10 @@ def add(self, h):
     def update(self, other):
         if isinstance(other, HLL):
             return self._methodcall(lib.hll_merge, other._objptr)
-        elif isinstance(other, MinHash):
+        elif isinstance(other, FrozenMinHash):
            return self._methodcall(lib.hll_update_mh, other._objptr)
+        elif isinstance(other, MinHash):
+            return self._methodcall(lib.hll_update_mh, other.to_frozen()._objptr)
         else:
             # FIXME: we could take sets here too (or anything that can be
             # converted to a list of ints...)
diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py
index 08068255e5..4aba54b630 100644
--- a/src/sourmash/index/__init__.py
+++ b/src/sourmash/index/__init__.py
@@ -34,10 +34,15 @@
 CounterGather - an ancillary class returned by the 'counter_gather()' method.
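+RustLinearIndex - a read-only collection of signatures backed by the Rust core.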
""" +from __future__ import annotations + import os import sourmash from abc import abstractmethod, ABC -from collections import namedtuple, Counter +from collections import Counter +from collections import defaultdict +from typing import NamedTuple, Optional, TypedDict, TYPE_CHECKING +import weakref from sourmash.search import (make_jaccard_search_query, make_containment_query, @@ -45,12 +50,79 @@ from sourmash.manifest import CollectionManifest from sourmash.logging import debug_literal from sourmash.signature import load_signatures, save_signatures +from sourmash._lowlevel import ffi, lib +from sourmash.utils import RustObject, rustcall, decode_str, encode_str +from sourmash import SourmashSignature +from sourmash.picklist import SignaturePicklist from sourmash.minhash import (flatten_and_downsample_scaled, flatten_and_downsample_num, flatten_and_intersect_scaled) -# generic return tuple for Index.search and Index.gather -IndexSearchResult = namedtuple('Result', 'score, signature, location') + +if TYPE_CHECKING: + from typing_extensions import Unpack + + +class IndexSearchResult(NamedTuple): + """generic return tuple for Index.search and Index.gather""" + score: float + signature: SourmashSignature + location: str + + +class Selection(TypedDict): + ksize: Optional[int] + moltype: Optional[str] + num: Optional[int] + scaled: Optional[int] + containment: Optional[bool] + abund: Optional[bool] + picklist: Optional[SignaturePicklist] + + +# TypedDict can't have methods (it is a dict in runtime) +def _selection_as_rust(selection: Selection): + ptr = lib.selection_new() + + for key, v in selection.items(): + if v is not None: + if key == "ksize": + rustcall(lib.selection_set_ksize, ptr, v) + + elif key == "moltype": + hash_function = None + if v.lower() == "dna": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA + elif v.lower() == "protein": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + elif v.lower() == "dayhoff": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + elif v.lower() == "hp": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP + + rustcall(lib.selection_set_moltype, ptr, hash_function) + + elif key == "num": + rustcall(lib.selection_set_num, ptr, v) + + elif key == "scaled": + rustcall(lib.selection_set_scaled, ptr, v) + + elif key == "containment": + rustcall(lib.selection_set_containment, ptr, v) + + elif key == "abund": + rustcall(lib.selection_set_abund, ptr, bool(v)) + + elif key == "picklist": + picklist_ptr = v._as_rust() + rustcall(lib.selection_set_picklist, ptr, picklist_ptr) + + else: + raise KeyError(f"Unsupported key {key} for Selection in rust") + + return ptr + class Index(ABC): # this will be removed soon; see sourmash#1894. @@ -307,8 +379,7 @@ def counter_gather(self, query, threshold_bp, **kwargs): return counter @abstractmethod - def select(self, ksize=None, moltype=None, scaled=None, num=None, - abund=None, containment=None): + def select(self, **kwargs: Unpack[Selection]): """Return Index containing only signatures that match requirements. Current arguments can be any or all of: @@ -326,9 +397,16 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, """ -def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): +def select_signature(ss, **kwargs: Unpack[Selection]): "Check that the given signature matches the specified requirements." 
+    ksize = kwargs.get('ksize')
+    moltype = kwargs.get('moltype')
+    containment = kwargs.get('containment', False)
+    scaled = kwargs.get('scaled', 0)
+    num = kwargs.get('num', 0)
+    abund = kwargs.get('abund')
+    picklist = kwargs.get('picklist')
+
     # ksize match?
     if ksize and ksize != ss.minhash.ksize:
         return False
@@ -408,7 +486,7 @@ def load(cls, location, filename=None):
         lidx = LinearIndex(si, filename=filename)
         return lidx
 
-    def select(self, **kwargs):
+    def select(self, **kwargs: Unpack[Selection]):
         """Return new LinearIndex containing only signatures that match req's.
 
         Does not raise ValueError, but may return an empty Index.
@@ -479,7 +557,7 @@ def save(self, path):
     def load(cls, path):
         raise NotImplementedError
 
-    def select(self, **kwargs):
+    def select(self, **kwargs: Unpack[Selection]):
         """Return new object yielding only signatures that match req's.
 
         Does not raise ValueError, but may return an empty Index.
@@ -642,7 +720,7 @@ def signatures(self):
             if select(ss):
                 yield ss
 
-    def select(self, **kwargs):
+    def select(self, **kwargs: Unpack[Selection]):
         "Select signatures in zip file based on ksize/moltype/etc."
 
         # if we have a manifest, run 'select' on the manifest.
@@ -1053,7 +1131,7 @@ def load_from_pathlist(cls, filename):
     def save(self, *args):
         raise NotImplementedError
 
-    def select(self, **kwargs):
+    def select(self, **kwargs: Unpack[Selection]):
         "Run 'select' on the manifest."
         new_manifest = self.manifest.select_to_manifest(**kwargs)
         return MultiIndex(new_manifest, self.parent,
@@ -1162,8 +1240,135 @@ def save(self, *args):
     def insert(self, *args):
         raise NotImplementedError
 
-    def select(self, **kwargs):
+    def select(self, **kwargs: Unpack[Selection]):
         "Run 'select' on the manifest."
         new_manifest = self.manifest.select_to_manifest(**kwargs)
         return StandaloneManifestIndex(new_manifest, self._location,
                                        prefix=self.prefix)
+
+class RustLinearIndex(Index, RustObject):
+    """\
+    A read-only collection of signatures in a zip file.
+
+    Does not support `insert` or `save`.
+
+    Concrete class; signatures dynamically loaded from disk; uses manifests.
+    """
+    is_database = True
+
+    __dealloc_func__ = lib.linearindex_free
+
+    def __init__(self, storage, *, selection_dict=None,
+                 traverse_yield_all=False, manifest=None, use_manifest=True):
+
+        self._selection_dict = selection_dict
+        self._traverse_yield_all = traverse_yield_all
+        self._use_manifest = use_manifest
+
+        # Taking ownership of the storage
+        storage_ptr = storage._take_objptr()
+
+        manifest_ptr = ffi.NULL
+        # do we have a manifest already? if not, try loading.
+        if use_manifest:
+            if manifest is not None:
+                debug_literal('RustLinearIndex using passed-in manifest')
+                manifest_ptr = manifest._as_rust()._take_objptr()
+
+        selection_ptr = ffi.NULL
+
+        self._objptr = rustcall(lib.linearindex_new, storage_ptr,
+                                manifest_ptr, selection_ptr, use_manifest)
+
+        """
+        if self.manifest is not None:
+            assert not self.selection_dict, self.selection_dict
+        if self.selection_dict:
+            assert self.manifest is None
+        """
+
+    @property
+    def manifest(self):
+        return CollectionManifest._from_rust(self._methodcall(lib.linearindex_manifest))
+
+    @manifest.setter
+    def manifest(self, value):
+        if value is None:
+            return  # FIXME: can't unset manifest in a Rust Linear Index
+        self._methodcall(lib.linearindex_set_manifest, value._as_rust()._take_objptr())
+
+    def __bool__(self):
+        "Are there any matching signatures in this zipfile? Avoid calling len."
+        return self._methodcall(lib.linearindex_len) > 0
+
+    def __len__(self):
+        "calculate number of signatures."
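+        # counting is delegated to the Rust index over FFI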
+        return self._methodcall(lib.linearindex_len)
+
+    @property
+    def location(self):
+        return decode_str(self._methodcall(lib.linearindex_location))
+
+    @property
+    def storage(self):
+        from ..sbt_storage import ZipStorage
+
+        ptr = self._methodcall(lib.linearindex_storage)
+        return ZipStorage._from_objptr(ptr)
+
+    def insert(self, signature):
+        raise NotImplementedError
+
+    def save(self, path):
+        raise NotImplementedError
+
+    @classmethod
+    def load(cls, location, traverse_yield_all=False, use_manifest=True):
+        "Class method to load a zipfile."
+        from ..sbt_storage import ZipStorage
+
+        # we can only load from existing zipfiles in this method.
+        if not os.path.exists(location):
+            raise FileNotFoundError(location)
+
+        storage = ZipStorage(location)
+        return cls(storage, traverse_yield_all=traverse_yield_all,
+                   use_manifest=use_manifest)
+
+    def _signatures_with_internal(self):
+        """Return an iterator of tuples (ss, internal_location).
+
+        Note: does not limit signatures to subsets.
+        """
+        # list all the files, without using the Storage interface; currently,
+        # 'Storage' does not provide a way to list all the files, so :shrug:.
+        for filename in self.storage._filenames():
+            # should we load this file? if it ends in .sig OR we are forcing:
+            if filename.endswith('.sig') or \
+               filename.endswith('.sig.gz') or \
+               self._traverse_yield_all:
+                sig_data = self.storage.load(filename)
+                for ss in load_signatures(sig_data):
+                    yield ss, filename
+
+    def signatures(self):
+        "Load all signatures in the zip file."
+        attached_refs = weakref.WeakKeyDictionary()
+        iterator = self._methodcall(lib.linearindex_signatures)
+
+        next_sig = rustcall(lib.signatures_iter_next, iterator)
+        while next_sig != ffi.NULL:
+            attached_refs[next_sig] = iterator
+            yield SourmashSignature._from_objptr(next_sig)
+            next_sig = rustcall(lib.signatures_iter_next, iterator)
+
+    def select(self, **kwargs: Unpack[Selection]):
+        "Select signatures in zip file based on ksize/moltype/etc."
+
+        selection = _selection_as_rust(kwargs)
+
+        # select consumes the current index
+        ptr = self._take_objptr()
+        ptr = rustcall(lib.linearindex_select, ptr, selection)
+
+        return RustLinearIndex._from_objptr(ptr)
diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py
index bfd27eabb9..d2f78563cb 100644
--- a/src/sourmash/manifest.py
+++ b/src/sourmash/manifest.py
@@ -7,9 +7,13 @@
 import os.path
 from abc import abstractmethod
 import itertools
+from typing import TYPE_CHECKING
 
 from sourmash.picklist import SignaturePicklist
 
+if TYPE_CHECKING:
+    from typing_extensions import Unpack
+
 
 class BaseCollectionManifest:
     """
@@ -303,6 +307,7 @@ def _select(self, *, ksize=None, moltype=None, scaled=0, num=0,
         for row in matching_rows:
             yield row
 
+    #def select_to_manifest(self, **kwargs: Unpack[Selection]):
     def select_to_manifest(self, **kwargs):
         "Do a 'select' and return a new CollectionManifest object."
         new_rows = self._select(**kwargs)
@@ -343,3 +348,34 @@ def to_picklist(self):
         picklist.pickset = set(self._md5_set)
 
         return picklist
+
+    @staticmethod
+    def _from_rust(value):
+        from ._lowlevel import ffi, lib
+        from .utils import rustcall, decode_str
+
+        iterator = rustcall(lib.manifest_rows, value)
+
+        rows = []
+        next_row = rustcall(lib.manifest_rows_iter_next, iterator)
+        while next_row != ffi.NULL:
+
+            # TODO: extract row data from next_row
+            # FIXME: free mem from strings?
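+            # build a plain dict mirroring the manifest CSV columns; fields
+            # not yet exposed over FFI get placeholder values for now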
+            row = {}
+            row['md5'] = decode_str(next_row.md5)
+            row['md5short'] = row['md5'][:8]
+            row['ksize'] = next_row.ksize
+            row['moltype'] = decode_str(next_row.moltype)
+            row['num'] = 0  #ss.minhash.num
+            row['scaled'] = 0  #ss.minhash.scaled
+            row['n_hashes'] = 0  # len(ss.minhash)
+            row['with_abundance'] = next_row.with_abundance
+            row['name'] = decode_str(next_row.name)
+            row['filename'] = ""  #ss.filename
+            row['internal_location'] = decode_str(next_row.internal_location)
+            rows.append(row)
+
+            next_row = rustcall(lib.manifest_rows_iter_next, iterator)
+
+        return CollectionManifest(rows)
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 360ca6165b..1691c31828 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -644,7 +644,7 @@ def downsample(self, *, num=None, scaled=None):
 
             # acceptable num value? make sure to set max_hash to 0.
             max_hash = 0
-        
+
         elif scaled is not None:
             # cannot downsample a num MinHash with scaled
             if self.num:
@@ -904,7 +904,7 @@ def set_abundances(self, values, clear=True):
             abunds = []
 
             for h, v in values.items():
-                hashes.append(h) 
+                hashes.append(h)
                 if v < 0:
                     raise ValueError("Abundance cannot be set to a negative value.")
                 abunds.append(v)
@@ -937,9 +937,8 @@ def to_mutable(self):
 
     def to_frozen(self):
         "Return a frozen copy of this MinHash that cannot be changed."
-        new_mh = self.__copy__()
-        new_mh.into_frozen()
-        return new_mh
+        new_mh_ptr = self._methodcall(lib.kmerminhash_to_frozen)
+        return FrozenMinHash._from_objptr(new_mh_ptr)
 
     def into_frozen(self):
         "Freeze this MinHash, preventing any changes."
@@ -1069,11 +1068,8 @@ def merge(self, *args, **kwargs):
 
     def to_mutable(self):
         "Return a copy of this MinHash that can be changed."
-        mut = MinHash.__new__(MinHash)
-        state_tup = self.__getstate__()
-
-        mut.__setstate__(state_tup)
-        return mut
+        new_mh_ptr = self._methodcall(lib.kmerminhash_to_mutable)
+        return MinHash._from_objptr(new_mh_ptr)
 
     def to_frozen(self):
         "Return a frozen copy of this MinHash that cannot be changed."
diff --git a/src/sourmash/nodegraph.py b/src/sourmash/nodegraph.py
index 8faa2eb874..7986659156 100644
--- a/src/sourmash/nodegraph.py
+++ b/src/sourmash/nodegraph.py
@@ -5,7 +5,7 @@
 from tempfile import NamedTemporaryFile
 
 from ._lowlevel import ffi, lib
-from .minhash import to_bytes, MinHash
+from .minhash import to_bytes, MinHash, FrozenMinHash
 from .utils import RustObject, rustcall, decode_str
 from .exceptions import SourmashError
 
@@ -42,8 +42,10 @@ def to_bytes(self, compression=1):
     def update(self, other):
         if isinstance(other, Nodegraph):
             return self._methodcall(lib.nodegraph_update, other._objptr)
-        elif isinstance(other, MinHash):
+        elif isinstance(other, FrozenMinHash):
             return self._methodcall(lib.nodegraph_update_mh, other._objptr)
+        elif isinstance(other, MinHash):
+            return self._methodcall(lib.nodegraph_update_mh, other.to_frozen()._objptr)
         else:
             # FIXME: we could take sets here too (or anything that can be
             # converted to a list of ints...)
@@ -79,12 +81,15 @@ def expected_collisions(self):
         return self._methodcall(lib.nodegraph_expected_collisions)
 
     def matches(self, mh):
-        if not isinstance(mh, MinHash):
+        if isinstance(mh, FrozenMinHash):
+            objptr = mh._objptr
+        elif isinstance(mh, MinHash):
+            objptr = mh.to_frozen()._objptr
+        else:
             # FIXME: we could take sets here too (or anything that can be
             # converted to a list of ints...)
-            raise ValueError("mh must be a MinHash")
+            raise ValueError("mh must be a FrozenMinHash")
 
-        return self._methodcall(lib.nodegraph_matches, mh._objptr)
+        return self._methodcall(lib.nodegraph_matches, objptr)
 
     def to_khmer_nodegraph(self):
         import khmer
diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py
index 30d5c84f90..af15df0990 100644
--- a/src/sourmash/picklist.py
+++ b/src/sourmash/picklist.py
@@ -252,6 +252,24 @@ def filter(self, it):
             if self.__contains__(ss):
                 yield ss
 
+    def _as_rust(self):
+        from ._lowlevel import ffi, lib
+        from .utils import rustcall, decode_str
+
+        ptr = lib.picklist_new()
+
+        rustcall(lib.picklist_set_coltype, ptr, self.coltype.encode('utf-8'), len(self.coltype))
+        rustcall(lib.picklist_set_pickfile, ptr, self.pickfile.encode('utf-8'), len(self.pickfile))
+        rustcall(lib.picklist_set_column_name, ptr, self.column_name.encode('utf-8'), len(self.column_name))
+        rustcall(lib.picklist_set_pickstyle, ptr, self.pickstyle.value)
+
+        #self.preprocess_fn = preprocess[coltype]
+        #self.pickset = None
+        #self.found = set()
+        #self.n_queries = 0
+
+        return ptr
+
 
 def passes_all_picklists(ss, picklists):
     "does the signature 'ss' pass all of the picklists?"
diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py
index a22e782d69..42a4fceaa6 100644
--- a/src/sourmash/sbt_storage.py
+++ b/src/sourmash/sbt_storage.py
@@ -130,7 +130,7 @@ def subdir(self, value):
         self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value))
 
     def _filenames(self):
-        if self.__inner:
+        if not self._objptr:
             return self.__inner._filenames()
 
         size = ffi.new("uintptr_t *")
@@ -150,7 +150,7 @@ def save(self, path, content, *, overwrite=False, compress=False):
         raise NotImplementedError()
 
     def load(self, path):
-        if self.__inner:
+        if not self._objptr:
             return self.__inner.load(path)
 
         try:
diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py
index 1fd34d35e6..4077d655ed 100644
--- a/src/sourmash/signature.py
+++ b/src/sourmash/signature.py
@@ -43,9 +43,9 @@ def __init__(self, minhash, name="", filename=""):
 
     @property
     def minhash(self):
-        return FrozenMinHash._from_objptr(
+        return MinHash._from_objptr(
             self._methodcall(lib.signature_first_mh)
-        )
+        ).to_frozen()
 
     @minhash.setter
     def minhash(self, value):
@@ -66,18 +66,6 @@ def __repr__(self):
         else:  # name != md5pref:
             return "SourmashSignature('{}', {})".format(name, md5pref)
 
-    #def minhashes(self):
-    #    size = ffi.new("uintptr_t *")
-    #    mhs_ptr = self._methodcall(lib.signature_get_mhs, size)
-    #    size = ffi.unpack(size, 1)[0]
-    #
-    #    mhs = []
-    #    for i in range(size):
-    #        mh = MinHash._from_objptr(mhs_ptr[i])
-    #        mhs.append(mh)
-    #
-    #    return mhs
-
     def md5sum(self):
         "Calculate md5 hash of the bottom sketch, specifically."
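+        # the digest is computed by the Rust core for the signature's
+        # first (bottom) sketch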
         return decode_str(self.minhash._methodcall(lib.kmerminhash_md5sum))
diff --git a/src/sourmash/utils.py b/src/sourmash/utils.py
index 71afc20261..acb4b73d7a 100644
--- a/src/sourmash/utils.py
+++ b/src/sourmash/utils.py
@@ -29,6 +29,13 @@ def _get_objptr(self):
             raise RuntimeError("Object is closed")
         return self._objptr
 
+    def _take_objptr(self):
+        if not self._objptr:
+            raise RuntimeError("Object is closed")
+        ret = self._objptr
+        self._objptr = None
+        return ret
+
     def __del__(self):
         if self._objptr is None or self._shared:
             return
diff --git a/tests/test_index.py b/tests/test_index.py
index af0c1da890..1067422c5f 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1775,6 +1775,7 @@ def test_lazy_index_wraps_multi_index_location():
                                lazy2.signatures_with_location()):
         assert ss_tup == ss_lazy_tup
 
+@pytest.mark.skip("no support for in-memory sigs yet")
 def test_revindex_index_search():
     # confirm that RevIndex works
     sig2 = utils.get_test_data("2.fa.sig")
@@ -1820,6 +1821,7 @@ def test_revindex_index_search():
     assert sr[0][1] == ss63
 
 
+@pytest.mark.skip("no support for in-memory sigs yet")
 def test_revindex_gather():
     # check that RevIndex.best_containment works.
     sig2 = utils.get_test_data("2.fa.sig")
@@ -1846,6 +1848,7 @@ def test_revindex_gather():
     assert match.signature == ss47
 
 
+@pytest.mark.skip("no support for in-memory sigs yet")
 def test_revindex_gather_ignore():
     # check that RevIndex gather ignores things properly.
     sig2 = utils.get_test_data('2.fa.sig')
diff --git a/tox.ini b/tox.ini
index 41734a6a3b..ba4335a623 100644
--- a/tox.ini
+++ b/tox.ini
@@ -50,6 +50,11 @@ commands = pytest \
       --junitxml {toxworkdir}/junit.{envname}.xml \
       {posargs:doc tests}
 
+[testenv:.pkg]
+pass_env =
+    LIBCLANG_PATH
+    BINDGEN_EXTRA_CLANG_ARGS
+
 [testenv:pypy3]
 deps =
     pip >= 19.3.1
@@ -104,7 +109,7 @@ commands =
 description = invoke sphinx-build to build the HTML docs
 basepython = python3.10
 extras = doc
-whitelist_externals = pandoc
+allowlist_externals = pandoc
 pass_env = HOME
 change_dir = {toxinidir}
 #commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -W -bhtml {posargs}