diff --git a/.ci/install_cargo.sh b/.ci/install_cargo.sh deleted file mode 100755 index 8635836ef6..0000000000 --- a/.ci/install_cargo.sh +++ /dev/null @@ -1,10 +0,0 @@ -#! /bin/sh -curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable -rustup show -export PATH="$HOME/.cargo/bin:$PATH" -rustc -V -rustup target add aarch64-apple-darwin - -# update crates.io index without updating Cargo.lock -export CARGO_NET_GIT_FETCH_WITH_CLI=true -cargo update --dry-run diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index c7b13ec599..6edeb79b1c 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -27,7 +27,7 @@ jobs: - build: macos-x86_64 os: macos-latest arch: x86_64 - macos_target: 'MACOSX_DEPLOYMENT_TARGET=10.11 CARGO_BUILD_TARGET=x86_64-apple-darwin' + macos_target: 'MACOSX_DEPLOYMENT_TARGET=11.0 CARGO_BUILD_TARGET=x86_64-apple-darwin' - build: macos-arm64 os: macos-latest arch: arm64 diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index c993e0a520..fcf1aca25b 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -57,7 +57,7 @@ jobs: - name: install dependencies shell: bash -l {0} - run: mamba install tox-conda rust git compilers pandoc + run: mamba install tox-conda rust git compilers pandoc libstdcxx-ng - name: run tests for 3.9 shell: bash -l {0} diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fa16b58900..89c87c7551 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -63,17 +63,6 @@ jobs: toolchain: ${{ matrix.rust }} override: true - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - name: Install dependencies - continue-on-error: ${{ matrix.continue }} - run: | - python -m pip install --upgrade pip - python -m pip install -e . - - name: Run tests uses: actions-rs/cargo@v1 with: @@ -90,16 +79,6 @@ jobs: toolchain: stable override: true - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install -e . - - uses: actions-rs/install@v0.1 with: crate: cargo-all-features @@ -255,6 +234,12 @@ jobs: toolchain: stable override: true + - name: Check semver + uses: obi1kenobi/cargo-semver-checks-action@v2 + with: + crate-name: sourmash + version-tag-prefix: r + - name: Make sure we can publish the sourmash crate uses: actions-rs/cargo@v1 with: diff --git a/Cargo.lock b/Cargo.lock index 1b3b7a6569..f2f3b05920 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -70,6 +70,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bincode" version = "1.3.3" @@ -79,6 +85,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 1.0.104", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -91,18 +117,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "buf_redux" version = "0.8.4" @@ -129,12 +143,39 @@ version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +[[package]] +name = "bytecheck" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a31f923c2db9513e4298b72df143e6e655a759b3d6a0966df18f81223fff54f" +dependencies = [ + "bytecheck_derive", + "ptr_meta", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb17c862a905d912174daa27ae002326fff56dc8b8ada50a0a5f0976cb174f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "bytecount" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +[[package]] +name = "bytemuck" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" + [[package]] name = "byteorder" version = "1.4.3" @@ -179,6 +220,18 @@ name = "cc" version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] [[package]] name = "cfg-if" @@ -228,6 +281,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.3.0" @@ -389,13 +453,12 @@ dependencies = [ [[package]] name = "csv" -version = "1.1.6" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" dependencies = [ - "bstr", "csv-core", - "itoa 0.4.8", + "itoa", "ryu", "serde", ] @@ -411,9 +474,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" +checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" dependencies = [ "cc", "cxxbridge-flags", @@ -423,9 +486,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" +checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" dependencies = [ "cc", "codespan-reporting", @@ -438,15 +501,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" +checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" [[package]] name = "cxxbridge-macro" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" +checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" dependencies = [ "proc-macro2", "quote", @@ -547,12 +610,27 @@ dependencies = [ "syn 1.0.104", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hashbrown" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db0d4cf898abf0081f964436dc980e96670a0f36863e4b83aaacdb65c9d7ccc3" +dependencies = [ + "ahash", +] + [[package]] name = "heck" version = "0.4.1" @@ -574,6 +652,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "histogram" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" + [[package]] name = "iana-time-zone" version = "0.1.53" @@ -600,9 +684,18 @@ dependencies = [ [[package]] name = "indoc" -version = "1.0.7" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adab1eaa3408fb7f0c777a73e7465fd5656136fc93b670eb6df3c88c2c1344e3" +checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" + +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] [[package]] name = "io-lifetimes" @@ -638,15 +731,18 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.8" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] -name = "itoa" -version = "1.0.1" +name = "jobserver" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] [[package]] name = "js-sys" @@ -663,18 +759,61 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libm" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +[[package]] +name = "librocksdb-sys" +version = "0.10.0+7.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fe4d5874f5ff2bc616e55e8c6086d478fcda13faf9495768a4aa1c22042d30b" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "glob", + "libc", + "libz-sys", + "lz4-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "link-cplusplus" version = "1.0.8" @@ -698,9 +837,9 @@ checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" [[package]] name = "lock_api" -version = "0.4.8" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f80bf5aacaf25cbfc8210d1cfb718f2bf3b11c4c54e5afe36c236853a8ec390" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" dependencies = [ "autocfg", "scopeguard", @@ -712,6 +851,16 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "lzma-sys" version = "0.1.17" @@ -772,6 +921,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.4.4" @@ -828,9 +983,9 @@ dependencies = [ [[package]] name = "niffler" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68c7ffd42bdba05fc9fbfda31283d44c5c8a88fed1a191f68795dba23cc8204b" +checksum = "470dd05a938a5ad42c2cb80ceea4255e275990ee530b86ca164e6d8a19fa407f" dependencies = [ "cfg-if", "flate2", @@ -843,6 +998,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" +[[package]] +name = "nom" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-complex" version = "0.3.1" @@ -895,9 +1060,9 @@ dependencies = [ [[package]] name = "numpy" -version = "0.17.1" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6522ac2e780f532432a7c7f5dbadbfcea9ff1cf4dd858fb509ca13061a928413" +checksum = "a462c1af5ba1fddec1488c4646993a23ae7931f9e170ccba23e9c7c834277797" dependencies = [ "ahash", "libc", @@ -908,6 +1073,15 @@ dependencies = [ "pyo3", ] +[[package]] +name = "numsep" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" +dependencies = [ + "slicestring", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -956,17 +1130,23 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.3" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", "redox_syscall 0.2.10", "smallvec", - "windows-sys 0.36.1", + "windows-sys 0.45.0", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "piz" version = "0.4.0" @@ -1081,6 +1261,26 @@ dependencies = [ "unarray", ] +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "pyo3" version = "0.17.1" @@ -1244,18 +1444,79 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +[[package]] +name = "rend" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "retain_mut" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" + +[[package]] +name = "rkyv" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30f1d45d9aa61cbc8cd1eb87705470892289bb2d01943e7803b873a57404dc3" +dependencies = [ + "bytecheck", + "hashbrown", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff26ed6c7c4dfc2aa9480b86a60e3c7233543a270a680e10758a507c5a4ce476" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + +[[package]] +name = "roaring" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef0fb5e826a8bde011ecae6a8539dd333884335c57ff0f003fbe27c25bbe8f71" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + +[[package]] +name = "rocksdb" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "015439787fce1e75d55f279078d33ff14b4af5d93d995e8838ee4631301c8a99" +dependencies = [ + "libc", + "librocksdb-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.37.20" @@ -1322,6 +1583,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "serde" version = "1.0.168" @@ -1348,11 +1615,29 @@ version = "1.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" dependencies = [ - "itoa 1.0.1", + "itoa", "ryu", "serde", ] +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "size" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" + +[[package]] +name = "slicestring" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" + [[package]] name = "smallvec" version = "1.8.0" @@ -1367,7 +1652,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.11.0" +version = "0.12.0" dependencies = [ "assert_matches", "az", @@ -1377,10 +1662,12 @@ dependencies = [ "chrono", "counter", "criterion", + "csv", "finch", "fixedbitset", "getrandom", "getset", + "histogram", "log", "md5", "memmap2", @@ -1389,6 +1676,7 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", + "numsep", "once_cell", "ouroboros", "piz", @@ -1396,8 +1684,12 @@ dependencies = [ "proptest", "rand", "rayon", + "rkyv", + "roaring", + "rocksdb", "serde", "serde_json", + "size", "tempfile", "thiserror", "twox-hash", @@ -1556,9 +1848,9 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "unindent" @@ -1566,12 +1858,21 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58ee9362deb4a96cef4d437d1ad49cffc9b9e92d202b6995674e928ce684f112" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1736,15 +2037,11 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.36.1" +version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows_aarch64_msvc 0.36.1", - "windows_i686_gnu 0.36.1", - "windows_i686_msvc 0.36.1", - "windows_x86_64_gnu 0.36.1", - "windows_x86_64_msvc 0.36.1", + "windows-targets 0.42.2", ] [[package]] @@ -1753,7 +2050,22 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.0", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -1762,15 +2074,21 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.48.0", "windows_aarch64_msvc 0.48.0", "windows_i686_gnu 0.48.0", "windows_i686_msvc 0.48.0", "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.48.0", "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" @@ -1779,9 +2097,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = "0.36.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" [[package]] name = "windows_aarch64_msvc" @@ -1791,9 +2109,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.36.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" [[package]] name = "windows_i686_gnu" @@ -1803,9 +2121,9 @@ checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.36.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" [[package]] name = "windows_i686_msvc" @@ -1815,9 +2133,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.36.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" [[package]] name = "windows_x86_64_gnu" @@ -1825,6 +2143,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" @@ -1833,9 +2157,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.36.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" [[package]] name = "windows_x86_64_msvc" @@ -1851,3 +2175,14 @@ checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" dependencies = [ "lzma-sys", ] + +[[package]] +name = "zstd-sys" +version = "2.0.7+zstd.1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Makefile b/Makefile index 4c2ef69abb..990f79c068 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,8 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/hyperloglog.rs \ src/core/src/ffi/minhash.rs \ src/core/src/ffi/signature.rs \ + src/core/src/ffi/manifest.rs \ + src/core/src/ffi/picklist.rs \ src/core/src/ffi/nodegraph.rs \ src/core/src/ffi/index/mod.rs \ src/core/src/ffi/index/revindex.rs \ diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000000..19b40477fb --- /dev/null +++ b/deny.toml @@ -0,0 +1,46 @@ +all-features = true +feature-depth = 1 + +[advisories] +db-path = "~/.cargo/advisory-db" +db-urls = ["https://github.com/rustsec/advisory-db"] +vulnerability = "deny" +unmaintained = "warn" +yanked = "warn" +notice = "warn" +# A list of advisory IDs to ignore. Note that ignored advisories will still +# output a note when they are encountered. +ignore = [ + #"RUSTSEC-0000-0000", +] + +[licenses] +unlicensed = "deny" +allow = [ + "MIT", + "Apache-2.0", + "Apache-2.0 WITH LLVM-exception", + "BSD-3-Clause", + "BSD-2-Clause", + "ISC", + "Unicode-DFS-2016", +] +copyleft = "warn" +allow-osi-fsf-free = "neither" +default = "deny" +confidence-threshold = 0.8 +exceptions = [ + { allow = ["Zlib"], name = "piz", version = "*" }, +] + +[bans] +multiple-versions = "deny" +wildcards = "allow" +highlight = "all" +workspace-default-features = "allow" +external-default-features = "allow" + +[sources] +unknown-registry = "warn" +unknown-git = "warn" +allow-registry = ["https://github.com/rust-lang/crates.io-index"] diff --git a/doc/developer.md b/doc/developer.md index d3f83f7924..e28f656402 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -25,7 +25,7 @@ and the [`conda-forge`](https://conda-forge.org/) channel by default). Once `mamba` is installed, run ``` -mamba create -n sourmash_dev tox-conda rust git compilers pandoc +mamba create -n sourmash_dev tox-conda rust git compilers pandoc libstdcxx-ng ``` to create an environment called `sourmash_dev` containing the programs needed for development. diff --git a/flake.lock b/flake.lock index f9fc0a31af..4efa617b3a 100644 --- a/flake.lock +++ b/flake.lock @@ -1,32 +1,12 @@ { "nodes": { - "naersk": { - "inputs": { - "nixpkgs": [ - "nixpkgs" - ] - }, - "locked": { - "lastModified": 1688534083, - "narHash": "sha256-/bI5vsioXscQTsx+Hk9X5HfweeNZz/6kVKsbdqfwW7g=", - "owner": "nix-community", - "repo": "naersk", - "rev": "abca1fb7a6cfdd355231fc220c3d0302dbb4369a", - "type": "github" - }, - "original": { - "owner": "nix-community", - "repo": "naersk", - "type": "github" - } - }, "nixpkgs": { "locked": { - "lastModified": 1689449371, - "narHash": "sha256-sK3Oi8uEFrFPL83wKPV6w0+96NrmwqIpw9YFffMifVg=", + "lastModified": 1692372666, + "narHash": "sha256-JyoI70xpi2irk2JW5KL2w4DrkKmr6EiPztYw6+dqnho=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "29bcead8405cfe4c00085843eb372cc43837bb9d", + "rev": "da5b3c7f1029781ee3c0d971db3f259cf46d205d", "type": "github" }, "original": { @@ -38,7 +18,6 @@ }, "root": { "inputs": { - "naersk": "naersk", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay", "utils": "utils" @@ -54,11 +33,11 @@ ] }, "locked": { - "lastModified": 1689475081, - "narHash": "sha256-lAyG+KKKjOAG1YxYnji1g1pV39WxzQQBHI3ZwoRzweM=", + "lastModified": 1692410823, + "narHash": "sha256-YM1QCenpghNqgleUmoCJUArTuMEBqScyQuhepA6JZaI=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "6e28f20574595b01e14f2bbb57d62b84393fdcc1", + "rev": "598b2f04ed252eb5808b108d7a10084c0c548753", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 21bee98f82..f448eb4266 100644 --- a/flake.nix +++ b/flake.nix @@ -11,16 +11,9 @@ flake-utils.follows = "utils"; }; }; - - naersk = { - url = "github:nix-community/naersk"; - inputs = { - nixpkgs.follows = "nixpkgs"; - }; - }; }; - outputs = { self, nixpkgs, naersk, rust-overlay, utils }: + outputs = { self, nixpkgs, rust-overlay, utils }: utils.lib.eachDefaultSystem (system: let overlays = [ (import rust-overlay) ]; @@ -36,10 +29,6 @@ cargo = rustVersion; rustc = rustVersion; }; - naersk-lib = naersk.lib."${system}".override { - cargo = rustVersion; - rustc = rustVersion; - }; python = pkgs.python311Packages; @@ -49,10 +38,13 @@ { packages = { - lib = naersk-lib.buildPackage { + lib = rustPlatform.buildRustPackage { + name = "libsourmash"; pname = "libsourmash"; - root = ./.; + src = lib.cleanSource ./.; copyLibs = true; + cargoLock.lockFile = ./Cargo.lock; + nativeBuildInputs = with rustPlatform; [ bindgenHook ]; }; sourmash = python.buildPythonPackage rec { @@ -66,7 +58,7 @@ lockFile = ./Cargo.lock; }; - nativeBuildInputs = with rustPlatform; [ cargoSetupHook maturinBuildHook ]; + nativeBuildInputs = with rustPlatform; [ cargoSetupHook maturinBuildHook bindgenHook ]; buildInputs = lib.optionals stdenv.isDarwin [ libiconv ]; propagatedBuildInputs = with python; [ cffi deprecation cachetools bitstring numpy scipy matplotlib screed ]; @@ -93,9 +85,7 @@ defaultPackage = self.packages.${system}.sourmash; devShell = mkShell { - nativeBuildInputs = [ - clang_13 - ]; + nativeBuildInputs = [ rustPlatform.bindgenHook ]; buildInputs = [ rustVersion @@ -107,7 +97,6 @@ (python311.withPackages (ps: with ps; [ virtualenv tox cffi ])) (python310.withPackages (ps: with ps; [ virtualenv ])) (python39.withPackages (ps: with ps; [ virtualenv ])) - (python38.withPackages (ps: with ps; [ virtualenv ])) rust-cbindgen maturin @@ -123,13 +112,10 @@ cargo-outdated cargo-udeps nixpkgs-fmt - - llvmPackages_13.libclang - llvmPackages_13.libcxxClang + cargo-deny ]; - BINDGEN_EXTRA_CLANG_ARGS = "-isystem ${llvmPackages_13.libclang.lib}/lib/clang/${lib.getVersion clang}/include"; - LIBCLANG_PATH = "${llvmPackages_13.libclang.lib}/lib"; + # Needed for matplotlib LD_LIBRARY_PATH = "${stdenv.cc.cc.lib}/lib64:$LD_LIBRARY_PATH"; # workaround for https://github.com/NixOS/nixpkgs/blob/48dfc9fa97d762bce28cc8372a2dd3805d14c633/doc/languages-frameworks/python.section.md#python-setuppy-bdist_wheel-cannot-create-whl diff --git a/include/sourmash.h b/include/sourmash.h index 6fa7854880..4a7c9bd235 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -16,6 +16,12 @@ enum HashFunctions { }; typedef uint32_t HashFunctions; +enum PickStyle { + PICK_STYLE_INCLUDE = 1, + PICK_STYLE_EXCLUDE = 2, +}; +typedef uint32_t PickStyle; + enum SourmashErrorCode { SOURMASH_ERROR_CODE_NO_ERROR = 0, SOURMASH_ERROR_CODE_PANIC = 1, @@ -42,6 +48,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_PARSE_INT = 100003, SOURMASH_ERROR_CODE_SERDE_ERROR = 100004, SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, + SOURMASH_ERROR_CODE_CSV_ERROR = 100006, }; typedef uint32_t SourmashErrorCode; @@ -51,14 +58,26 @@ typedef struct SourmashHyperLogLog SourmashHyperLogLog; typedef struct SourmashKmerMinHash SourmashKmerMinHash; +typedef struct SourmashLinearIndex SourmashLinearIndex; + +typedef struct SourmashManifest SourmashManifest; + +typedef struct SourmashManifestRowIter SourmashManifestRowIter; + typedef struct SourmashNodegraph SourmashNodegraph; +typedef struct SourmashPicklist SourmashPicklist; + typedef struct SourmashRevIndex SourmashRevIndex; typedef struct SourmashSearchResult SourmashSearchResult; +typedef struct SourmashSelection SourmashSelection; + typedef struct SourmashSignature SourmashSignature; +typedef struct SourmashSignatureIter SourmashSignatureIter; + typedef struct SourmashZipStorage SourmashZipStorage; /** @@ -79,6 +98,15 @@ typedef struct { bool owned; } SourmashStr; +typedef struct { + uint32_t ksize; + uint8_t with_abundance; + SourmashStr md5; + SourmashStr internal_location; + SourmashStr name; + SourmashStr moltype; +} SourmashManifestRow; + bool computeparams_dayhoff(const SourmashComputeParameters *ptr); bool computeparams_dna(const SourmashComputeParameters *ptr); @@ -263,8 +291,38 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr, void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); +SourmashKmerMinHash *kmerminhash_to_frozen(const SourmashKmerMinHash *ptr); + +SourmashKmerMinHash *kmerminhash_to_mutable(const SourmashKmerMinHash *ptr); + bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); +void linearindex_free(SourmashLinearIndex *ptr); + +uint64_t linearindex_len(const SourmashLinearIndex *ptr); + +SourmashStr linearindex_location(const SourmashLinearIndex *ptr); + +const SourmashManifest *linearindex_manifest(const SourmashLinearIndex *ptr); + +SourmashLinearIndex *linearindex_new(SourmashZipStorage *storage_ptr, + SourmashManifest *manifest_ptr, + SourmashSelection *selection_ptr, + bool use_manifest); + +SourmashLinearIndex *linearindex_select(SourmashLinearIndex *ptr, + const SourmashSelection *selection_ptr); + +void linearindex_set_manifest(SourmashLinearIndex *ptr, SourmashManifest *manifest_ptr); + +SourmashSignatureIter *linearindex_signatures(const SourmashLinearIndex *ptr); + +const SourmashZipStorage *linearindex_storage(const SourmashLinearIndex *ptr); + +SourmashManifestRowIter *manifest_rows(const SourmashManifest *ptr); + +const SourmashManifestRow *manifest_rows_iter_next(SourmashManifestRowIter *ptr); + void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); @@ -309,6 +367,18 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); +void picklist_free(SourmashPicklist *ptr); + +SourmashPicklist *picklist_new(void); + +void picklist_set_coltype(SourmashPicklist *ptr, const char *coltype_ptr, uintptr_t insize); + +void picklist_set_column_name(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickfile(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickstyle(SourmashPicklist *ptr, PickStyle pickstyle); + void revindex_free(SourmashRevIndex *ptr); const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr, @@ -354,6 +424,36 @@ double searchresult_score(const SourmashSearchResult *ptr); SourmashSignature *searchresult_signature(const SourmashSearchResult *ptr); +bool selection_abund(const SourmashSelection *ptr); + +bool selection_containment(const SourmashSelection *ptr); + +uint32_t selection_ksize(const SourmashSelection *ptr); + +HashFunctions selection_moltype(const SourmashSelection *ptr); + +SourmashSelection *selection_new(void); + +uint32_t selection_num(const SourmashSelection *ptr); + +const SourmashPicklist *selection_picklist(const SourmashSelection *ptr); + +uint32_t selection_scaled(const SourmashSelection *ptr); + +void selection_set_abund(SourmashSelection *ptr, bool new_abund); + +void selection_set_containment(SourmashSelection *ptr, bool new_containment); + +void selection_set_ksize(SourmashSelection *ptr, uint32_t new_ksize); + +void selection_set_moltype(SourmashSelection *ptr, HashFunctions new_moltype); + +void selection_set_num(SourmashSelection *ptr, uint32_t new_num); + +void selection_set_picklist(SourmashSelection *ptr, SourmashPicklist *new_picklist); + +void selection_set_scaled(SourmashSelection *ptr, uint32_t new_scaled); + void signature_add_protein(SourmashSignature *ptr, const char *sequence); void signature_add_sequence(SourmashSignature *ptr, const char *sequence, bool force); @@ -370,16 +470,12 @@ SourmashStr signature_get_filename(const SourmashSignature *ptr); SourmashStr signature_get_license(const SourmashSignature *ptr); -SourmashKmerMinHash **signature_get_mhs(const SourmashSignature *ptr, uintptr_t *size); - SourmashStr signature_get_name(const SourmashSignature *ptr); uintptr_t signature_len(const SourmashSignature *ptr); SourmashSignature *signature_new(void); -void signature_push_mh(SourmashSignature *ptr, const SourmashKmerMinHash *other); - SourmashStr signature_save_json(const SourmashSignature *ptr); void signature_set_filename(SourmashSignature *ptr, const char *name); @@ -388,6 +484,8 @@ void signature_set_mh(SourmashSignature *ptr, const SourmashKmerMinHash *other); void signature_set_name(SourmashSignature *ptr, const char *name); +const SourmashSignature *signatures_iter_next(SourmashSignatureIter *ptr); + SourmashSignature **signatures_load_buffer(const char *ptr, uintptr_t insize, bool _ignore_md5sum, diff --git a/pyproject.toml b/pyproject.toml index 48d0152679..0e5b4405b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -148,6 +148,12 @@ features = ["maturin"] locked = true module-name = "sourmash._lowlevel" +# macOS deployment target SDK version +[tool.maturin.target.x86_64-apple-darwin] +macos-deployment-target = "11.0" +[tool.maturin.target.aarch64-apple-darwin] +macos-deployment-target = "11.0" + [tool.isort] known_third_party = ["deprecation", "hypothesis", "mmh3", "numpy", "pytest", "screed", "sourmash_tst_utils"] multi_line_output = 3 @@ -158,11 +164,38 @@ known_first_party = ["sourmash"] [tool.cibuildwheel] build = "cp39-*" -skip = "*-win32 *-manylinux_i686 *-musllinux_ppc64le *-musllinux_s390x" -before-build = "source .ci/install_cargo.sh" -environment = { PATH="$HOME/.cargo/bin:$PATH" } +skip = "*-win32 *-manylinux_i686 *-musllinux_*" +before-all = [ + "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable", + "cargo update --dry-run", +] +macos.before-build = [ + "rustup target add aarch64-apple-darwin", +] build-verbosity = 3 +[tool.cibuildwheel.environment] +CARGO_REGISTRIES_CRATES_IO_PROTOCOL="sparse" +PATH="$HOME/.cargo/bin:$PATH" + +[tool.cibuildwheel.linux] +before-all = [ + "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable", + "cargo update --dry-run", + "if [ -f /etc/system-release ]; then yum -y install centos-release-scl; fi", + "if [ -f /etc/system-release ]; then yum -y install llvm-toolset-7.0; fi", +] +before-build = [ + "if [ -f /etc/system-release ]; then source scl_source enable llvm-toolset-7.0; fi", + "if [ -f /etc/system-release ]; then source scl_source enable devtoolset-10; fi", +] +[tool.cibuildwheel.linux.environment] +CARGO_REGISTRIES_CRATES_IO_PROTOCOL="sparse" +PATH="$HOME/.cargo/bin:$PATH" +LIBCLANG_PATH="/opt/rh/llvm-toolset-7.0/root/usr/lib64" +LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/rh/llvm-toolset-7.0/root/usr/lib64" +C_INCLUDE_PATH="/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/include:/opt/rh/devtoolset-10/root/usr/lib/gcc/x86_64-redhat-linux/10/include" + [tool.pytest.ini_options] addopts = "--doctest-glob='doc/*.md' -n4" norecursedirs = [ diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index a5c68435ab..87c7aefaca 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.11.0" +version = "0.12.0" authors = ["Luiz Irber "] description = "MinHash sketches for genomic data" repository = "https://github.com/sourmash-bio/sourmash" @@ -22,6 +22,8 @@ bench = false from-finch = ["finch"] parallel = ["rayon"] maturin = [] +mastiff = ["rocksdb", "rkyv"] +default = ["parallel", "mastiff"] [dependencies] az = "1.0.0" @@ -29,6 +31,7 @@ bytecount = "0.6.0" byteorder = "1.4.3" cfg-if = "1.0" counter = "0.5.7" +csv = "1.1.6" finch = { version = "0.5.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } @@ -39,7 +42,7 @@ murmurhash3 = "0.0.5" niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.43" -once_cell = "1.18.0" # once_cell 1.14+ requires Rust 1.56+ +once_cell = "1.18.0" rayon = { version = "1.7.0", optional = true } serde = { version = "1.0.168", features = ["derive"] } serde_json = "1.0.104" @@ -47,10 +50,16 @@ primal-check = "0.3.1" thiserror = "1.0" typed-builder = "0.14.0" twox-hash = "1.6.0" -vec-collections = "0.3.4" +vec-collections = "0.4.3" piz = "0.4.0" # piz 0.5.1 requires Rust 1.63+ memmap2 = "0.7.1" ouroboros = "0.17.2" +rkyv = { version = "0.7.39", optional = true } +rocksdb = { version = "0.20.0", optional = true } +roaring = "0.10.0" +histogram = "0.6.9" +numsep = "0.1.12" +size = "0.4.0" [dev-dependencies] assert_matches = "1.3.0" @@ -60,10 +69,6 @@ proptest = { version = "1.2.0", default-features = false, features = ["std"]} rand = "0.8.2" tempfile = "3.7.1" -[[bench]] -name = "index" -harness = false - [[bench]] name = "compute" harness = false diff --git a/src/core/benches/index.rs b/src/core/benches/index.rs deleted file mode 100644 index d3d4b54118..0000000000 --- a/src/core/benches/index.rs +++ /dev/null @@ -1,83 +0,0 @@ -#[macro_use] -extern crate criterion; - -use std::path::PathBuf; - -use criterion::{Bencher, Criterion, Fun}; -use sourmash::index::bigsi::BIGSI; -use sourmash::index::linear::LinearIndex; -use sourmash::index::Index; -use sourmash::index::MHBT; -use sourmash::signature::Signature; - -fn find_small_bench(c: &mut Criterion) { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_small", functions, leaf); -} - -fn find_subset_bench(c: &mut Criterion) { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/subset.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let leaf: Signature = (*sbt.signatures().first().unwrap()).clone(); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in sbt.signatures() { - linear.insert(l).unwrap(); - } - - let mut bigsi = BIGSI::new(10000, 10); - for l in sbt.signatures() { - bigsi.insert(l).unwrap(); - } - - let sbt_find = Fun::new("sbt_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| sbt.search(leaf, 0.1, false)) - }); - - let linear_find = Fun::new("linear_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| linear.search(leaf, 0.1, false)) - }); - - let bigsi_find = Fun::new("bigsi_search", move |b: &mut Bencher, leaf: &Signature| { - b.iter(|| bigsi.search(leaf, 0.1, false)) - }); - - let functions = vec![sbt_find, linear_find, bigsi_find]; - c.bench_functions("search_subset", functions, leaf); -} - -criterion_group!(benches, find_small_bench, find_subset_bench); -criterion_main!(benches); diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 6010cf2f6d..443db90b50 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -7,6 +7,7 @@ use std::str; use nohash_hasher::BuildNoHashHasher; use once_cell::sync::Lazy; +use vec_collections::AbstractVecSet; use crate::Error; @@ -23,6 +24,10 @@ type ColorToIdx = HashMap>; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] #[repr(u32)] pub enum HashFunctions { murmur64_DNA = 1, diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index cd4ddcfaf1..f6c3ce311e 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -63,6 +63,9 @@ pub enum SourmashError { #[error(transparent)] IOError(#[from] std::io::Error), + #[error(transparent)] + CsvError(#[from] csv::Error), + #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] #[error(transparent)] Panic(#[from] crate::ffi::utils::Panic), @@ -108,6 +111,7 @@ pub enum SourmashErrorCode { ParseInt = 100_003, SerdeError = 100_004, NifflerError = 100_005, + CsvError = 100_006, } #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] @@ -137,6 +141,7 @@ impl SourmashErrorCode { SourmashError::IOError { .. } => SourmashErrorCode::Io, SourmashError::NifflerError { .. } => SourmashErrorCode::NifflerError, SourmashError::Utf8Error { .. } => SourmashErrorCode::Utf8Error, + SourmashError::CsvError { .. } => SourmashErrorCode::CsvError, } } } diff --git a/src/core/src/ffi/hyperloglog.rs b/src/core/src/ffi/hyperloglog.rs index d9e828ab48..a5412fb6c8 100644 --- a/src/core/src/ffi/hyperloglog.rs +++ b/src/core/src/ffi/hyperloglog.rs @@ -6,7 +6,7 @@ use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::hyperloglog::HyperLogLog; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::ForeignObject; pub struct SourmashHyperLogLog; @@ -108,14 +108,14 @@ unsafe fn hll_merge( } ffi_fn! { -unsafe fn hll_update_mh( - ptr: *mut SourmashHyperLogLog, - optr: *const SourmashKmerMinHash, -) { +unsafe fn hll_update_mh(ptr: *mut SourmashHyperLogLog, optr: *const SourmashKmerMinHash) { let hll = SourmashHyperLogLog::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(optr); - mh.update(hll)? + match mh { + MinHash::Mutable(mh) => mh.update(hll)?, + MinHash::Frozen(mh) => mh.update(hll)?, + } } } diff --git a/src/core/src/ffi/index/mod.rs b/src/core/src/ffi/index/mod.rs index 932a97b222..516a3eafc7 100644 --- a/src/core/src/ffi/index/mod.rs +++ b/src/core/src/ffi/index/mod.rs @@ -1,7 +1,11 @@ pub mod revindex; +use crate::encodings::HashFunctions; +use crate::index::{Selection, SigStore}; + use crate::signature::Signature; +use crate::ffi::picklist::SourmashPicklist; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; @@ -35,3 +39,164 @@ pub unsafe extern "C" fn searchresult_signature( let result = SourmashSearchResult::as_rust(ptr); SourmashSignature::from_rust((result.1).clone()) } + +//================================================================ + +pub struct SourmashSelection; + +impl ForeignObject for SourmashSelection { + type RustObject = Selection; +} + +#[no_mangle] +pub unsafe extern "C" fn selection_new() -> *mut SourmashSelection { + SourmashSelection::from_rust(Selection::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn selection_ksize(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(ksize) = sel.ksize() { + ksize + } else { + todo!("empty ksize case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_ksize(ptr: *mut SourmashSelection, new_ksize: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_ksize(new_ksize); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_num(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(num) = sel.num() { + num + } else { + todo!("empty num case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_num(ptr: *mut SourmashSelection, new_num: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_num(new_num); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_scaled(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(scaled) = sel.scaled() { + scaled + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_scaled(ptr: *mut SourmashSelection, new_scaled: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_scaled(new_scaled); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_containment(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(containment) = sel.containment() { + containment + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_containment( + ptr: *mut SourmashSelection, + new_containment: bool, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_containment(new_containment); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_abund(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(abund) = sel.abund() { + abund + } else { + todo!("empty abund case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_abund(ptr: *mut SourmashSelection, new_abund: bool) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_abund(new_abund); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_moltype(ptr: *const SourmashSelection) -> HashFunctions { + let sel = SourmashSelection::as_rust(ptr); + if let Some(hash_function) = sel.moltype() { + hash_function + } else { + todo!("empty hash_function case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_moltype( + ptr: *mut SourmashSelection, + new_moltype: HashFunctions, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_moltype(new_moltype); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_picklist( + ptr: *const SourmashSelection, +) -> *const SourmashPicklist { + let sel = SourmashSelection::as_rust(ptr); + if let Some(picklist) = sel.picklist() { + SourmashPicklist::from_rust(picklist) + } else { + todo!("empty picklist case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_picklist( + ptr: *mut SourmashSelection, + new_picklist: *mut SourmashPicklist, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + let pick = SourmashPicklist::into_rust(new_picklist); + sel.set_picklist(*pick); +} + +//================================================================ +// +pub struct SignatureIterator { + iter: Box>, +} + +pub struct SourmashSignatureIter; + +impl ForeignObject for SourmashSignatureIter { + type RustObject = SignatureIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn signatures_iter_next( + ptr: *mut SourmashSignatureIter, +) -> *const SourmashSignature { + let iterator = SourmashSignatureIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(sig) => SourmashSignature::from_rust(sig.into()), + None => std::ptr::null(), + } +} diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index 3597121bce..abf0bc6bad 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -1,15 +1,22 @@ use std::path::PathBuf; use std::slice; +use std::sync::Arc; -use crate::index::revindex::RevIndex; +use crate::index::revindex::mem_revindex::{LinearRevIndex, RevIndex}; use crate::index::Index; +use crate::manifest::Manifest; use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use crate::sketch::Sketch; +use crate::storage::Storage; -use crate::ffi::index::SourmashSearchResult; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::index::{ + SignatureIterator, SourmashSearchResult, SourmashSelection, SourmashSignatureIter, +}; +use crate::ffi::manifest::SourmashManifest; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::signature::SourmashSignature; +use crate::ffi::storage::SourmashZipStorage; use crate::ffi::utils::{ForeignObject, SourmashStr}; pub struct SourmashRevIndex; @@ -42,8 +49,7 @@ unsafe fn revindex_new_with_paths( let template = { assert!(!template_ptr.is_null()); - //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + SourmashKmerMinHash::as_rust(template_ptr).clone().into() }; let queries_vec: Vec; @@ -52,9 +58,11 @@ unsafe fn revindex_new_with_paths( } else { queries_vec = slice::from_raw_parts(queries_ptr, inqueries) .iter() - .map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()) + .map(|mh_ptr| match SourmashKmerMinHash::as_rust(*mh_ptr) { + // TODO: avoid clone + MinHash::Mutable(mh) => mh.clone().into(), + MinHash::Frozen(mh) => mh.clone(), + }) .collect(); Some(queries_vec.as_ref()) }; @@ -90,7 +98,7 @@ unsafe fn revindex_new_with_sigs( let template = { assert!(!template_ptr.is_null()); //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + SourmashKmerMinHash::as_rust(template_ptr).clone().into() }; let queries_vec: Vec; @@ -99,9 +107,13 @@ unsafe fn revindex_new_with_sigs( } else { queries_vec = slice::from_raw_parts(queries_ptr, inqueries) .iter() - .map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()) + .map(|mh_ptr| { + // TODO: avoid this clone + match SourmashKmerMinHash::as_rust(*mh_ptr) { + MinHash::Mutable(mh) => mh.clone().into(), + MinHash::Frozen(mh) => mh.clone(), + } + }) .collect(); Some(queries_vec.as_ref()) }; @@ -248,3 +260,141 @@ unsafe fn revindex_signatures( Ok(Box::into_raw(b) as *mut *mut SourmashSignature) } } + +//-------------------------------------------------- + +pub struct SourmashLinearIndex; + +impl ForeignObject for SourmashLinearIndex { + type RustObject = LinearRevIndex; +} + +ffi_fn! { +unsafe fn linearindex_new( + storage_ptr: *mut SourmashZipStorage, + manifest_ptr: *mut SourmashManifest, + selection_ptr: *mut SourmashSelection, + use_manifest: bool, +) -> Result<*mut SourmashLinearIndex> { + let storage = Arc::try_unwrap(*SourmashZipStorage::into_rust(storage_ptr)).ok().unwrap(); + + let manifest = if manifest_ptr.is_null() { + if use_manifest { + // Load manifest from zipstorage + Some(Manifest::from_reader(storage.load("SOURMASH-MANIFEST.csv")?.as_slice())?) + } else { + None + } + } else { + Some(*SourmashManifest::into_rust(manifest_ptr)) + }; + + let _selection = if !selection_ptr.is_null() { + Some(SourmashSelection::into_rust(selection_ptr)) + } else { + None + }; + // TODO: how to extract a template? Probably from selection? + let max_hash = max_hash_for_scaled(100); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(57) + .hash_function(crate::encodings::HashFunctions::murmur64_protein) + .max_hash(max_hash) + .build(), + ); + + /* + def __init__(self, storage, *, selection_dict=None, + traverse_yield_all=False, manifest=None, use_manifest=True): + sig_files: Manifest, + template: &Sketch, + keep_sigs: bool, + ref_sigs: Option>, + storage: Option, + */ + + let linear_index = LinearRevIndex::new(manifest, &template, false, None, Some(storage)); + + Ok(SourmashLinearIndex::from_rust(linear_index)) +} +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_free(ptr: *mut SourmashLinearIndex) { + SourmashLinearIndex::drop(ptr); +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_manifest( + ptr: *const SourmashLinearIndex, +) -> *const SourmashManifest { + let index = SourmashLinearIndex::as_rust(ptr); + SourmashManifest::from_rust(index.manifest()) +} + +ffi_fn! { +unsafe fn linearindex_set_manifest( + ptr: *mut SourmashLinearIndex, + manifest_ptr: *mut SourmashManifest, +) -> Result<()> { + let index = SourmashLinearIndex::as_rust_mut(ptr); + let manifest = SourmashManifest::into_rust(manifest_ptr); + + index.set_manifest(*manifest)?; + Ok(()) +} +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_len(ptr: *const SourmashLinearIndex) -> u64 { + let index = SourmashLinearIndex::as_rust(ptr); + index.len() as u64 +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_location(ptr: *const SourmashLinearIndex) -> SourmashStr { + let index = SourmashLinearIndex::as_rust(ptr); + match index.location() { + Some(x) => x, + None => "".into(), + } + .into() +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_storage( + ptr: *const SourmashLinearIndex, +) -> *const SourmashZipStorage { + let index = SourmashLinearIndex::as_rust(ptr); + let storage = index.storage(); + + match storage { + Some(st) => SourmashZipStorage::from_rust(st), + None => std::ptr::null::(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn linearindex_signatures( + ptr: *const SourmashLinearIndex, +) -> *mut SourmashSignatureIter { + let index = SourmashLinearIndex::as_rust(ptr); + + let iter = Box::new(index.signatures_iter()); + SourmashSignatureIter::from_rust(SignatureIterator { iter }) +} + +ffi_fn! { +unsafe fn linearindex_select( + ptr: *mut SourmashLinearIndex, + selection_ptr: *const SourmashSelection, +) -> Result<*mut SourmashLinearIndex> { + let index = SourmashLinearIndex::into_rust(ptr); + let selection = SourmashSelection::as_rust(selection_ptr); + + let new_index = index.select(selection)?; + Ok(SourmashLinearIndex::from_rust(new_index)) +} +} diff --git a/src/core/src/ffi/manifest.rs b/src/core/src/ffi/manifest.rs new file mode 100644 index 0000000000..815f8d83f1 --- /dev/null +++ b/src/core/src/ffi/manifest.rs @@ -0,0 +1,73 @@ +use crate::manifest::{Manifest, Record}; + +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashManifest; + +impl ForeignObject for SourmashManifest { + type RustObject = Manifest; +} + +pub struct ManifestRowIterator { + iter: Box>, +} + +pub struct SourmashManifestRowIter; + +impl ForeignObject for SourmashManifestRowIter { + type RustObject = ManifestRowIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows_iter_next( + ptr: *mut SourmashManifestRowIter, +) -> *const SourmashManifestRow { + let iterator = SourmashManifestRowIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(row) => SourmashManifestRow::from_rust(row.into()), + None => std::ptr::null(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows( + ptr: *const SourmashManifest, +) -> *mut SourmashManifestRowIter { + let manifest = SourmashManifest::as_rust(ptr); + + let iter = Box::new(manifest.iter()); + SourmashManifestRowIter::from_rust(ManifestRowIterator { iter }) +} + +#[repr(C)] +pub struct SourmashManifestRow { + pub ksize: u32, + pub with_abundance: u8, + pub md5: SourmashStr, + pub internal_location: SourmashStr, + pub name: SourmashStr, + pub moltype: SourmashStr, +} + +impl ForeignObject for SourmashManifestRow { + type RustObject = SourmashManifestRow; +} + +impl From<&Record> for SourmashManifestRow { + fn from(record: &Record) -> SourmashManifestRow { + Self { + ksize: record.ksize(), + with_abundance: record.with_abundance() as u8, + md5: record.md5().into(), + name: record.name().into(), + moltype: record.moltype().to_string().into(), + internal_location: record + .internal_location() + .to_str() + .unwrap() + .to_owned() + .into(), + } + } +} diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 45890b81d9..3509c705ab 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -6,12 +6,33 @@ use crate::encodings::{aa_to_dayhoff, aa_to_hp, translate_codon, HashFunctions}; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::signature::SeqToHashes; use crate::signature::SigsTrait; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::hyperloglog::HyperLogLog; +use crate::sketch::minhash::{ + AbundMinHashOps, FracMinHashOps, KmerMinHash, KmerMinHashBTree, MinHashOps, +}; +use crate::sketch::Sketch; +use crate::Error; +use crate::HashIntoType; + +#[derive(Clone)] +pub enum MinHash { + Mutable(KmerMinHashBTree), + Frozen(KmerMinHash), +} pub struct SourmashKmerMinHash; +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_to_mutable( + ptr: *const SourmashKmerMinHash, +) -> *mut SourmashKmerMinHash { + let mh = SourmashKmerMinHash::as_rust(ptr); + + SourmashKmerMinHash::from_rust(mh.clone().to_mutable()) +} + impl ForeignObject for SourmashKmerMinHash { - type RustObject = KmerMinHash; + type RustObject = MinHash; } #[no_mangle] @@ -23,9 +44,9 @@ pub unsafe extern "C" fn kmerminhash_new( track_abundance: bool, n: u32, ) -> *mut SourmashKmerMinHash { - let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n); + let mh = KmerMinHashBTree::new(scaled, k, hash_function, seed, track_abundance, n); - SourmashKmerMinHash::from_rust(mh) + SourmashKmerMinHash::from_rust(MinHash::Mutable(mh)) } #[no_mangle] @@ -33,6 +54,15 @@ pub unsafe extern "C" fn kmerminhash_free(ptr: *mut SourmashKmerMinHash) { SourmashKmerMinHash::drop(ptr); } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_to_frozen( + ptr: *const SourmashKmerMinHash, +) -> *mut SourmashKmerMinHash { + let mh = SourmashKmerMinHash::as_rust(ptr); + + SourmashKmerMinHash::from_rust(mh.clone().to_frozen()) +} + #[no_mangle] pub unsafe extern "C" fn kmerminhash_slice_free(ptr: *mut u64, insize: usize) { // FIXME @@ -471,6 +501,7 @@ unsafe fn kmerminhash_similarity(ptr: *const SourmashKmerMinHash, other: *const mh.similarity(other_mh, ignore_abundance, downsample) } } + ffi_fn! { unsafe fn kmerminhash_angular_similarity(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash) -> Result { @@ -479,3 +510,372 @@ unsafe fn kmerminhash_angular_similarity(ptr: *const SourmashKmerMinHash, other: mh.angular_similarity(other_mh) } } + +impl MinHash { + pub fn to_mutable(self) -> MinHash { + match self { + MinHash::Mutable(mh) => MinHash::Mutable(mh), + MinHash::Frozen(mh) => MinHash::Mutable(mh.into()), + } + } + + pub fn to_frozen(self) -> MinHash { + match self { + MinHash::Mutable(mh) => MinHash::Frozen(mh.into()), + MinHash::Frozen(mh) => MinHash::Frozen(mh), + } + } + + pub fn num(&self) -> u32 { + match self { + MinHash::Mutable(mh) => mh.num(), + MinHash::Frozen(mh) => mh.num(), + } + } + + pub fn count_common(&self, other: &MinHash, downsample: bool) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.count_common(ot, downsample), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).count_common(ot, downsample) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.count_common(ot, downsample), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).count_common(ot, downsample) + } + }, + } + } + + pub fn merge(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.merge(ot), + MinHash::Frozen(ref ot) => mh.merge(&Into::::into(ot.clone())), + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.merge(ot), + MinHash::Mutable(ref ot) => mh.merge(&Into::::into(ot.clone())), + }, + } + } + + pub fn similarity( + &self, + other: &MinHash, + ignore_abundance: bool, + downsample: bool, + ) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.similarity(ot, ignore_abundance, downsample), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).similarity( + ot, + ignore_abundance, + downsample, + ), + }, + + MinHash::Frozen(ref mh) => { + match other { + MinHash::Frozen(ref ot) => mh.similarity(ot, ignore_abundance, downsample), + MinHash::Mutable(ref ot) => Into::::into(mh.clone()) + .similarity(ot, ignore_abundance, downsample), + } + } + } + } + + pub fn jaccard(&self, other: &MinHash) -> Result { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.jaccard(ot), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).jaccard(ot), + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.jaccard(ot), + MinHash::Mutable(ref ot) => Into::::into(mh.clone()).jaccard(ot), + }, + } + } + + pub fn intersection_size(&self, other: &MinHash) -> Result<(u64, u64), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.intersection_size(ot), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).intersection_size(ot) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.intersection_size(ot), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).intersection_size(ot) + } + }, + } + } + + pub fn intersection(&self, other: &MinHash) -> Result<(Vec, u64), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.intersection(ot), + MinHash::Frozen(ref ot) => Into::::into(mh.clone()).intersection(ot), + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.intersection(ot), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).intersection(ot) + } + }, + } + } + + pub fn add_from(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.add_from(ot), + MinHash::Frozen(ref ot) => mh.add_from(&Into::::into(ot.clone())), + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.add_from(ot), + MinHash::Mutable(ref ot) => mh.add_from(&Into::::into(ot.clone())), + }, + } + } + + pub fn remove_from(&mut self, other: &MinHash) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => match other { + MinHash::Mutable(ref ot) => mh.remove_from(ot), + MinHash::Frozen(ref ot) => { + mh.remove_from(&Into::::into(ot.clone())) + } + }, + + MinHash::Frozen(ref mut mh) => match other { + MinHash::Frozen(ref ot) => mh.remove_from(ot), + MinHash::Mutable(ref ot) => mh.remove_from(&Into::::into(ot.clone())), + }, + } + } +} + +impl From for Sketch { + fn from(mh: MinHash) -> Sketch { + match mh { + MinHash::Mutable(mh) => Sketch::LargeMinHash(mh), + MinHash::Frozen(mh) => Sketch::MinHash(mh), + } + } +} + +impl FracMinHashOps for MinHash { + fn max_hash(&self) -> HashIntoType { + match *self { + MinHash::Mutable(ref mh) => mh.max_hash(), + MinHash::Frozen(ref mh) => mh.max_hash(), + } + } + + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { + match *self { + MinHash::Mutable(ref mh) => Ok(MinHash::Mutable(mh.downsample_max_hash(max_hash)?)), + MinHash::Frozen(ref mh) => Ok(MinHash::Frozen(mh.downsample_max_hash(max_hash)?)), + } + } +} + +impl AbundMinHashOps for MinHash { + fn track_abundance(&self) -> bool { + match *self { + MinHash::Mutable(ref mh) => mh.track_abundance(), + MinHash::Frozen(ref mh) => mh.track_abundance(), + } + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.enable_abundance(), + MinHash::Frozen(ref mut mh) => mh.enable_abundance(), + } + } + + fn disable_abundance(&mut self) { + match *self { + MinHash::Mutable(ref mut mh) => mh.disable_abundance(), + MinHash::Frozen(ref mut mh) => mh.disable_abundance(), + } + } + + fn add_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64) { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_hash_with_abundance(hash, abundance), + MinHash::Frozen(ref mut mh) => mh.add_hash_with_abundance(hash, abundance), + } + } + + fn set_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64) { + match *self { + MinHash::Mutable(ref mut mh) => mh.set_hash_with_abundance(hash, abundance), + MinHash::Frozen(ref mut mh) => mh.set_hash_with_abundance(hash, abundance), + } + } + + fn abunds(&self) -> Option> { + match *self { + MinHash::Mutable(ref mh) => mh.abunds(), + MinHash::Frozen(ref mh) => mh.abunds(), + } + } + + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)> { + match *self { + MinHash::Mutable(ref mh) => mh.to_vec_abunds(), + MinHash::Frozen(ref mh) => mh.to_vec_abunds(), + } + } +} + +impl MinHashOps for MinHash { + fn clear(&mut self) { + match *self { + MinHash::Mutable(ref mut mh) => mh.clear(), + MinHash::Frozen(ref mut mh) => mh.clear(), + } + } + + fn is_empty(&self) -> bool { + match *self { + MinHash::Mutable(ref mh) => mh.is_empty(), + MinHash::Frozen(ref mh) => mh.is_empty(), + } + } + + fn reset_md5sum(&self) { + match *self { + MinHash::Mutable(ref mh) => mh.reset_md5sum(), + MinHash::Frozen(ref mh) => mh.reset_md5sum(), + } + } + + fn md5sum(&self) -> String { + match *self { + MinHash::Mutable(ref mh) => mh.md5sum(), + MinHash::Frozen(ref mh) => mh.md5sum(), + } + } + + fn mins(&self) -> Vec { + match *self { + MinHash::Mutable(ref mh) => mh.mins(), + MinHash::Frozen(ref mh) => mh.mins(), + } + } + + fn remove_hash(&mut self, hash: HashIntoType) { + match *self { + MinHash::Mutable(ref mut mh) => mh.remove_hash(hash), + MinHash::Frozen(ref mut mh) => mh.remove_hash(hash), + } + } + + fn as_hll(&self) -> HyperLogLog { + match *self { + MinHash::Mutable(ref mh) => mh.as_hll(), + MinHash::Frozen(ref mh) => mh.as_hll(), + } + } +} + +impl SigsTrait for MinHash { + fn size(&self) -> usize { + match *self { + MinHash::Mutable(ref mh) => mh.size(), + MinHash::Frozen(ref mh) => mh.size(), + } + } + + fn to_vec(&self) -> Vec { + match *self { + MinHash::Mutable(ref mh) => mh.to_vec(), + MinHash::Frozen(ref mh) => mh.to_vec(), + } + } + + fn ksize(&self) -> usize { + match *self { + MinHash::Mutable(ref mh) => mh.ksize(), + MinHash::Frozen(ref mh) => mh.ksize(), + } + } + + fn seed(&self) -> u64 { + match *self { + MinHash::Mutable(ref mh) => mh.seed(), + MinHash::Frozen(ref mh) => mh.seed(), + } + } + + fn hash_function(&self) -> HashFunctions { + match *self { + MinHash::Mutable(ref mh) => mh.hash_function(), + MinHash::Frozen(ref mh) => mh.hash_function(), + } + } + + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.set_hash_function(h), + MinHash::Frozen(ref mut mh) => mh.set_hash_function(h), + } + } + + fn add_hash(&mut self, hash: HashIntoType) { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_hash(hash), + MinHash::Frozen(ref mut mh) => mh.add_hash(hash), + } + } + + fn check_compatible(&self, other: &Self) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mh) => match other { + MinHash::Mutable(ref ot) => mh.check_compatible(ot), + MinHash::Frozen(ref ot) => { + Into::::into(mh.clone()).check_compatible(ot) + } + }, + + MinHash::Frozen(ref mh) => match other { + MinHash::Frozen(ref ot) => mh.check_compatible(ot), + MinHash::Mutable(ref ot) => { + Into::::into(mh.clone()).check_compatible(ot) + } + }, + } + } + + fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_sequence(seq, force), + MinHash::Frozen(ref mut mh) => mh.add_sequence(seq, force), + } + } + + fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> { + match *self { + MinHash::Mutable(ref mut mh) => mh.add_protein(seq), + MinHash::Frozen(ref mut mh) => mh.add_protein(seq), + } + } +} diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index a67de37176..44e856001f 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -9,8 +9,10 @@ pub mod utils; pub mod cmd; pub mod hyperloglog; pub mod index; +pub mod manifest; pub mod minhash; pub mod nodegraph; +pub mod picklist; pub mod signature; pub mod storage; diff --git a/src/core/src/ffi/nodegraph.rs b/src/core/src/ffi/nodegraph.rs index 2e0753b94d..46842d6513 100644 --- a/src/core/src/ffi/nodegraph.rs +++ b/src/core/src/ffi/nodegraph.rs @@ -5,7 +5,7 @@ use std::slice; use crate::prelude::*; use crate::sketch::nodegraph::Nodegraph; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::ForeignObject; pub struct SourmashNodegraph; @@ -134,7 +134,11 @@ pub unsafe extern "C" fn nodegraph_matches( ) -> usize { let ng = SourmashNodegraph::as_rust(ptr); let mh = SourmashKmerMinHash::as_rust(mh_ptr); - ng.matches(mh) + + match mh { + MinHash::Mutable(mh) => ng.matches(&mh.clone().into()), + MinHash::Frozen(mh) => ng.matches(mh), + } } #[no_mangle] @@ -157,7 +161,10 @@ pub unsafe extern "C" fn nodegraph_update_mh( let ng = SourmashNodegraph::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(optr); - mh.update(ng).unwrap(); + match mh { + MinHash::Mutable(mh) => mh.update(ng).unwrap(), + MinHash::Frozen(mh) => mh.update(ng).unwrap(), + } } ffi_fn! { diff --git a/src/core/src/ffi/picklist.rs b/src/core/src/ffi/picklist.rs new file mode 100644 index 0000000000..c7bea755ae --- /dev/null +++ b/src/core/src/ffi/picklist.rs @@ -0,0 +1,89 @@ +use std::os::raw::c_char; +use std::slice; + +use crate::picklist::{PickStyle, Picklist}; + +use crate::ffi::utils::ForeignObject; + +pub struct SourmashPicklist; + +impl ForeignObject for SourmashPicklist { + type RustObject = Picklist; +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_new() -> *mut SourmashPicklist { + SourmashPicklist::from_rust(Picklist::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_free(ptr: *mut SourmashPicklist) { + SourmashPicklist::drop(ptr); +} + +ffi_fn! { +unsafe fn picklist_set_coltype( + ptr: *mut SourmashPicklist, + coltype_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let coltype = { + assert!(!coltype_ptr.is_null()); + let coltype = slice::from_raw_parts(coltype_ptr as *mut u8, insize); + std::str::from_utf8(coltype)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_coltype(coltype.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickfile( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_pickfile(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_column_name( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_column_name(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickstyle( + ptr: *mut SourmashPicklist, + pickstyle: PickStyle, +) -> Result<()> { + let pl = SourmashPicklist::as_rust_mut(ptr); + + pl.set_pickstyle(pickstyle); + + Ok(()) +} +} diff --git a/src/core/src/ffi/signature.rs b/src/core/src/ffi/signature.rs index 825e091f4d..610a6d2e17 100644 --- a/src/core/src/ffi/signature.rs +++ b/src/core/src/ffi/signature.rs @@ -11,7 +11,7 @@ use crate::signature::Signature; use crate::sketch::Sketch; use crate::ffi::cmd::compute::SourmashComputeParameters; -use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::minhash::{MinHash, SourmashKmerMinHash}; use crate::ffi::utils::{ForeignObject, SourmashStr}; pub struct SourmashSignature; @@ -117,23 +117,37 @@ unsafe fn signature_set_filename(ptr: *mut SourmashSignature, name: *const c_cha } ffi_fn! { -unsafe fn signature_push_mh(ptr: *mut SourmashSignature, other: *const SourmashKmerMinHash) -> - Result<()> { +unsafe fn signature_set_mh( + ptr: *mut SourmashSignature, + other: *const SourmashKmerMinHash, +) -> Result<()> { let sig = SourmashSignature::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(other); - sig.push(Sketch::MinHash(mh.clone())); + sig.reset_sketches(); + // TODO(lirber): avoid clone here + sig.push(mh.clone().into()); Ok(()) } } ffi_fn! { -unsafe fn signature_set_mh(ptr: *mut SourmashSignature, other: *const SourmashKmerMinHash) -> - Result<()> { - let sig = SourmashSignature::as_rust_mut(ptr); - let mh = SourmashKmerMinHash::as_rust(other); - sig.reset_sketches(); - sig.push(Sketch::MinHash(mh.clone())); - Ok(()) +unsafe fn signature_first_mh(ptr: *const SourmashSignature) -> Result<*mut SourmashKmerMinHash> { + let sig = SourmashSignature::as_rust(ptr); + + match sig.signatures.get(0) { + Some(Sketch::LargeMinHash(mh)) => { + // TODO(lirber): avoid clone here + Ok(SourmashKmerMinHash::from_rust(MinHash::Mutable(mh.clone()))) + } + Some(Sketch::MinHash(mh)) => { + // TODO(lirber): avoid clone here + Ok(SourmashKmerMinHash::from_rust(MinHash::Frozen(mh.clone().into()))) + } + _ => { + // TODO: signatures is empty? + unimplemented!() + } + } } } @@ -163,24 +177,6 @@ unsafe fn signature_get_license(ptr: *const SourmashSignature) -> Result Result<*mut SourmashKmerMinHash> { - let sig = SourmashSignature::as_rust(ptr); - - match sig.signatures.get(0) { - Some(Sketch::MinHash(mh)) => { - Ok(SourmashKmerMinHash::from_rust(mh.clone())) - }, - Some(Sketch::LargeMinHash(mh_btree)) => { - Ok(SourmashKmerMinHash::from_rust(mh_btree.into())) - }, - _ => Err(SourmashError::Internal { - message: "found unsupported sketch type".to_string() - }), - } -} -} - ffi_fn! { unsafe fn signature_eq(ptr: *const SourmashSignature, other: *const SourmashSignature) -> Result { let sig = SourmashSignature::as_rust(ptr); @@ -199,25 +195,12 @@ unsafe fn signature_save_json(ptr: *const SourmashSignature) -> Result Result<*mut *mut SourmashKmerMinHash> { - let sig = SourmashSignature::as_rust(ptr); - - let output = sig.sketches(); - - // FIXME: how to fit this into the ForeignObject trait? - let ptr_sigs: Vec<*mut Signature> = output.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *mut Signature - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *mut *mut SourmashKmerMinHash) -} -} - -ffi_fn! { -unsafe fn signatures_save_buffer(ptr: *const *const SourmashSignature, size: usize, compression: u8, osize: *mut usize) -> Result<*const u8> { +unsafe fn signatures_save_buffer( + ptr: *const *const SourmashSignature, + size: usize, + compression: u8, + osize: *mut usize, +) -> Result<*const u8> { // FIXME: review this for ForeignObject let sigs = { @@ -225,30 +208,35 @@ unsafe fn signatures_save_buffer(ptr: *const *const SourmashSignature, size: usi slice::from_raw_parts(ptr, size) }; - let rsigs: Vec<&Signature> = sigs.iter().map(|x| SourmashSignature::as_rust(*x)).collect(); + let rsigs: Vec<&Signature> = sigs + .iter() + .map(|x| SourmashSignature::as_rust(*x)) + .collect(); let mut buffer = vec![]; { - let mut writer = if compression > 0 { - let level = match compression { - 1 => niffler::compression::Level::One, - 2 => niffler::compression::Level::Two, - 3 => niffler::compression::Level::Three, - 4 => niffler::compression::Level::Four, - 5 => niffler::compression::Level::Five, - 6 => niffler::compression::Level::Six, - 7 => niffler::compression::Level::Seven, - 8 => niffler::compression::Level::Eight, - _ => niffler::compression::Level::Nine, - }; - - niffler::get_writer(Box::new(&mut buffer), - niffler::compression::Format::Gzip, - level)? - } else { - Box::new(&mut buffer) - }; - serde_json::to_writer(&mut writer, &rsigs)?; + let mut writer = if compression > 0 { + let level = match compression { + 1 => niffler::compression::Level::One, + 2 => niffler::compression::Level::Two, + 3 => niffler::compression::Level::Three, + 4 => niffler::compression::Level::Four, + 5 => niffler::compression::Level::Five, + 6 => niffler::compression::Level::Six, + 7 => niffler::compression::Level::Seven, + 8 => niffler::compression::Level::Eight, + _ => niffler::compression::Level::Nine, + }; + + niffler::get_writer( + Box::new(&mut buffer), + niffler::compression::Format::Gzip, + level, + )? + } else { + Box::new(&mut buffer) + }; + serde_json::to_writer(&mut writer, &rsigs)?; } let b = buffer.into_boxed_slice(); diff --git a/src/core/src/ffi/storage.rs b/src/core/src/ffi/storage.rs index 86d3834201..882a8d5f20 100644 --- a/src/core/src/ffi/storage.rs +++ b/src/core/src/ffi/storage.rs @@ -1,5 +1,6 @@ use std::os::raw::c_char; use std::slice; +use std::sync::Arc; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::prelude::*; @@ -8,7 +9,7 @@ use crate::storage::ZipStorage; pub struct SourmashZipStorage; impl ForeignObject for SourmashZipStorage { - type RustObject = ZipStorage; + type RustObject = Arc; } ffi_fn! { @@ -20,7 +21,7 @@ unsafe fn zipstorage_new(ptr: *const c_char, insize: usize) -> Result<*mut Sourm }; let zipstorage = ZipStorage::from_file(path)?; - Ok(SourmashZipStorage::from_rust(zipstorage)) + Ok(SourmashZipStorage::from_rust(Arc::new(zipstorage))) } } @@ -110,7 +111,7 @@ unsafe fn zipstorage_set_subdir( std::str::from_utf8(path)? }; - storage.set_subdir(path.to_string()); + (*Arc::get_mut(storage).unwrap()).set_subdir(path.to_string()); Ok(()) } } diff --git a/src/core/src/from.rs b/src/core/src/from.rs index dfc384236e..95c5aa5fcd 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -2,6 +2,7 @@ use finch::sketch_schemes::mash::MashSketcher; use finch::sketch_schemes::SketchScheme; use crate::encodings::HashFunctions; +use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; /* diff --git a/src/core/src/index/bigsi.rs b/src/core/src/index/bigsi.rs deleted file mode 100644 index 0e45348fc7..0000000000 --- a/src/core/src/index/bigsi.rs +++ /dev/null @@ -1,218 +0,0 @@ -use std::collections::HashMap; -use std::path::Path; - -use fixedbitset::FixedBitSet; -use thiserror::Error; -use typed_builder::TypedBuilder; - -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::Error; -use crate::HashIntoType; - -#[derive(Clone, TypedBuilder)] -pub struct BIGSI { - matrix: Vec, - ksize: usize, - datasets: Vec, - //#[builder(setter(skip))] - //storage: Rc, -} - -#[derive(Debug, Error)] -pub enum BIGSIError { - #[error("BIGSI doesn't support this method")] - MethodDisabled, -} - -impl BIGSI { - pub fn new(bf_size: usize, ksize: usize) -> BIGSI { - let mut matrix = Vec::with_capacity(bf_size); - for _ in 0..bf_size { - // TODO: figure initial capacity for each row - matrix.push(FixedBitSet::with_capacity(100)); - } - - BIGSI { - matrix, - ksize, - datasets: Vec::new(), - } - } -} - -impl BIGSI { - pub fn add(&mut self, dataset: Signature) { - let mut ng = Nodegraph::new(&[self.matrix.len()], self.ksize); - - // TODO: select correct minhash - if let Sketch::MinHash(mh) = &dataset.signatures[0] { - for h in mh.mins() { - ng.count(h); - } - } else { - // TODO: what if it is not a mh? - unimplemented!() - } - - self.datasets.push(dataset); - let col = self.datasets.len() - 1; - - let bs = ng.into_bitsets(); - for pos in bs[0].ones() { - let bs = &mut self.matrix[pos]; - if bs.len() == col { - bs.grow(col + col / 2); - } - bs.insert(col); - } - } - - pub fn query(&self, hash: HashIntoType) -> impl Iterator + '_ { - let pos = hash as usize % self.matrix.len(); - let bs = &self.matrix[pos]; - bs.ones() - } - - pub fn query_datasets(&self, hash: HashIntoType) -> impl Iterator + '_ { - self.query(hash).map(move |pos| self.datasets[pos].clone()) - } -} - -impl<'a> Index<'a> for BIGSI { - type Item = Signature; - //type SignatureIterator = std::slice::Iter<'a, Self::Item>; - - fn search( - &self, - sig: &Self::Item, - threshold: f64, - containment: bool, - ) -> Result, Error> { - let mut results = Vec::new(); - - //TODO: still assuming one mh in the signature! - if let Sketch::MinHash(hashes) = &sig.signatures[0] { - let mut counter: HashMap = HashMap::with_capacity(hashes.size()); - - for hash in hashes.mins() { - self.query(hash).for_each(|dataset_idx| { - let idx = counter.entry(dataset_idx).or_insert(0); - *idx += 1; - }); - } - - for (idx, count) in counter { - let match_sig = &self.datasets[idx]; - //TODO: still assuming one mh in the signature! - let match_mh = match_sig.signatures[0].size(); - - let score = if containment { - count as f64 / hashes.size() as f64 - } else { - count as f64 / (hashes.size() + match_mh - count) as f64 - }; - - if score >= threshold { - results.push(match_sig) - } - } - - Ok(results) - } else { - // TODO: what if it is not a minhash? - unimplemented!() - } - } - - fn insert(&mut self, node: Self::Item) -> Result<(), Error> { - self.add(node); - Ok(()) - } - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - unimplemented!() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - unimplemented!() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.datasets.iter() - } - */ -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::BufReader; - use std::path::PathBuf; - - use super::BIGSI; - - use crate::index::SigStore; - use crate::index::{Index, MHBT}; - use crate::signature::Signature; - - #[test] - fn bigsi_sbt_oracle() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut bigsi = BIGSI::new(10000, 10); - let datasets = sbt.signatures(); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - for l in datasets { - bigsi.insert(l).expect("insertion error!"); - } - - let results_sbt = sbt.search(&leaf, 0.5, false).unwrap(); - assert_eq!(results_sbt.len(), 1); - - let data = leaf.data.get().unwrap(); - let results_bigsi = bigsi.search(data, 0.5, false).unwrap(); - assert_eq!(results_bigsi.len(), 1); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - - let results_sbt = sbt.search(&leaf, 0.1, false).unwrap(); - assert_eq!(results_sbt.len(), 2); - - let data = leaf.data.get().unwrap(); - let results_bigsi = bigsi.search(data, 0.1, false).unwrap(); - assert_eq!(results_bigsi.len(), 2); - - assert_eq!(results_sbt.len(), results_bigsi.len()); - } -} diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 78b2c6f1f5..20656a62e4 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -12,12 +12,12 @@ use crate::storage::{FSStorage, InnerStorage, Storage, StorageInfo}; use crate::Error; #[derive(TypedBuilder)] -pub struct LinearIndex { +pub struct LinearIndex { #[builder(default)] storage: Option, #[builder(default)] - datasets: Vec>, + datasets: Vec, } #[derive(Serialize, Deserialize)] @@ -27,15 +27,11 @@ struct LinearInfo { leaves: Vec, } -impl<'a, L> Index<'a> for LinearIndex -where - L: Clone + Comparable + 'a, - SigStore: From, -{ - type Item = L; +impl<'a> Index<'a> for LinearIndex { + type Item = Signature; //type SignatureIterator = std::slice::Iter<'a, Self::Item>; - fn insert(&mut self, node: L) -> Result<(), Error> { + fn insert(&mut self, node: Self::Item) -> Result<(), Error> { self.datasets.push(node.into()); Ok(()) } @@ -76,11 +72,7 @@ where */ } -impl LinearIndex -where - L: ToWriter, - SigStore: ReadData, -{ +impl LinearIndex { pub fn save_file>( &mut self, path: P, @@ -115,7 +107,7 @@ where .iter_mut() .map(|l| { // Trigger data loading - let _: &L = (*l).data().unwrap(); + let _: &Signature = (*l).data().unwrap(); // set storage to new one l.storage = Some(storage.clone()); @@ -137,7 +129,7 @@ where Ok(()) } - pub fn from_path>(path: P) -> Result, Error> { + pub fn from_path>(path: P) -> Result { let file = File::open(&path)?; let mut reader = BufReader::new(file); @@ -147,11 +139,11 @@ where basepath.push(path); basepath.canonicalize()?; - let linear = LinearIndex::::from_reader(&mut reader, basepath.parent().unwrap())?; + let linear = LinearIndex::from_reader(&mut reader, basepath.parent().unwrap())?; Ok(linear) } - pub fn from_reader(rdr: R, path: P) -> Result, Error> + pub fn from_reader(rdr: R, path: P) -> Result where R: Read, P: AsRef, @@ -171,7 +163,7 @@ where .leaves .into_iter() .map(|l| { - let mut v: SigStore = l.into(); + let mut v: SigStore = l.into(); v.storage = Some(storage.clone()); v }) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 4e43074ebe..a902af62e5 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -3,10 +3,8 @@ //! An index organizes signatures to allow for fast similarity search. //! Some indices also support containment searches. -pub mod bigsi; pub mod linear; pub mod revindex; -pub mod sbt; pub mod search; @@ -17,27 +15,84 @@ use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; +use crate::encodings::HashFunctions; use crate::errors::ReadDataError; -use crate::index::sbt::{Node, SBT}; use crate::index::search::{search_minhashes, search_minhashes_containment}; +use crate::picklist::Picklist; use crate::prelude::*; use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; use crate::sketch::Sketch; use crate::storage::{InnerStorage, Storage}; use crate::Error; -pub type MHBT = SBT, Signature>; +#[derive(Default)] +pub struct Selection { + ksize: Option, + abund: Option, + num: Option, + scaled: Option, + containment: Option, + moltype: Option, + picklist: Option, +} + +impl Selection { + pub fn ksize(&self) -> Option { + self.ksize + } + + pub fn set_ksize(&mut self, ksize: u32) { + self.ksize = Some(ksize); + } + + pub fn abund(&self) -> Option { + self.abund + } + + pub fn set_abund(&mut self, value: bool) { + self.abund = Some(value); + } -/* FIXME: bring back after MQF works on macOS and Windows -use cfg_if::cfg_if; -cfg_if! { - if #[cfg(not(target_arch = "wasm32"))] { - use mqf::MQF; - pub type MHMT = SBT, Signature>; + pub fn num(&self) -> Option { + self.num + } + + pub fn set_num(&mut self, num: u32) { + self.num = Some(num); + } + + pub fn scaled(&self) -> Option { + self.scaled + } + + pub fn set_scaled(&mut self, scaled: u32) { + self.scaled = Some(scaled); + } + + pub fn containment(&self) -> Option { + self.containment + } + + pub fn set_containment(&mut self, containment: bool) { + self.containment = Some(containment); + } + + pub fn moltype(&self) -> Option { + self.moltype + } + + pub fn set_moltype(&mut self, value: HashFunctions) { + self.moltype = Some(value); + } + + pub fn picklist(&self) -> Option { + self.picklist.clone() + } + + pub fn set_picklist(&mut self, value: Picklist) { + self.picklist = Some(value); } } -*/ pub trait Index<'a> { type Item: Comparable; @@ -132,7 +187,7 @@ pub struct DatasetInfo { } #[derive(TypedBuilder, Default, Clone)] -pub struct SigStore { +pub struct SigStore { #[builder(setter(into))] filename: String, @@ -145,16 +200,16 @@ pub struct SigStore { storage: Option, #[builder(setter(into), default)] - data: OnceCell, + data: OnceCell, } -impl SigStore { +impl SigStore { pub fn name(&self) -> String { self.name.clone() } } -impl std::fmt::Debug for SigStore { +impl std::fmt::Debug for SigStore { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, @@ -164,7 +219,7 @@ impl std::fmt::Debug for SigStore { } } -impl ReadData for SigStore { +impl ReadData for SigStore { fn data(&self) -> Result<&Signature, Error> { if let Some(sig) = self.data.get() { Ok(sig) @@ -188,8 +243,25 @@ impl ReadData for SigStore { } } -impl SigStore { - pub fn count_common(&self, other: &SigStore) -> u64 { +impl SigStore { + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) + } else { + unimplemented!() + } + } else { + unimplemented!() + } + } +} + +impl SigStore { + pub fn count_common(&self, other: &SigStore) -> u64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -216,13 +288,13 @@ impl SigStore { } } -impl From> for Signature { - fn from(other: SigStore) -> Signature { +impl From for Signature { + fn from(other: SigStore) -> Signature { other.data.get().unwrap().to_owned() } } -impl Deref for SigStore { +impl Deref for SigStore { type Target = Signature; fn deref(&self) -> &Signature { @@ -230,8 +302,8 @@ impl Deref for SigStore { } } -impl From for SigStore { - fn from(other: Signature) -> SigStore { +impl From for SigStore { + fn from(other: Signature) -> SigStore { let name = other.name(); let filename = other.filename(); @@ -245,8 +317,8 @@ impl From for SigStore { } } -impl Comparable> for SigStore { - fn similarity(&self, other: &SigStore) -> f64 { +impl Comparable for SigStore { + fn similarity(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -269,7 +341,7 @@ impl Comparable> for SigStore { unimplemented!() } - fn containment(&self, other: &SigStore) -> f64 { + fn containment(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); let ong: &Signature = other.data().unwrap(); @@ -321,8 +393,8 @@ impl Comparable for Signature { } } -impl From for SigStore { - fn from(other: DatasetInfo) -> SigStore { +impl From for SigStore { + fn from(other: DatasetInfo) -> SigStore { SigStore { filename: other.filename, name: other.name, diff --git a/src/core/src/index/revindex.rs b/src/core/src/index/revindex.rs deleted file mode 100644 index 0a1fc25d18..0000000000 --- a/src/core/src/index/revindex.rs +++ /dev/null @@ -1,699 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicUsize, Ordering}; - -use getset::{CopyGetters, Getters, Setters}; -use log::{debug, info}; -use nohash_hasher::BuildNoHashHasher; -use serde::{Deserialize, Serialize}; - -#[cfg(feature = "parallel")] -use rayon::prelude::*; - -use crate::encodings::{Color, Colors, Idx}; -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; -use crate::sketch::Sketch; -use crate::Error; -use crate::HashIntoType; - -type SigCounter = counter::Counter; - -#[derive(Serialize, Deserialize)] -struct HashToColor(HashMap>); - -impl HashToColor { - fn new() -> Self { - HashToColor(HashMap::< - HashIntoType, - Color, - BuildNoHashHasher, - >::with_hasher(BuildNoHashHasher::default())) - } - - fn get(&self, hash: &HashIntoType) -> Option<&Color> { - self.0.get(hash) - } - - fn retain(&mut self, hashes: &HashSet) { - self.0.retain(|hash, _| hashes.contains(hash)) - } - - fn len(&self) -> usize { - self.0.len() - } - - fn is_empty(&self) -> bool { - self.0.is_empty() - } - - fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { - let mut color = None; - - matched_hashes.into_iter().for_each(|hash| { - color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); - self.0.insert(hash, color.unwrap()); - }); - } - - fn reduce_hashes_colors( - a: (HashToColor, Colors), - b: (HashToColor, Colors), - ) -> (HashToColor, Colors) { - let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = - if a.0.len() > b.0.len() { - (b, a) - } else { - (a, b) - }; - - small_hashes.0.into_iter().for_each(|(hash, color)| { - large_hashes - .0 - .entry(hash) - .and_modify(|entry| { - // Hash is already present. - // Update the current color by adding the indices from - // small_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(Some(*entry), ids).unwrap(); - *entry = new_color; - }) - .or_insert_with(|| { - // In this case, the hash was not present yet. - // we need to create the same color from small_colors - // into large_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(None, ids).unwrap(); - assert_eq!(new_color, color); - new_color - }); - }); - - (large_hashes, large_colors) - } -} - -// Use rkyv for serialization? -// https://davidkoloski.me/rkyv/ -#[derive(Serialize, Deserialize)] -pub struct RevIndex { - hash_to_color: HashToColor, - - sig_files: Vec, - - #[serde(skip)] - ref_sigs: Option>, - - template: Sketch, - colors: Colors, - //#[serde(skip)] - //storage: Option, -} - -impl RevIndex { - pub fn load>( - index_path: P, - queries: Option<&[KmerMinHash]>, - ) -> Result> { - let (rdr, _) = niffler::from_path(index_path)?; - let revindex = if let Some(qs) = queries { - // TODO: avoid loading full revindex if query != None - /* - struct PartialRevIndex { - hashes_to_keep: Option>, - marker: PhantomData T>, - } - - impl PartialRevIndex { - pub fn new(hashes_to_keep: HashSet) -> Self { - PartialRevIndex { - hashes_to_keep: Some(hashes_to_keep), - marker: PhantomData, - } - } - } - */ - - let mut hashes: HashSet = HashSet::new(); - for q in qs { - hashes.extend(q.iter_mins()); - } - - //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); - - let mut revindex: RevIndex = serde_json::from_reader(rdr)?; - revindex.hash_to_color.retain(&hashes); - revindex - } else { - // Load the full revindex - serde_json::from_reader(rdr)? - }; - - Ok(revindex) - } - - pub fn new( - search_sigs: &[PathBuf], - template: &Sketch, - threshold: usize, - queries: Option<&[KmerMinHash]>, - keep_sigs: bool, - ) -> RevIndex { - // If threshold is zero, let's merge all queries and save time later - let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sig_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sig_iter = search_sigs.iter(); - - let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } - - let search_sig = Signature::from_path(filename) - .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) - .swap_remove(0); - - RevIndex::map_hashes_colors( - dataset_id, - &search_sig, - queries, - &merged_query, - threshold, - template, - ) - }); - - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - // TODO: build this together with hash_to_idx? - let ref_sigs = if keep_sigs { - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - Some( - sigs_iter - .map(|ref_path| { - Signature::from_path(ref_path) - .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path)) - .swap_remove(0) - }) - .collect(), - ) - } else { - None - }; - - RevIndex { - hash_to_color, - sig_files: search_sigs.into(), - ref_sigs, - template: template.clone(), - colors, - // storage: Some(InnerStorage::new(MemStorage::default())), - } - } - - fn merge_queries(qs: &[KmerMinHash], threshold: usize) -> Option { - if threshold == 0 { - let mut merged = qs[0].clone(); - for query in &qs[1..] { - merged.merge(query).unwrap(); - } - Some(merged) - } else { - None - } - } - - pub fn new_with_sigs( - search_sigs: Vec, - template: &Sketch, - threshold: usize, - queries: Option<&[KmerMinHash]>, - ) -> RevIndex { - // If threshold is zero, let's merge all queries and save time later - let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - let filtered_sigs = sigs_iter.enumerate().filter_map(|(dataset_id, sig)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } - - RevIndex::map_hashes_colors( - dataset_id, - sig, - queries, - &merged_query, - threshold, - template, - ) - }); - - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - RevIndex { - hash_to_color, - sig_files: vec![], - ref_sigs: search_sigs.into(), - template: template.clone(), - colors, - //storage: None, - } - } - - fn map_hashes_colors( - dataset_id: usize, - search_sig: &Signature, - queries: Option<&[KmerMinHash]>, - merged_query: &Option, - threshold: usize, - template: &Sketch, - ) -> Option<(HashToColor, Colors)> { - let mut search_mh = None; - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - search_mh = Some(mh); - } - - let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); - let mut hash_to_color = HashToColor::new(); - let mut colors = Colors::default(); - - if let Some(qs) = queries { - if let Some(ref merged) = merged_query { - let (matched_hashes, intersection) = merged.intersection(search_mh).unwrap(); - if !matched_hashes.is_empty() || intersection > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); - } - } else { - for query in qs { - let (matched_hashes, intersection) = query.intersection(search_mh).unwrap(); - if !matched_hashes.is_empty() || intersection > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); - } - } - } - } else { - let matched = search_mh.mins(); - let size = matched.len() as u64; - if !matched.is_empty() || size > threshold as u64 { - hash_to_color.add_to(&mut colors, dataset_id, matched); - } - }; - - if hash_to_color.is_empty() { - None - } else { - Some((hash_to_color, colors)) - } - } - - pub fn search( - &self, - counter: SigCounter, - similarity: bool, - threshold: usize, - ) -> Result, Box> { - let mut matches = vec![]; - if similarity { - unimplemented!("TODO: threshold correction") - } - - for (dataset_id, size) in counter.most_common() { - if size >= threshold { - matches.push(self.sig_files[dataset_id as usize].to_str().unwrap().into()); - } else { - break; - }; - } - Ok(matches) - } - - pub fn gather( - &self, - mut counter: SigCounter, - threshold: usize, - query: &KmerMinHash, - ) -> Result, Box> { - let mut match_size = usize::max_value(); - let mut matches = vec![]; - - while match_size > threshold && !counter.is_empty() { - let (dataset_id, size) = counter.most_common()[0]; - match_size = if size >= threshold { size } else { break }; - - let p; - let match_path = if self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? - &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; - - let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { - match_mh = Some(mh); - } - let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); - - // Calculate stats - let f_orig_query = match_size as f64 / query.size() as f64; - let f_match = match_size as f64 / match_mh.size() as f64; - let filename = match_path.to_str().unwrap().into(); - let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; - let gather_result_rank = matches.len(); - - let (intersect_orig, _) = match_mh.intersection_size(query)?; - let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - - let f_unique_to_query = intersect_orig as f64 / query.size() as f64; - let match_ = match_sig.clone(); - - // TODO: all of these - let f_unique_weighted = 0.; - let average_abund = 0; - let median_abund = 0; - let std_abund = 0; - let md5 = "".into(); - let f_match_orig = 0.; - let remaining_bp = 0; - - let result = GatherResult { - intersect_bp, - f_orig_query, - f_match, - f_unique_to_query, - f_unique_weighted, - average_abund, - median_abund, - std_abund, - filename, - name, - md5, - match_, - f_match_orig, - unique_intersect_bp, - gather_result_rank, - remaining_bp, - }; - matches.push(result); - - // Prepare counter for finding the next match by decrementing - // all hashes found in the current match in other datasets - for hash in match_mh.iter_mins() { - if let Some(color) = self.hash_to_color.get(hash) { - for dataset in self.colors.indices(color) { - counter.entry(*dataset).and_modify(|e| { - if *e > 0 { - *e -= 1 - } - }); - } - } - } - counter.remove(&dataset_id); - } - Ok(matches) - } - - pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { - query - .iter_mins() - .filter_map(|hash| self.hash_to_color.get(hash)) - .flat_map(|color| self.colors.indices(color)) - .cloned() - .collect() - } - - pub fn template(&self) -> Sketch { - self.template.clone() - } - - // TODO: mh should be a sketch, or even a sig... - pub(crate) fn find_signatures( - &self, - mh: &KmerMinHash, - threshold: f64, - containment: bool, - _ignore_scaled: bool, - ) -> Result, Error> { - /* - let template_mh = None; - if let Sketch::MinHash(mh) = self.template { - template_mh = Some(mh); - }; - // TODO: throw error - let template_mh = template_mh.unwrap(); - - let tmp_mh; - let mh = if template_mh.scaled() > mh.scaled() { - // TODO: proper error here - tmp_mh = mh.downsample_scaled(self.scaled)?; - &tmp_mh - } else { - mh - }; - - if self.scaled < mh.scaled() && !ignore_scaled { - return Err(LcaDBError::ScaledMismatchError { - db: self.scaled, - query: mh.scaled(), - } - .into()); - } - */ - - // TODO: proper threshold calculation - let threshold: usize = (threshold * (mh.size() as f64)) as _; - - let counter = self.counter_for_query(mh); - - debug!( - "number of matching signatures for hashes: {}", - counter.len() - ); - - let mut results = vec![]; - for (dataset_id, size) in counter.most_common() { - let match_size = if size >= threshold { size } else { break }; - - let p; - let match_path = if self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? - &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; - - let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { - match_mh = Some(mh); - } - let match_mh = match_mh.unwrap(); - - if size >= threshold { - let score = if containment { - size as f64 / mh.size() as f64 - } else { - size as f64 / (mh.size() + match_size - size) as f64 - }; - let filename = match_path.to_str().unwrap().into(); - let mut sig = match_sig.clone(); - sig.reset_sketches(); - sig.push(Sketch::MinHash(match_mh.clone())); - results.push((score, sig, filename)); - } else { - break; - }; - } - Ok(results) - } -} - -#[derive(CopyGetters, Getters, Setters, Serialize, Deserialize, Debug)] -pub struct GatherResult { - #[getset(get_copy = "pub")] - intersect_bp: usize, - - #[getset(get_copy = "pub")] - f_orig_query: f64, - - #[getset(get_copy = "pub")] - f_match: f64, - - f_unique_to_query: f64, - f_unique_weighted: f64, - average_abund: usize, - median_abund: usize, - std_abund: usize, - - #[getset(get = "pub")] - filename: String, - - #[getset(get = "pub")] - name: String, - - md5: String, - match_: Signature, - f_match_orig: f64, - unique_intersect_bp: usize, - gather_result_rank: usize, - remaining_bp: usize, -} - -impl GatherResult { - pub fn get_match(&self) -> Signature { - self.match_.clone() - } -} - -impl<'a> Index<'a> for RevIndex { - type Item = Signature; - - fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { - unimplemented!() - } - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!() - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn len(&self) -> usize { - if let Some(refs) = &self.ref_sigs { - refs.len() - } else { - self.sig_files.len() - } - } - - fn signatures(&self) -> Vec { - if let Some(ref sigs) = self.ref_sigs { - sigs.to_vec() - } else { - unimplemented!() - } - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - unimplemented!() - } -} - -#[cfg(test)] -mod test { - use super::*; - - use crate::sketch::minhash::max_hash_for_scaled; - - #[test] - fn revindex_new() { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); - let search_sigs = [ - "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), - ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false); - assert_eq!(index.colors.len(), 3); - } - - #[test] - fn revindex_many() { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); - let search_sigs = [ - "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), - "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), - ]; - - let index = RevIndex::new(&search_sigs, &template, 0, None, false); - /* - dbg!(&index.colors.colors); - 0: 86 - 1: 132 - 2: 91 - (0, 1): 53 - (0, 2): 90 - (1, 2): 26 - (0, 1, 2): 261 - union: 739 - */ - //assert_eq!(index.colors.len(), 3); - assert_eq!(index.colors.len(), 7); - } -} diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs new file mode 100644 index 0000000000..7aeea27c2e --- /dev/null +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -0,0 +1,1118 @@ +use std::collections::{HashMap, HashSet}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use getset::{CopyGetters, Getters, Setters}; +use log::{debug, info}; +use nohash_hasher::BuildNoHashHasher; +use serde::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +use crate::encodings::{Color, Colors, Idx}; +use crate::index::{Index, Selection, SigStore}; +use crate::manifest::Manifest; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{KmerMinHash, MinHashOps}; +use crate::sketch::Sketch; +use crate::storage::{Storage, ZipStorage}; +use crate::Error; +use crate::HashIntoType; + +type SigCounter = counter::Counter; + +#[derive(Serialize, Deserialize)] +struct HashToColor(HashMap>); + +impl HashToColor { + fn new() -> Self { + HashToColor(HashMap::< + HashIntoType, + Color, + BuildNoHashHasher, + >::with_hasher(BuildNoHashHasher::default())) + } + + fn get(&self, hash: &HashIntoType) -> Option<&Color> { + self.0.get(hash) + } + + fn retain(&mut self, hashes: &HashSet) { + self.0.retain(|hash, _| hashes.contains(hash)) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } + + fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { + let mut color = None; + + matched_hashes.into_iter().for_each(|hash| { + color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); + self.0.insert(hash, color.unwrap()); + }); + } + + fn reduce_hashes_colors( + a: (HashToColor, Colors), + b: (HashToColor, Colors), + ) -> (HashToColor, Colors) { + let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = + if a.0.len() > b.0.len() { + (b, a) + } else { + (a, b) + }; + + small_hashes.0.into_iter().for_each(|(hash, color)| { + large_hashes + .0 + .entry(hash) + .and_modify(|entry| { + // Hash is already present. + // Update the current color by adding the indices from + // small_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(Some(*entry), ids).unwrap(); + *entry = new_color; + }) + .or_insert_with(|| { + // In this case, the hash was not present yet. + // we need to create the same color from small_colors + // into large_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(None, ids).unwrap(); + assert_eq!(new_color, color); + new_color + }); + }); + + (large_hashes, large_colors) + } +} + +// Use rkyv for serialization? +// https://davidkoloski.me/rkyv/ +#[derive(Serialize, Deserialize)] +pub struct RevIndex { + linear: LinearRevIndex, + hash_to_color: HashToColor, + colors: Colors, +} + +#[derive(Serialize, Deserialize)] +pub struct LinearRevIndex { + sig_files: Manifest, + + #[serde(skip)] + ref_sigs: Option>, + + template: Sketch, + + #[serde(skip)] + storage: Option>, +} + +impl LinearRevIndex { + pub fn new( + sig_files: Option, + template: &Sketch, + keep_sigs: bool, + ref_sigs: Option>, + storage: Option, + ) -> Self { + if ref_sigs.is_none() && sig_files.is_none() { + todo!("throw error, one need to be set"); + } + + let ref_sigs = if let Some(ref_sigs) = ref_sigs { + Some(ref_sigs.into_iter().map(|m| m.into()).collect()) + } else if keep_sigs { + let search_sigs: Vec<_> = sig_files + .as_ref() + .unwrap() + .internal_locations() + .map(PathBuf::from) + .collect(); + + #[cfg(feature = "parallel")] + let sigs_iter = search_sigs.par_iter(); + + #[cfg(not(feature = "parallel"))] + let sigs_iter = search_sigs.iter(); + + Some( + sigs_iter + .map(|ref_path| { + if let Some(storage) = &storage { + let sig_data = storage + .load(ref_path.to_str().unwrap_or_else(|| { + panic!("error converting path {:?}", ref_path) + })) + .unwrap_or_else(|_| panic!("error loading {:?}", ref_path)); + Signature::from_reader(sig_data.as_slice()) + .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path)) + .swap_remove(0) + .into() + } else { + Signature::from_path(&ref_path) + .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path)) + .swap_remove(0) + .into() + } + }) + .collect(), + ) + } else { + None + }; + + let storage = storage.map(Arc::new); + + let sig_files = sig_files.unwrap_or_else(|| { + todo!("generate manifest for ref_sigs"); + }); + + LinearRevIndex { + sig_files, + template: template.clone(), + ref_sigs, + storage, + } + } + + fn index( + self, + threshold: usize, + merged_query: Option, + queries: Option<&[KmerMinHash]>, + ) -> RevIndex { + let processed_sigs = AtomicUsize::new(0); + + let search_sigs: Vec<_> = self + .sig_files + .internal_locations() + .map(PathBuf::from) + .collect(); + + #[cfg(feature = "parallel")] + let sig_iter = search_sigs.par_iter(); + + #[cfg(not(feature = "parallel"))] + let sig_iter = search_sigs.iter(); + + let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + let search_sig = if let Some(storage) = &self.storage { + let sig_data = storage + .load( + filename + .to_str() + .unwrap_or_else(|| panic!("error converting path {:?}", filename)), + ) + .unwrap_or_else(|_| panic!("error loading {:?}", filename)); + + Signature::from_reader(sig_data.as_slice()) + } else { + Signature::from_path(&filename) + } + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + RevIndex::map_hashes_colors( + dataset_id, + &search_sig, + queries, + &merged_query, + threshold, + &self.template, + ) + }); + + #[cfg(feature = "parallel")] + let (hash_to_color, colors) = filtered_sigs.reduce( + || (HashToColor::new(), Colors::default()), + HashToColor::reduce_hashes_colors, + ); + + #[cfg(not(feature = "parallel"))] + let (hash_to_color, colors) = filtered_sigs.fold( + (HashToColor::new(), Colors::default()), + HashToColor::reduce_hashes_colors, + ); + + RevIndex { + hash_to_color, + colors, + linear: self, + } + } + + pub fn location(&self) -> Option { + if let Some(storage) = &self.storage { + storage.path() + } else { + None + } + } + + pub fn storage(&self) -> Option> { + self.storage.clone() + } + + pub fn select(mut self, selection: &Selection) -> Result { + let manifest = self.sig_files.select_to_manifest(selection)?; + self.sig_files = manifest; + + Ok(self) + /* + # if we have a manifest, run 'select' on the manifest. + manifest = self.manifest + traverse_yield_all = self.traverse_yield_all + + if manifest is not None: + manifest = manifest.select_to_manifest(**kwargs) + return ZipFileLinearIndex(self.storage, + selection_dict=None, + traverse_yield_all=traverse_yield_all, + manifest=manifest, + use_manifest=True) + else: + # no manifest? just pass along all the selection kwargs to + # the new ZipFileLinearIndex. + + assert manifest is None + if self.selection_dict: + # combine selects... + d = dict(self.selection_dict) + for k, v in kwargs.items(): + if k in d: + if d[k] is not None and d[k] != v: + raise ValueError(f"incompatible select on '{k}'") + d[k] = v + kwargs = d + + return ZipFileLinearIndex(self.storage, + selection_dict=kwargs, + traverse_yield_all=traverse_yield_all, + manifest=None, + use_manifest=False) + */ + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + let processed_sigs = AtomicUsize::new(0); + + // TODO: Some(ref_sigs) case + + let search_sigs: Vec<_> = self + .sig_files + .internal_locations() + .map(PathBuf::from) + .collect(); + + #[cfg(feature = "parallel")] + let sig_iter = search_sigs.par_iter(); + + #[cfg(not(feature = "parallel"))] + let sig_iter = search_sigs.iter(); + + let counters = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + let search_sig = if let Some(storage) = &self.storage { + let sig_data = storage + .load( + filename + .to_str() + .unwrap_or_else(|| panic!("error converting path {:?}", filename)), + ) + .unwrap_or_else(|_| panic!("error loading {:?}", filename)); + + Signature::from_reader(sig_data.as_slice()) + } else { + Signature::from_path(&filename) + } + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(&self.template) { + search_mh = Some(mh); + }; + let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); + + let (large_mh, small_mh) = if query.size() > search_mh.size() { + (query, search_mh) + } else { + (search_mh, query) + }; + + let (size, _) = small_mh + .intersection_size(large_mh) + .unwrap_or_else(|_| panic!("error computing intersection for {:?}", filename)); + + if size == 0 { + None + } else { + let mut counter: SigCounter = Default::default(); + counter[&(dataset_id as u64)] += size as usize; + Some(counter) + } + }); + + let reduce_counters = |mut a: SigCounter, b: SigCounter| { + a.extend(&b); + a + }; + + #[cfg(feature = "parallel")] + let counter = counters.reduce(|| SigCounter::new(), reduce_counters); + + #[cfg(not(feature = "parallel"))] + let counter = counters.fold(SigCounter::new(), reduce_counters); + + counter + } + + pub fn search( + &self, + counter: SigCounter, + similarity: bool, + threshold: usize, + ) -> Result, Box> { + let mut matches = vec![]; + if similarity { + unimplemented!("TODO: threshold correction") + } + + for (dataset_id, size) in counter.most_common() { + if size >= threshold { + matches.push( + self.sig_files[dataset_id as usize] + .internal_location() + .to_str() + .unwrap() + .into(), + ); + } else { + break; + }; + } + Ok(matches) + } + + fn gather_round( + &self, + dataset_id: u64, + match_size: usize, + query: &KmerMinHash, + round: usize, + ) -> Result { + let match_path = if self.sig_files.is_empty() { + PathBuf::new() + } else { + self.sig_files[dataset_id as usize].internal_location() + }; + let match_sig = self.sig_for_dataset(dataset_id as usize)?; + let result = self.stats_for_match(&match_sig, query, match_size, match_path, round)?; + Ok(result) + } + + fn sig_for_dataset(&self, dataset_id: usize) -> Result { + let match_path = if self.sig_files.is_empty() { + PathBuf::new() + } else { + self.sig_files[dataset_id as usize].internal_location() + }; + + let match_sig = if let Some(refsigs) = &self.ref_sigs { + refsigs[dataset_id as usize].clone() + } else { + let mut sig = if let Some(storage) = &self.storage { + let sig_data = storage + .load( + match_path + .to_str() + .unwrap_or_else(|| panic!("error converting path {:?}", match_path)), + ) + .unwrap_or_else(|_| panic!("error loading {:?}", match_path)); + Signature::from_reader(sig_data.as_slice())? + } else { + Signature::from_path(&match_path)? + }; + // TODO: remove swap_remove + sig.swap_remove(0).into() + }; + Ok(match_sig) + } + + fn stats_for_match( + &self, + match_sig: &Signature, + query: &KmerMinHash, + match_size: usize, + match_path: PathBuf, + gather_result_rank: usize, + ) -> Result { + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let filename = match_path.to_str().unwrap().into(); + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize * match_size; + + let (intersect_orig, _) = match_mh.intersection_size(query)?; + let intersect_bp = (match_mh.scaled() as u64 * intersect_orig) as usize; + + let f_unique_to_query = intersect_orig as f64 / query.size() as f64; + let match_ = match_sig.clone(); + + // TODO: all of these + let f_unique_weighted = 0.; + let average_abund = 0; + let median_abund = 0; + let std_abund = 0; + let md5 = "".into(); + let f_match_orig = 0.; + let remaining_bp = 0; + + Ok(GatherResult { + intersect_bp, + f_orig_query, + f_match, + f_unique_to_query, + f_unique_weighted, + average_abund, + median_abund, + std_abund, + filename, + name, + md5, + match_, + f_match_orig, + unique_intersect_bp, + gather_result_rank, + remaining_bp, + }) + } + + pub fn gather( + &self, + mut counter: SigCounter, + threshold: usize, + query: &KmerMinHash, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + + while match_size > threshold && !counter.is_empty() { + let (dataset_id, size) = counter.most_common()[0]; + if threshold == 0 && size == 0 { + break; + } + + match_size = if size >= threshold { + size + } else { + break; + }; + + let result = self.gather_round(dataset_id, match_size, query, matches.len())?; + + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: maybe par_iter? + let mut to_remove: HashSet = Default::default(); + to_remove.insert(dataset_id); + + for (dataset, value) in counter.iter_mut() { + let dataset_sig = self.sig_for_dataset(*dataset as usize)?; + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = dataset_sig.select_sketch(&self.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + let (intersection, _) = query.intersection_size(match_mh)?; + if intersection as usize > *value { + to_remove.insert(*dataset); + } else { + *value -= intersection as usize; + }; + } + to_remove.iter().for_each(|dataset_id| { + counter.remove(dataset_id); + }); + matches.push(result); + } + Ok(matches) + } + + pub fn manifest(&self) -> Manifest { + self.sig_files.clone() + } + + pub fn set_manifest(&mut self, new_manifest: Manifest) -> Result<(), Error> { + self.sig_files = new_manifest; + Ok(()) + } + + pub fn signatures_iter(&self) -> impl Iterator + '_ { + if let Some(_sigs) = &self.ref_sigs { + //sigs.iter().cloned() + todo!("this works, but need to match return types") + } else { + // FIXME temp solution, must find better one! + (0..self.sig_files.len()) + .map(move |dataset_id| self.sig_for_dataset(dataset_id).expect("error loading sig")) + } + } +} + +impl<'a> Index<'a> for LinearRevIndex { + type Item = SigStore; + + fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { + unimplemented!() + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn len(&self) -> usize { + if let Some(refs) = &self.ref_sigs { + refs.len() + } else { + self.sig_files.len() + } + } + + fn signatures(&self) -> Vec { + if let Some(ref sigs) = self.ref_sigs { + sigs.to_vec() + } else { + unimplemented!() + } + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + unimplemented!() + } +} + +impl RevIndex { + pub fn load>( + index_path: P, + queries: Option<&[KmerMinHash]>, + ) -> Result> { + let (rdr, _) = niffler::from_path(index_path)?; + let revindex = if let Some(qs) = queries { + // TODO: avoid loading full revindex if query != None + /* + struct PartialRevIndex { + hashes_to_keep: Option>, + marker: PhantomData T>, + } + + impl PartialRevIndex { + pub fn new(hashes_to_keep: HashSet) -> Self { + PartialRevIndex { + hashes_to_keep: Some(hashes_to_keep), + marker: PhantomData, + } + } + } + */ + + let mut hashes: HashSet = HashSet::new(); + for q in qs { + hashes.extend(q.iter_mins()); + } + + //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); + + let mut revindex: RevIndex = serde_json::from_reader(rdr)?; + revindex.hash_to_color.retain(&hashes); + revindex + } else { + // Load the full revindex + serde_json::from_reader(rdr)? + }; + + Ok(revindex) + } + + pub fn new( + search_sigs: &[PathBuf], + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + keep_sigs: bool, + ) -> RevIndex { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + let linear = LinearRevIndex::new(Some(search_sigs.into()), template, keep_sigs, None, None); + linear.index(threshold, merged_query, queries) + } + + pub fn from_zipstorage( + storage: ZipStorage, + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + keep_sigs: bool, + ) -> Result { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + // Load manifest from zipstorage + let manifest = Manifest::from_reader(storage.load("SOURMASH-MANIFEST.csv")?.as_slice())?; + let search_sigs: Vec<_> = manifest.internal_locations().map(PathBuf::from).collect(); + + let linear = LinearRevIndex::new( + Some(search_sigs.as_slice().into()), + template, + keep_sigs, + None, + Some(storage), + ); + + Ok(linear.index(threshold, merged_query, queries)) + } + + fn merge_queries(qs: &[KmerMinHash], threshold: usize) -> Option { + if threshold == 0 { + let mut merged = qs[0].clone(); + for query in &qs[1..] { + merged.merge(query).unwrap(); + } + Some(merged) + } else { + None + } + } + + pub fn new_with_sigs( + search_sigs: Vec, + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + ) -> RevIndex { + // If threshold is zero, let's merge all queries and save time later + let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); + + let linear = LinearRevIndex::new( + Default::default(), + template, + false, + search_sigs.into(), + None, + ); + + linear.index(threshold, merged_query, queries) + } + + fn map_hashes_colors( + dataset_id: usize, + search_sig: &Signature, + queries: Option<&[KmerMinHash]>, + merged_query: &Option, + threshold: usize, + template: &Sketch, + ) -> Option<(HashToColor, Colors)> { + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh); + } + + let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); + let mut hash_to_color = HashToColor::new(); + let mut colors = Colors::default(); + + if let Some(qs) = queries { + if let Some(ref merged) = merged_query { + let (matched_hashes, intersection) = merged.intersection(search_mh).unwrap(); + if !matched_hashes.is_empty() || intersection > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); + } + } else { + for query in qs { + let (matched_hashes, intersection) = query.intersection(search_mh).unwrap(); + if !matched_hashes.is_empty() || intersection > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched_hashes); + } + } + } + } else { + let matched = search_mh.mins(); + let size = matched.len() as u64; + if !matched.is_empty() || size > threshold as u64 { + hash_to_color.add_to(&mut colors, dataset_id, matched); + } + }; + + if hash_to_color.is_empty() { + None + } else { + Some((hash_to_color, colors)) + } + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + query + .iter_mins() + .filter_map(|hash| self.hash_to_color.get(hash)) + .flat_map(|color| self.colors.indices(color)) + .cloned() + .collect() + } + + pub fn search( + &self, + counter: SigCounter, + similarity: bool, + threshold: usize, + ) -> Result, Box> { + self.linear.search(counter, similarity, threshold) + } + + pub fn gather( + &self, + mut counter: SigCounter, + threshold: usize, + query: &KmerMinHash, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + + while match_size > threshold && !counter.is_empty() { + let (dataset_id, size) = counter.most_common()[0]; + match_size = if size >= threshold { size } else { break }; + let result = self + .linear + .gather_round(dataset_id, match_size, query, matches.len())?; + if let Some(Sketch::MinHash(match_mh)) = + result.match_.select_sketch(&self.linear.template) + { + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + for hash in match_mh.iter_mins() { + if let Some(color) = self.hash_to_color.get(hash) { + counter.subtract(self.colors.indices(color).cloned()); + } + } + counter.remove(&dataset_id); + matches.push(result); + } else { + unimplemented!() + } + } + Ok(matches) + } + + pub fn template(&self) -> Sketch { + self.linear.template.clone() + } + + // TODO: mh should be a sketch, or even a sig... + pub(crate) fn find_signatures( + &self, + mh: &KmerMinHash, + threshold: f64, + containment: bool, + _ignore_scaled: bool, + ) -> Result, Error> { + /* + let template_mh = None; + if let Sketch::MinHash(mh) = self.template { + template_mh = Some(mh); + }; + // TODO: throw error + let template_mh = template_mh.unwrap(); + + let tmp_mh; + let mh = if template_mh.scaled() > mh.scaled() { + // TODO: proper error here + tmp_mh = mh.downsample_scaled(self.scaled)?; + &tmp_mh + } else { + mh + }; + + if self.scaled < mh.scaled() && !ignore_scaled { + return Err(LcaDBError::ScaledMismatchError { + db: self.scaled, + query: mh.scaled(), + } + .into()); + } + */ + + // TODO: proper threshold calculation + let threshold: usize = (threshold * (mh.size() as f64)) as _; + + let counter = self.counter_for_query(mh); + + debug!( + "number of matching signatures for hashes: {}", + counter.len() + ); + + let mut results = vec![]; + for (dataset_id, size) in counter.most_common() { + let match_size = if size >= threshold { size } else { break }; + + let match_path = if self.linear.sig_files.is_empty() { + PathBuf::new() + } else { + self.linear.sig_files[dataset_id as usize].internal_location() + }; + + let ref_match; + let match_sig = if let Some(refsigs) = &self.linear.ref_sigs { + &refsigs[dataset_id as usize] + } else { + let mut sig = if let Some(storage) = &self.linear.storage { + let sig_data = + storage + .load(match_path.to_str().unwrap_or_else(|| { + panic!("error converting path {:?}", match_path) + })) + .unwrap_or_else(|_| panic!("error loading {:?}", match_path)); + Signature::from_reader(sig_data.as_slice())? + } else { + Signature::from_path(&match_path)? + }; + // TODO: remove swap_remove + ref_match = sig.swap_remove(0); + &ref_match + }; + + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.linear.template) { + match_mh = Some(mh); + } + let match_mh = match_mh.unwrap(); + + if size >= threshold { + let score = if containment { + size as f64 / mh.size() as f64 + } else { + size as f64 / (mh.size() + match_size - size) as f64 + }; + let filename = match_path.to_str().unwrap().into(); + let mut sig = match_sig.clone(); + sig.reset_sketches(); + sig.push(Sketch::MinHash(match_mh.clone())); + results.push((score, sig, filename)); + } else { + break; + }; + } + Ok(results) + } +} + +#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] +pub struct GatherResult { + #[getset(get_copy = "pub")] + intersect_bp: usize, + + #[getset(get_copy = "pub")] + f_orig_query: f64, + + #[getset(get_copy = "pub")] + f_match: f64, + + f_unique_to_query: f64, + f_unique_weighted: f64, + average_abund: usize, + median_abund: usize, + std_abund: usize, + + #[getset(get = "pub")] + filename: String, + + #[getset(get = "pub")] + name: String, + + #[getset(get = "pub")] + md5: String, + match_: Signature, + f_match_orig: f64, + unique_intersect_bp: usize, + gather_result_rank: usize, + remaining_bp: usize, +} + +impl GatherResult { + pub fn get_match(&self) -> Signature { + self.match_.clone() + } +} + +impl<'a> Index<'a> for RevIndex { + type Item = Signature; + + fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { + unimplemented!() + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn len(&self) -> usize { + if let Some(refs) = &self.linear.ref_sigs { + refs.len() + } else { + self.linear.sig_files.len() + } + } + + fn signatures(&self) -> Vec { + if let Some(ref sigs) = self.linear.ref_sigs { + sigs.iter().map(|s| s.clone().into()).collect() + } else { + unimplemented!() + } + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + unimplemented!() + } +} + +#[cfg(test)] +mod test { + use super::*; + + use crate::sketch::minhash::max_hash_for_scaled; + + #[test] + fn revindex_new() { + let max_hash = max_hash_for_scaled(10000); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(31) + .max_hash(max_hash) + .build(), + ); + let search_sigs = [ + "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), + ]; + let index = RevIndex::new(&search_sigs, &template, 0, None, false); + assert_eq!(index.colors.len(), 3); + } + + #[test] + fn revindex_many() { + let max_hash = max_hash_for_scaled(10000); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(31) + .max_hash(max_hash) + .build(), + ); + let search_sigs = [ + "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), + "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), + ]; + + let index = RevIndex::new(&search_sigs, &template, 0, None, false); + /* + dbg!(&index.colors.colors); + 0: 86 + 1: 132 + 2: 91 + (0, 1): 53 + (0, 2): 90 + (1, 2): 26 + (0, 1, 2): 261 + union: 739 + */ + //assert_eq!(index.colors.len(), 3); + assert_eq!(index.colors.len(), 7); + } + + #[test] + fn revindex_from_zipstorage() { + let max_hash = max_hash_for_scaled(100); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(57) + .hash_function(crate::encodings::HashFunctions::murmur64_protein) + .max_hash(max_hash) + .build(), + ); + let storage = ZipStorage::from_file("../../tests/test-data/prot/protein.zip") + .expect("error loading zipfile"); + let index = RevIndex::from_zipstorage(storage, &template, 0, None, false) + .expect("error building from ziptorage"); + + assert_eq!(index.colors.len(), 3); + + let query_sig = Signature::from_path( + "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + ) + .expect("Error processing query") + .swap_remove(0); + let mut query_mh = None; + if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) { + query_mh = Some(mh); + } + let query_mh = query_mh.expect("Couldn't find a compatible MinHash"); + + let counter_rev = index.counter_for_query(query_mh); + let counter_lin = index.linear.counter_for_query(query_mh); + + let results_rev = index.search(counter_rev, false, 0).unwrap(); + let results_linear = index.linear.search(counter_lin, false, 0).unwrap(); + assert_eq!(results_rev, results_linear); + + let counter_rev = index.counter_for_query(query_mh); + let counter_lin = index.linear.counter_for_query(query_mh); + + let results_rev = index.gather(counter_rev, 0, query_mh).unwrap(); + let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap(); + assert_eq!(results_rev.len(), 1); + assert_eq!(results_rev, results_linear); + } +} diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs new file mode 100644 index 0000000000..8ee612afc9 --- /dev/null +++ b/src/core/src/index/revindex/mod.rs @@ -0,0 +1,520 @@ +pub mod mem_revindex; +pub mod revindex; + +use std::collections::{BTreeSet, HashMap}; +use std::hash::{Hash, Hasher}; +use std::iter::FromIterator; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use byteorder::{LittleEndian, WriteBytesExt}; +use rkyv::{Archive, Deserialize, Serialize}; +use roaring::RoaringTreemap; + +use crate::index::revindex::mem_revindex::GatherResult; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{max_hash_for_scaled, FracMinHashOps, KmerMinHash, MinHashOps}; +use crate::sketch::Sketch; + +use crate::encodings::Color; + +//type DB = rocksdb::DBWithThreadMode; +type DB = rocksdb::DBWithThreadMode; + +type DatasetID = u64; +type SigCounter = counter::Counter; +type QueryColors = HashMap; +type HashToColor = HashMap; + +const HASHES: &str = "hashes"; +const SIGS: &str = "signatures"; +const COLORS: &str = "colors"; + +pub enum RevIndex { + //Color(color_revindex::ColorRevIndex), + Plain(revindex::RevIndex), +} + +impl RevIndex { + /* TODO: need the repair_cf variant, not available in rocksdb-rust yet + pub fn repair(index: &Path, colors: bool) { + if colors { + color_revindex::repair(index); + } else { + revindex::repair(index); + } + } + */ + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + match self { + //Self::Color(db) => db.counter_for_query(query), + Self::Plain(db) => db.counter_for_query(query), + } + } + + pub fn matches_from_counter( + &self, + counter: SigCounter, + threshold: usize, + ) -> Vec<(String, usize)> { + match self { + //Self::Color(db) => todo!(), //db.matches_from_counter(counter, threshold), + Self::Plain(db) => db.matches_from_counter(counter, threshold), + } + } + + pub fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor) { + match self { + //Self::Color(_db) => todo!(), //db.prepare_gather_counters(query), + Self::Plain(db) => db.prepare_gather_counters(query), + } + } + + pub fn index( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + match self { + //Self::Color(db) => db.index(index_sigs, template, threshold, save_paths), + Self::Plain(db) => db.index(index_sigs, template, threshold, save_paths), + } + } + + pub fn update( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + match self { + //Self::Color(db) => db.update(index_sigs, template, threshold, save_paths), + Self::Plain(db) => db.update(index_sigs, template, threshold, save_paths), + } + } + + pub fn compact(&self) { + match self { + //Self::Color(db) => db.compact(), + Self::Plain(db) => db.compact(), + }; + } + + pub fn flush(&self) -> Result<(), Box> { + match self { + //Self::Color(db) => db.flush(), + Self::Plain(db) => db.flush(), + } + } + + pub fn convert(&self, output_db: RevIndex) -> Result<(), Box> { + match self { + //Self::Color(_db) => todo!(), + Self::Plain(db) => db.convert(output_db), + } + } + + pub fn check(&self, quick: bool) { + match self { + //Self::Color(db) => db.check(quick), + Self::Plain(db) => db.check(quick), + } + } + + pub fn create(index: &Path, colors: bool) -> Self { + if colors { + todo!() //color_revindex::ColorRevIndex::create(index) + } else { + revindex::RevIndex::create(index) + } + } + + pub fn open(index: &Path, read_only: bool) -> Self { + let opts = Self::db_options(); + let cfs = DB::list_cf(&opts, index).unwrap(); + + if cfs.into_iter().any(|c| c == COLORS) { + // TODO: ColorRevIndex can't be read-only for now, + // due to pending unmerged colors + todo!() //color_revindex::ColorRevIndex::open(index, false) + } else { + revindex::RevIndex::open(index, read_only) + } + } + + fn db_options() -> rocksdb::Options { + let mut opts = rocksdb::Options::default(); + opts.set_max_open_files(500); + + // Updated defaults from + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options + opts.set_bytes_per_sync(1048576); + let mut block_opts = rocksdb::BlockBasedOptions::default(); + block_opts.set_block_size(16 * 1024); + block_opts.set_cache_index_and_filter_blocks(true); + block_opts.set_pin_l0_filter_and_index_blocks_in_cache(true); + block_opts.set_format_version(5); + opts.set_block_based_table_factory(&block_opts); + // End of updated defaults + + opts + } + + pub fn gather( + &self, + counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + query: &KmerMinHash, + template: &Sketch, + ) -> Result, Box> { + match self { + //Self::Color(_db) => todo!(), + Self::Plain(db) => db.gather( + counter, + query_colors, + hash_to_color, + threshold, + query, + template, + ), + } + } +} + +#[derive(Debug, PartialEq, Clone, Archive, Serialize, Deserialize)] +enum SignatureData { + Empty, + Internal(Signature), + External(String), +} + +impl Default for SignatureData { + fn default() -> Self { + SignatureData::Empty + } +} + +impl SignatureData { + fn from_slice(slice: &[u8]) -> Option { + // TODO: avoid the aligned vec allocation here + let mut vec = rkyv::AlignedVec::new(); + vec.extend_from_slice(slice); + let archived_value = unsafe { rkyv::archived_root::(vec.as_ref()) }; + let inner = archived_value.deserialize(&mut rkyv::Infallible).unwrap(); + Some(inner) + } + + fn as_bytes(&self) -> Option> { + let bytes = rkyv::to_bytes::<_, 256>(self).unwrap(); + Some(bytes.into_vec()) + + /* + let mut serializer = DefaultSerializer::default(); + let v = serializer.serialize_value(self).unwrap(); + debug_assert_eq!(v, 0); + let buf = serializer.into_serializer().into_inner(); + debug_assert!(Datasets::from_slice(&buf.to_vec()).is_some()); + Some(buf.to_vec()) + */ + } +} + +fn check_compatible_downsample(me: &KmerMinHash, other: &KmerMinHash) -> Result<(), crate::Error> { + /* + if self.num != other.num { + return Err(Error::MismatchNum { + n1: self.num, + n2: other.num, + } + .into()); + } + */ + use crate::Error; + + if me.ksize() != other.ksize() { + return Err(Error::MismatchKSizes); + } + if me.hash_function() != other.hash_function() { + // TODO: fix this error + return Err(Error::MismatchDNAProt); + } + if me.max_hash() < other.max_hash() { + return Err(Error::MismatchScaled); + } + if me.seed() != other.seed() { + return Err(Error::MismatchSeed); + } + Ok(()) +} + +pub fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option { + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh.clone()); + } else { + // try to find one that can be downsampled + if let Sketch::MinHash(template_mh) = template { + for sketch in search_sig.sketches() { + if let Sketch::MinHash(ref_mh) = sketch { + if check_compatible_downsample(&ref_mh, template_mh).is_ok() { + let max_hash = max_hash_for_scaled(template_mh.scaled()); + let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); + search_mh = Some(mh); + } + } + } + } + } + search_mh +} + +#[derive(Debug, PartialEq, Clone)] +pub enum Datasets { + Empty, + Unique(DatasetID), + Many(RoaringTreemap), +} + +impl Hash for Datasets { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + match self { + Self::Empty => todo!(), + Self::Unique(v) => v.hash(state), + Self::Many(v) => todo!(), + } + } +} + +impl IntoIterator for Datasets { + type Item = DatasetID; + type IntoIter = Box>; + + fn into_iter(self) -> Self::IntoIter { + match self { + Self::Empty => Box::new(std::iter::empty()), + Self::Unique(v) => Box::new(std::iter::once(v)), + Self::Many(v) => Box::new(v.into_iter()), + } + } +} + +impl Default for Datasets { + fn default() -> Self { + Datasets::Empty + } +} + +impl Extend for Datasets { + fn extend(&mut self, iter: T) + where + T: IntoIterator, + { + if let Self::Many(v) = self { + v.extend(iter); + return; + } + + let mut it = iter.into_iter(); + while let Some(value) = it.next() { + match self { + Self::Empty => *self = Datasets::Unique(value), + Self::Unique(v) => { + if *v != value { + *self = Self::Many([*v, value].iter().copied().collect()); + } + } + Self::Many(v) => { + v.extend(it); + return; + } + } + } + } +} + +impl Datasets { + fn new(vals: &[DatasetID]) -> Self { + if vals.is_empty() { + Self::Empty + } else if vals.len() == 1 { + Self::Unique(vals[0]) + } else { + Self::Many(RoaringTreemap::from_sorted_iter(vals.iter().copied()).unwrap()) + } + } + + fn from_slice(slice: &[u8]) -> Option { + use byteorder::ReadBytesExt; + + if slice.len() == 8 { + // Unique + Some(Self::Unique( + (&slice[..]).read_u64::().unwrap(), + )) + } else if slice.len() == 1 { + // Empty + Some(Self::Empty) + } else { + // Many + Some(Self::Many( + RoaringTreemap::deserialize_from(&slice[..]).unwrap(), + )) + } + } + + fn as_bytes(&self) -> Option> { + use byteorder::WriteBytesExt; + + match self { + Self::Empty => Some(vec![42_u8]), + Self::Unique(v) => { + let mut buf = vec![0u8; 8]; + (&mut buf[..]) + .write_u64::(*v) + .expect("error writing bytes"); + Some(buf) + } + Self::Many(v) => { + let mut buf = vec![]; + v.serialize_into(&mut buf).unwrap(); + Some(buf) + } + } + } + + fn union(&mut self, other: Datasets) { + match self { + Datasets::Empty => match other { + Datasets::Empty => (), + Datasets::Unique(_) | Datasets::Many(_) => *self = other, + }, + Datasets::Unique(v) => match other { + Datasets::Empty => (), + Datasets::Unique(o) => { + if *v != o { + *self = Datasets::Many([*v, o].iter().copied().collect()) + } + } + Datasets::Many(mut o) => { + o.extend([*v].into_iter()); + *self = Datasets::Many(o); + } + }, + Datasets::Many(ref mut v) => v.extend(other.into_iter()), + } + } + + fn len(&self) -> usize { + match self { + Self::Empty => 0, + Self::Unique(_) => 1, + Self::Many(ref v) => v.len() as usize, + } + } + + fn contains(&self, value: &DatasetID) -> bool { + match self { + Self::Empty => false, + Self::Unique(v) => v == value, + Self::Many(ref v) => v.contains(*value), + } + } +} + +fn sig_save_to_db( + db: Arc, + mut search_sig: Signature, + search_mh: KmerMinHash, + size: u64, + threshold: f64, + save_paths: bool, + filename: &Path, + dataset_id: u64, +) { + // Save signature to DB + let sig = if search_mh.is_empty() || size < threshold as u64 { + SignatureData::Empty + } else if save_paths { + SignatureData::External(filename.to_str().unwrap().to_string()) + } else { + search_sig.reset_sketches(); + search_sig.push(Sketch::MinHash(search_mh)); + SignatureData::Internal(search_sig) + }; + + let sig_bytes = sig.as_bytes().unwrap(); + let cf_sigs = db.cf_handle(SIGS).unwrap(); + let mut hash_bytes = [0u8; 8]; + (&mut hash_bytes[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + db.put_cf(&cf_sigs, &hash_bytes[..], sig_bytes.as_slice()) + .expect("error saving sig"); +} + +fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { + use byteorder::ReadBytesExt; + use histogram::Histogram; + use log::info; + use numsep::{separate, Locale}; + + let cf = db.cf_handle(cf_name).unwrap(); + + let iter = db.iterator_cf(&cf, rocksdb::IteratorMode::Start); + let mut kcount = 0; + let mut vcount = 0; + let mut vcounts = Histogram::new(); + let mut datasets: Datasets = Default::default(); + + for result in iter { + let (key, value) = result.unwrap(); + let _k = (&key[..]).read_u64::().unwrap(); + kcount += key.len(); + + //println!("Saw {} {:?}", k, Datasets::from_slice(&value)); + vcount += value.len(); + + if !quick && deep_check { + let v = Datasets::from_slice(&value).expect("Error with value"); + vcounts.increment(v.len() as u64).unwrap(); + datasets.union(v); + } + //println!("Saw {} {:?}", k, value); + } + + info!("*** {} ***", cf_name); + use size::Size; + let ksize = Size::from_bytes(kcount); + let vsize = Size::from_bytes(vcount); + if !quick && cf_name == COLORS { + info!( + "total datasets: {}", + separate(datasets.len(), Locale::English) + ); + } + info!("total keys: {}", separate(kcount / 8, Locale::English)); + + info!("k: {}", ksize.to_string()); + info!("v: {}", vsize.to_string()); + + if !quick && kcount > 0 && deep_check { + info!("max v: {}", vcounts.maximum().unwrap()); + info!("mean v: {}", vcounts.mean().unwrap()); + info!("stddev: {}", vcounts.stddev().unwrap()); + info!("median v: {}", vcounts.percentile(50.0).unwrap()); + info!("p25 v: {}", vcounts.percentile(25.0).unwrap()); + info!("p75 v: {}", vcounts.percentile(75.0).unwrap()); + } +} diff --git a/src/core/src/index/revindex/revindex.rs b/src/core/src/index/revindex/revindex.rs new file mode 100644 index 0000000000..21d371e751 --- /dev/null +++ b/src/core/src/index/revindex/revindex.rs @@ -0,0 +1,549 @@ +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use byteorder::{LittleEndian, WriteBytesExt}; +use log::{info, trace}; +use rayon::prelude::*; +use rocksdb::{ColumnFamilyDescriptor, MergeOperands, Options}; + +use crate::index::revindex::mem_revindex::GatherResult; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{KmerMinHash, MinHashOps}; +use crate::sketch::Sketch; + +use crate::index::revindex::prepare_query; +use crate::index::revindex::{ + self as module, sig_save_to_db, stats_for_cf, Color, DatasetID, Datasets, HashToColor, + QueryColors, SigCounter, SignatureData, DB, HASHES, SIGS, +}; + +fn compute_color(idxs: &Datasets) -> Color { + let s = BuildHasherDefault::::default(); + let mut hasher = s.build_hasher(); + /* + // TODO: remove this... + let mut sorted: Vec<_> = idxs.iter().collect(); + sorted.sort(); + */ + idxs.hash(&mut hasher); + hasher.finish() +} + +#[derive(Debug, Clone)] +pub struct RevIndex { + db: Arc, +} + +fn merge_datasets( + _: &[u8], + existing_val: Option<&[u8]>, + operands: &MergeOperands, +) -> Option> { + let mut datasets = existing_val + .and_then(Datasets::from_slice) + .unwrap_or_default(); + + for op in operands { + let new_vals = Datasets::from_slice(op).unwrap(); + datasets.union(new_vals); + } + // TODO: optimization! if nothing changed, skip as_bytes() + datasets.as_bytes() +} + +/* TODO: need the repair_cf variant, not available in rocksdb-rust yet +pub fn repair(path: &Path) { + let opts = db_options(); + + DB::repair(&opts, path).unwrap() +} +*/ + +impl RevIndex { + pub fn create(path: &Path) -> module::RevIndex { + let mut opts = module::RevIndex::db_options(); + opts.create_if_missing(true); + opts.create_missing_column_families(true); + + // prepare column family descriptors + let cfs = cf_descriptors(); + + let db = Arc::new(DB::open_cf_descriptors(&opts, path, cfs).unwrap()); + + module::RevIndex::Plain(Self { db }) + } + + pub fn open(path: &Path, read_only: bool) -> module::RevIndex { + let opts = module::RevIndex::db_options(); + + // prepare column family descriptors + let cfs = cf_descriptors(); + + let db = if read_only { + Arc::new(DB::open_cf_descriptors_read_only(&opts, path, cfs, false).unwrap()) + } else { + Arc::new(DB::open_cf_descriptors(&opts, path, cfs).unwrap()) + }; + + module::RevIndex::Plain(Self { db }) + } + + fn map_hashes_colors( + &self, + dataset_id: DatasetID, + filename: &PathBuf, + threshold: f64, + template: &Sketch, + save_paths: bool, + ) { + let search_sig = Signature::from_path(&filename) + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + let search_mh = + prepare_query(&search_sig, template).expect("Couldn't find a compatible MinHash"); + + let colors = Datasets::new(&[dataset_id]).as_bytes().unwrap(); + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + let matched = search_mh.mins(); + let size = matched.len() as u64; + if !matched.is_empty() || size > threshold as u64 { + // FIXME threshold is f64 + let mut hash_bytes = [0u8; 8]; + for hash in matched { + (&mut hash_bytes[..]) + .write_u64::(hash) + .expect("error writing bytes"); + self.db + .merge_cf(&cf_hashes, &hash_bytes[..], colors.as_slice()) + .expect("error merging"); + } + } + + sig_save_to_db( + self.db.clone(), + search_sig, + search_mh, + size, + threshold, + save_paths, + filename, + dataset_id, + ); + } + + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + info!("Collecting hashes"); + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + info!("Multi get"); + self.db + .multi_get_cf(hashes_iter) + .into_iter() + .filter_map(|r| r.ok().unwrap_or(None)) + .flat_map(|raw_datasets| { + let new_vals = Datasets::from_slice(&raw_datasets).unwrap(); + new_vals.into_iter() + }) + .collect() + } + + pub fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor) { + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + /* + build a HashToColors for query, + and a QueryColors (Color -> Datasets) mapping. + Loading Datasets from rocksdb for every hash takes too long. + */ + let mut query_colors: QueryColors = Default::default(); + let mut counter: SigCounter = Default::default(); + + info!("Building hash_to_colors and query_colors"); + let hash_to_colors = query + .iter_mins() + .zip(self.db.multi_get_cf(hashes_iter).into_iter()) + .filter_map(|(k, r)| { + let raw = r.ok().unwrap_or(None); + raw.map(|raw| { + let new_vals = Datasets::from_slice(&raw).unwrap(); + let color = compute_color(&new_vals); + query_colors + .entry(color) + .or_insert_with(|| new_vals.clone()); + counter.update(new_vals.into_iter()); + (*k, color) + }) + }) + .collect(); + + (counter, query_colors, hash_to_colors) + } + + pub fn matches_from_counter( + &self, + counter: SigCounter, + threshold: usize, + ) -> Vec<(String, usize)> { + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + + let matches_iter = counter + .most_common() + .into_iter() + .filter_map(|(dataset_id, size)| { + if size >= threshold { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + Some((&cf_sigs, v, size)) + } else { + None + } + }); + + let matches_sizes = matches_iter.clone().map(|(_, _, v)| v); + + info!("Multi get matches"); + self.db + .multi_get_cf(matches_iter.map(|(k, v, _)| (k, v))) + .into_iter() + .zip(matches_sizes) + .filter_map(|(r, size)| r.ok().unwrap_or(None).map(|v| (v, size))) + .filter_map( + |(sigdata, size)| match SignatureData::from_slice(&sigdata).unwrap() { + SignatureData::Empty => None, + SignatureData::External(p) => Some((p, size)), + SignatureData::Internal(sig) => Some((sig.name(), size)), + }, + ) + .collect() + } + + pub fn gather( + &self, + mut counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + orig_query: &KmerMinHash, + template: &Sketch, + ) -> Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + let mut key_bytes = [0u8; 8]; + //let mut query: KmerMinHashBTree = orig_query.clone().into(); + + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + + while match_size > threshold && !counter.is_empty() { + trace!("counter len: {}", counter.len()); + trace!("match size: {}", match_size); + + let (dataset_id, size) = counter.k_most_common_ordered(1)[0]; + match_size = if size >= threshold { size } else { break }; + + (&mut key_bytes[..]) + .write_u64::(dataset_id) + .expect("error writing bytes"); + + let match_sig = self + .db + .get_cf(&cf_sigs, &key_bytes[..]) + .ok() + .map( + |sigdata| match SignatureData::from_slice(&(sigdata.unwrap())).unwrap() { + SignatureData::Empty => todo!("throw error, empty sig"), + SignatureData::External(_p) => todo!("Load from external"), + SignatureData::Internal(sig) => sig, + }, + ) + .unwrap_or_else(|| panic!("Unknown dataset {}", dataset_id)); + + let match_mh = + prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / orig_query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize * match_size; + let gather_result_rank = matches.len(); + + let (intersect_orig, _) = match_mh.intersection_size(orig_query)?; + let intersect_bp = (match_mh.scaled() as u64 * intersect_orig) as usize; + + let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64; + let match_ = match_sig.clone(); + let md5 = match_sig.md5sum(); + + // TODO: all of these + let filename = "".into(); + let f_unique_weighted = 0.; + let average_abund = 0; + let median_abund = 0; + let std_abund = 0; + let f_match_orig = 0.; + let remaining_bp = 0; + + let result = GatherResult::builder() + .intersect_bp(intersect_bp) + .f_orig_query(f_orig_query) + .f_match(f_match) + .f_unique_to_query(f_unique_to_query) + .f_unique_weighted(f_unique_weighted) + .average_abund(average_abund) + .median_abund(median_abund) + .std_abund(std_abund) + .filename(filename) + .name(name) + .md5(md5) + .match_(match_) + .f_match_orig(f_match_orig) + .unique_intersect_bp(unique_intersect_bp) + .gather_result_rank(gather_result_rank) + .remaining_bp(remaining_bp) + .build(); + matches.push(result); + + trace!("Preparing counter for next round"); + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: not used at the moment, so just skip. + //query.remove_many(match_mh.to_vec().as_slice())?; + + // TODO: Use HashesToColors here instead. If not initialized, + // build it. + match_mh + .iter_mins() + .filter_map(|hash| hash_to_color.get(hash)) + .flat_map(|color| { + // TODO: remove this clone + query_colors.get(color).unwrap().clone().into_iter() + }) + .for_each(|dataset| { + // TODO: collect the flat_map into a Counter, and remove more + // than one at a time... + counter.entry(dataset).and_modify(|e| { + if *e > 0 { + *e -= 1 + } + }); + }); + + counter.remove(&dataset_id); + } + Ok(matches) + } + + pub fn index( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + let processed_sigs = AtomicUsize::new(0); + + index_sigs + .par_iter() + .enumerate() + .for_each(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + self.map_hashes_colors( + dataset_id as DatasetID, + filename, + threshold, + template, + save_paths, + ); + }); + info!("Processed {} reference sigs", processed_sigs.into_inner()); + } + + pub fn update( + &self, + index_sigs: Vec, + template: &Sketch, + threshold: f64, + save_paths: bool, + ) { + use byteorder::ReadBytesExt; + + if !save_paths { + todo!("only supports with save_paths=True for now"); + } + + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + + info!("Verifying existing sigs"); + // verify data match up to this point + let mut max_dataset_id = 0; + let to_skip = iter + .map(|result| { + let (key, value) = result.unwrap(); + let current_dataset_id = (&key[..]).read_u64::().unwrap(); + + let filename = &index_sigs[current_dataset_id as usize]; + let sig_data = SignatureData::from_slice(&value).unwrap(); + match sig_data { + SignatureData::External(sig) => { + assert_eq!(sig, filename.as_os_str().to_str().unwrap().to_string()) + } + SignatureData::Empty => (), + SignatureData::Internal(_) => { + todo!("only supports with save_paths=True for now") + } + }; + max_dataset_id = max_dataset_id.max(current_dataset_id as u64); + }) + .count(); + + max_dataset_id += 1; + assert_eq!(max_dataset_id as usize, to_skip); + + // process the remainder + let processed_sigs = AtomicUsize::new(0); + + index_sigs + .par_iter() + .skip(to_skip) + .enumerate() + .for_each(|(i, filename)| { + let dataset_id = i + to_skip; + + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + self.map_hashes_colors( + dataset_id as DatasetID, + filename, + threshold, + template, + save_paths, + ); + }); + + info!( + "Processed additional {} reference sigs", + processed_sigs.into_inner() + ); + } + + pub fn check(&self, quick: bool) { + stats_for_cf(self.db.clone(), HASHES, true, quick); + info!(""); + stats_for_cf(self.db.clone(), SIGS, false, quick); + } + + pub fn compact(&self) { + for cf_name in [HASHES, SIGS] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.compact_range_cf(&cf, None::<&[u8]>, None::<&[u8]>) + } + } + + pub fn flush(&self) -> Result<(), Box> { + self.db.flush_wal(true)?; + + for cf_name in [HASHES, SIGS] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.flush_cf(&cf)?; + } + + Ok(()) + } + + pub fn convert(&self, output_db: module::RevIndex) -> Result<(), Box> { + todo!() + /* + if let RevIndex::Color(db) = output_db { + let other_db = db.db; + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + info!("start converting colors"); + let mut color_bytes = [0u8; 8]; + let iter = self + .db + .iterator_cf(&cf_hashes, rocksdb::IteratorMode::Start); + for (key, value) in iter { + let datasets = Datasets::from_slice(&value).unwrap(); + let new_idx: Vec<_> = datasets.into_iter().collect(); + let new_color = Colors::update(other_db.clone(), None, new_idx.as_slice()).unwrap(); + + (&mut color_bytes[..]) + .write_u64::(new_color) + .expect("error writing bytes"); + other_db + .put_cf(&cf_hashes, &key[..], &color_bytes[..]) + .unwrap(); + } + info!("finished converting colors"); + + info!("copying sigs to output"); + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + for (key, value) in iter { + other_db.put_cf(&cf_sigs, &key[..], &value[..]).unwrap(); + } + info!("finished copying sigs to output"); + + Ok(()) + } else { + todo!() + } + */ + } +} + +fn cf_descriptors() -> Vec { + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + cfopts.set_merge_operator_associative("datasets operator", merge_datasets); + cfopts.set_min_write_buffer_number_to_merge(10); + + // Updated default from + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options + cfopts.set_level_compaction_dynamic_level_bytes(true); + + let cf_hashes = ColumnFamilyDescriptor::new(HASHES, cfopts); + + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + // Updated default + cfopts.set_level_compaction_dynamic_level_bytes(true); + //cfopts.set_merge_operator_associative("colors operator", merge_colors); + + let cf_sigs = ColumnFamilyDescriptor::new(SIGS, cfopts); + + vec![cf_hashes, cf_sigs] +} diff --git a/src/core/src/index/sbt/mhbt.rs b/src/core/src/index/sbt/mhbt.rs deleted file mode 100644 index 2d4ceb3fb8..0000000000 --- a/src/core/src/index/sbt/mhbt.rs +++ /dev/null @@ -1,361 +0,0 @@ -use std::collections::HashMap; -use std::io::Write; - -use crate::errors::ReadDataError; -use crate::index::sbt::{Factory, FromFactory, Node, SBT}; -use crate::prelude::*; -use crate::signature::SigsTrait; -use crate::sketch::nodegraph::Nodegraph; -use crate::sketch::Sketch; -use crate::storage::Storage; -use crate::Error; - -impl ToWriter for Nodegraph { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - self.save_to_writer(writer) - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, name: &str) -> Result, Error> { - match self.factory { - Factory::GraphFactory { args: (k, t, n) } => { - let n = Nodegraph::with_tables(t as usize, n as usize, k as usize); - - Ok(Node::builder() - .filename(name) - .name(name) - .metadata(HashMap::default()) - .storage(self.storage()) - .data(n) - .build()) - } - } - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, parent: &mut Node) -> Result<(), Error> { - // TODO: avoid copy here - let mut parent_data = parent.data()?.clone(); - - if let Sketch::MinHash(sig) = &self.signatures[0] { - for h in sig.mins() { - parent_data.count(h); - } - - let min_n_below = parent - .metadata - .entry("min_n_below".into()) - .or_insert(u64::max_value()); - - *min_n_below = u64::min(sig.size() as u64, *min_n_below); - if *min_n_below == 0 { - *min_n_below = 1 - } - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - - parent.data = parent_data.into(); - - Ok(()) - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.similarity(ong) - } - - fn containment(&self, other: &Node) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - let ong: &Nodegraph = other.data().unwrap(); - ng.containment(ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &Nodegraph = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins().iter().map(|h| ng.get(*h)).sum(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&Nodegraph, Error> { - if let Some(storage) = &self.storage { - Ok(self.data.get_or_init(|| { - let raw = storage.load(&self.filename).unwrap(); - Nodegraph::from_reader(&mut &raw[..]).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -#[cfg(test)] -mod test { - use std::convert::TryInto; - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - - use assert_matches::assert_matches; - - use super::Factory; - - use crate::index::linear::LinearIndex; - use crate::index::sbt::scaffold; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::{Index, SigStore, MHBT}; - use crate::prelude::*; - - #[test] - fn save_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let mut sbt = MHBT::from_path(filename).expect("Loading error"); - - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - sbt.save_file(tmpfile.path(), None).unwrap(); - - tmpfile.seek(SeekFrom::Start(0)).unwrap(); - } - - #[test] - fn load_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - ) - .unwrap(); - let leaf = sigs[0].clone(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - let datasets = linear.signatures(); - println!("linear leaves {:?} {:?}", datasets.len(), datasets); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - - #[test] - #[ignore] - fn roundtrip_sbt() -> Result<(), Box> { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename)?; - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - //assert_matches!(&sbt.storage, ::FSStorage(args) => { - // assert_eq!(args, &[1, 100000, 4]); - //}); - assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { - assert_eq!(args, &(1, 100000.0, 4)); - }); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename)?); - let sigs = Signature::load_signatures( - &mut reader, - Some(31), - Some("DNA".try_into().unwrap()), - None, - )?; - let sig_data = sigs[0].clone(); - - let leaf: SigStore<_> = sig_data.into(); - - let results = sbt.find(search_minhashes, &leaf, 0.5)?; - assert_eq!(results.len(), 1); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1)?; - assert_eq!(results.len(), 2); - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - - println!("sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let mut new_sbt: MHBT = MHBT::builder().storage(None).build(); - let datasets = sbt.signatures(); - for l in datasets { - new_sbt.insert(l)?; - } - - for (i, node) in &sbt.nodes { - assert_eq!(node.data().unwrap(), new_sbt.nodes[i].data().unwrap()); - } - - assert_eq!(new_sbt.signature_refs().len(), 7); - println!("new_sbt internal {:?} {:?}", sbt.nodes.len(), sbt.nodes); - println!("new_sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let results = new_sbt.find(search_minhashes, &leaf, 0.5)?; - //println!("results: {:?}", results); - //println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 1); - - let results = new_sbt.find(search_minhashes, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = new_sbt.find(search_minhashes_containment, &leaf, 0.5)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 2); - - let results = new_sbt.find(search_minhashes_containment, &leaf, 0.1)?; - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - assert_eq!(results.len(), 4); - - Ok(()) - } - - #[test] - fn scaffold_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let new_sbt: MHBT = scaffold(sbt.leaves(), sbt.storage()); - - assert_eq!(new_sbt.signatures().len(), 7); - } - - #[test] - fn load_v4() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v4.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } - - #[test] - fn load_v5() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v5.sbt.json"); - - let _sbt = MHBT::from_path(filename).expect("Loading error"); - } -} diff --git a/src/core/src/index/sbt/mhmt.rs b/src/core/src/index/sbt/mhmt.rs deleted file mode 100644 index 5eeb8a09b3..0000000000 --- a/src/core/src/index/sbt/mhmt.rs +++ /dev/null @@ -1,227 +0,0 @@ -use std::io::{Read, Write}; - -use mqf::MQF; - -use crate::Error; -use crate::index::sbt::{FromFactory, Node, Update, SBT}; -use crate::index::storage::{ReadData, ReadDataError, ToWriter}; -use crate::index::Comparable; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::Sketch; - -impl ToWriter for MQF { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> - where - W: Write, - { - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new()?; - self.serialize(tmpfile.path()).unwrap(); // TODO: convert this to a proper error - - let mut buffer = Vec::new(); - tmpfile.read_to_end(&mut buffer)?; - writer.write_all(&buffer)?; - - Ok(()) - } -} - -impl ReadData for Node { - fn data(&self) -> Result<&MQF, Error> { - if let Some(storage) = &self.storage { - Ok(self.data.get_or_create(|| { - let raw = storage.load(&self.filename).unwrap(); - - // TODO: using tempfile for now, but ideally want to avoid that - let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); - tmpfile.write_all(&raw[..]).unwrap(); - - MQF::deserialize(tmpfile.path()).unwrap() - })) - } else if let Some(data) = self.data.get() { - Ok(data) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -impl FromFactory> for SBT, L> { - fn factory(&self, _name: &str) -> Result, Error> { - unimplemented!() - } -} - -impl Update> for Node { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Update> for Signature { - fn update(&self, _other: &mut Node) -> Result<(), Error> { - unimplemented!(); - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.similarity(&ong) - } - - fn containment(&self, other: &Node) -> f64 { - let _ng: &MQF = self.data().unwrap(); - let _ong: &MQF = other.data().unwrap(); - unimplemented!(); - //ng.containment(&ong) - } -} - -impl Comparable for Node { - fn similarity(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| dbg!(ng.count_key(**h % u64::pow(2, 26))) > 0) - //.filter(|h| dbg!(ng.count_key(**h)) > 0) - .count(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } - - fn containment(&self, other: &Signature) -> f64 { - let ng: &MQF = self.data().unwrap(); - - // TODO: select the right signatures... - if let Sketch::MinHash(sig) = &other.signatures[0] { - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig - .mins - .iter() - .filter(|h| ng.count_key(**h % u64::pow(2, 26)) > 0) - //.filter(|h| ng.count_key(**h) > 0) - .count(); - - matches as f64 / sig.size() as f64 - } else { - //TODO what if it is not a minhash? - unimplemented!() - } - } -} - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(test)] -mod test { - use std::fs::File; - use std::io::{BufReader, Seek, SeekFrom}; - use std::path::PathBuf; - use std::rc::Rc; - use tempfile; - - use assert_matches::assert_matches; - use lazy_init::Lazy; - - use super::{scaffold, Factory}; - - use crate::index::linear::LinearIndex; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - use crate::index::storage::ReadData; - use crate::index::{Index, SigStore, MHBT}; - use crate::signature::Signature; - - #[cfg(not(target_arch = "wasm32"))] - #[test] - fn load_mhmt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5_mhmt.sbt.json"); - - let mut sbt = crate::index::MHMT::from_path(filename).expect("Loading error"); - - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); - - let mut reader = BufReader::new(File::open(filename).unwrap()); - let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); - let sig_data = sigs[0].clone(); - - let data = Lazy::new(); - data.get_or_create(|| sig_data); - - let leaf = SigStore::builder() - .data(Rc::new(data)) - .filename("") - .name("") - .metadata("") - .storage(None) - .build(); - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - //assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); - for l in &sbt.leaves { - linear.insert(l.1.data().unwrap().clone()).unwrap(); - } - - println!( - "linear leaves {:?} {:?}", - linear.datasets.len(), - linear.datasets - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - */ -} diff --git a/src/core/src/index/sbt/mod.rs b/src/core/src/index/sbt/mod.rs deleted file mode 100644 index 5245defe1f..0000000000 --- a/src/core/src/index/sbt/mod.rs +++ /dev/null @@ -1,878 +0,0 @@ -pub mod mhbt; - -/* FIXME: bring back after boomphf changes -pub mod ukhs; -*/ - -/* FIXME: bring back after MQF works on macOS and Windows -#[cfg(not(target_arch = "wasm32"))] -pub mod mhmt; -*/ - -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::fmt::Debug; -use std::fs::File; -use std::hash::BuildHasherDefault; -use std::io::{BufReader, Read}; -use std::path::{Path, PathBuf}; - -use log::info; -use nohash_hasher::NoHashHasher; -use once_cell::sync::OnceCell; -use serde::{Deserialize, Serialize}; -use typed_builder::TypedBuilder; - -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; -use crate::prelude::*; -use crate::storage::{FSStorage, InnerStorage, StorageInfo}; -use crate::Error; - -#[derive(TypedBuilder)] -pub struct SBT { - #[builder(default = 2)] - d: u32, - - #[builder(default, setter(into))] - storage: Option, - - #[builder(default = Factory::GraphFactory { args: (1, 100000.0, 4) })] - factory: Factory, - - #[builder(default = HashMap::default())] - nodes: HashMap, - - #[builder(default = HashMap::default())] - leaves: HashMap>, -} - -const fn parent(pos: u64, d: u64) -> u64 { - (pos - 1) / d -} - -const fn child(parent: u64, pos: u64, d: u64) -> u64 { - d * parent + pos + 1 -} - -impl SBT -where - L: std::clone::Clone + Default, - N: Default, -{ - #[inline(always)] - fn parent(&self, pos: u64) -> Option { - if pos == 0 { - None - } else { - Some(parent(pos, u64::from(self.d))) - } - } - - #[inline(always)] - fn child(&self, parent: u64, pos: u64) -> u64 { - child(parent, pos, u64::from(self.d)) - } - - #[inline(always)] - fn children(&self, pos: u64) -> Vec { - (0..u64::from(self.d)).map(|c| self.child(pos, c)).collect() - } - - pub fn storage(&self) -> Option { - self.storage.clone() - } - - /* - fn fill_up(&mut self) -> Result<(), Error> { - let mut visited = HashSet::new(); - let mut queue: Vec<_> = self.leaves.keys().collect(); - - while !queue.is_empty() { - let pos = queue.pop().unwrap(); - - if !visited.contains(&pos) { - visited.insert(pos); - } - } - - Ok(()) - } - */ - - // combine -} - -impl SBT, T> -where - T: ToWriter + Clone, - U: ToWriter, - Node: ReadData, - SigStore: ReadData, -{ - fn parse_v4(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV4 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V4(sinfo)) - } - - fn parse_v5(rdr: &mut R) -> Result - where - R: Read, - { - let sinfo: SBTInfoV5 = serde_json::from_reader(rdr)?; - Ok(SBTInfo::V5(sinfo)) - } - - pub fn from_reader(mut rdr: R, path: P) -> Result, T>, Error> - where - R: Read, - P: AsRef, - { - // TODO: I would love to do this, but I get an untagged enum error with - // SBTInfo... - //let sinfo: SBTInfo = serde_json::from_reader(rdr)?; - - let mut s = String::new(); - rdr.read_to_string(&mut s)?; - - let sinfo = - Self::parse_v5(&mut s.as_bytes()).or_else(|_| Self::parse_v4(&mut s.as_bytes()))?; - - // TODO: support other storages - let mut st: FSStorage = match sinfo { - SBTInfo::V4(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V5(ref sbt) => (&sbt.storage.args).into(), - SBTInfo::V6(ref sbt) => (&sbt.storage.args).into(), - }; - st.set_base(path.as_ref().to_str().unwrap()); - let storage = InnerStorage::new(st); - - let d = match sinfo { - SBTInfo::V4(ref sbt) => sbt.d, - SBTInfo::V5(ref sbt) => sbt.d, - SBTInfo::V6(ref sbt) => sbt.d, - }; - - let factory = match sinfo { - SBTInfo::V4(ref sbt) => sbt.factory.clone(), - SBTInfo::V5(ref sbt) => sbt.factory.clone(), - SBTInfo::V6(ref sbt) => sbt.factory.clone(), - }; - - let (nodes, leaves) = match sinfo { - SBTInfo::V6(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .signatures - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V5(sbt) => { - let nodes = sbt - .nodes - .into_iter() - .map(|(n, l)| { - ( - n, - Node::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - let leaves = sbt - .leaves - .into_iter() - .map(|(n, l)| { - ( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - ) - }) - .collect(); - (nodes, leaves) - } - SBTInfo::V4(sbt) => { - let nodes = sbt - .nodes - .iter() - .filter_map(|(n, x)| match x { - NodeInfoV4::Node(l) => Some(( - *n, - Node::builder() - .filename(l.filename.clone()) - .name(l.name.clone()) - .metadata(l.metadata.clone()) - .storage(Some(storage.clone())) - .build(), - )), - NodeInfoV4::Leaf(_) => None, - }) - .collect(); - - let leaves = sbt - .nodes - .into_iter() - .filter_map(|(n, x)| match x { - NodeInfoV4::Node(_) => None, - NodeInfoV4::Leaf(l) => Some(( - n, - SigStore::builder() - .filename(l.filename) - .name(l.name) - .metadata(l.metadata) - .storage(Some(storage.clone())) - .build(), - )), - }) - .collect(); - - (nodes, leaves) - } - }; - - Ok(SBT { - d, - factory, - storage: Some(storage), - nodes, - leaves, - }) - } - - pub fn from_path>(path: P) -> Result, T>, Error> { - let file = File::open(&path)?; - let mut reader = BufReader::new(file); - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - // TODO: canonicalize doesn't work on wasm32-wasi - //basepath.canonicalize()?; - - let sbt = SBT::, T>::from_reader(&mut reader, basepath.parent().unwrap())?; - Ok(sbt) - } - - pub fn save_file>( - &mut self, - path: P, - storage: Option, - ) -> Result<(), Error> { - let ref_path = path.as_ref(); - let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); - if basename.ends_with(".sbt.json") { - basename = basename.replace(".sbt.json", ""); - } - let location = ref_path.parent().unwrap(); - - let storage = match storage { - Some(s) => s, - None => { - let subdir = format!(".sbt.{}", basename); - InnerStorage::new(FSStorage::new(location.to_str().unwrap(), &subdir)) - } - }; - - let args = storage.args(); - let storage_info = StorageInfo { - backend: "FSStorage".into(), - args, - }; - - let info: SBTInfoV5 = SBTInfoV5 { - d: self.d, - factory: self.factory.clone(), - storage: storage_info, - version: 5, - nodes: self - .nodes - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &U = (*l).data().expect("Couldn't load data"); - - // set storage to new one - l.storage = Some(storage.clone()); - - let filename = (*l).save(&l.filename).unwrap(); - let new_node = NodeInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - leaves: self - .leaves - .iter_mut() - .map(|(n, l)| { - // Trigger data loading - let _: &T = (*l).data().unwrap(); - - // set storage to new one - l.storage = Some(storage.clone()); - - // TODO: this should be l.md5sum(), not l.filename - let filename = (*l).save(&l.filename).unwrap(); - let new_node = DatasetInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - }; - (*n, new_node) - }) - .collect(), - }; - - let file = File::create(path)?; - serde_json::to_writer(file, &info)?; - - Ok(()) - } - - pub fn leaves(&self) -> Vec> { - self.leaves.values().cloned().collect() - } -} - -impl<'a, N, L> Index<'a> for SBT -where - N: Comparable + Comparable + Update + Debug + Default, - L: Comparable + Update + Clone + Debug + Default, - SBT: FromFactory, - SigStore: From + ReadData, -{ - type Item = L; - - fn find(&self, search_fn: F, sig: &L, threshold: f64) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - let mut matches = Vec::new(); - let mut visited = HashSet::new(); - let mut queue = vec![0u64]; - - while let Some(pos) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - if let Some(node) = self.nodes.get(&pos) { - if search_fn(&node, sig, threshold) { - for c in self.children(pos) { - queue.push(c); - } - } - } else if let Some(leaf) = self.leaves.get(&pos) { - let data = leaf.data().expect("Error reading data"); - if search_fn(data, sig, threshold) { - matches.push(data); - } - } - } - } - - Ok(matches) - } - - fn insert(&mut self, dataset: L) -> Result<(), Error> { - if self.leaves.is_empty() { - // in this case the tree is empty, - // just add the dataset to the first available leaf - self.leaves.entry(0).or_insert_with(|| dataset.into()); - return Ok(()); - } - - // we can unwrap here because the root node case - // only happens on an empty tree, and if we got - // to this point we have at least one leaf already. - // TODO: find position by similarity search - let pos = self.leaves.keys().max().unwrap() + 1; - let parent_pos = self.parent(pos).unwrap(); - let final_pos; - - if let Entry::Occupied(pnode) = self.leaves.entry(parent_pos) { - // Case 1: parent is a Leaf - // create a new internal node, add it to self.nodes[parent_pos] - - let (_, leaf) = pnode.remove_entry(); - - let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - - // for each children update the parent node - // TODO: write the update method - leaf.data.get().unwrap().update(&mut new_node)?; - dataset.update(&mut new_node)?; - - // node and parent are children of new internal node - let mut c_pos = self.children(parent_pos).into_iter().take(2); - let c1_pos = c_pos.next().unwrap(); - let c2_pos = c_pos.next().unwrap(); - - self.leaves.entry(c1_pos).or_insert(leaf); - self.leaves.entry(c2_pos).or_insert_with(|| dataset.into()); - final_pos = c2_pos; - - // add the new internal node to self.nodes[parent_pos) - // TODO check if it is really empty? - self.nodes.entry(parent_pos).or_insert(new_node); - } else { - // TODO: moved these two lines here to avoid borrow checker - // error E0502 in the Vacant case, but would love to avoid it! - let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; - let c_pos = self.children(parent_pos)[0]; - - match self.nodes.entry(parent_pos) { - // Case 2: parent is a node and has an empty child spot available - // (if there isn't an empty spot, it was already covered by case 1) - Entry::Occupied(mut pnode) => { - dataset.update(pnode.get_mut())?; - self.leaves.entry(pos).or_insert_with(|| dataset.into()); - final_pos = pos; - } - - // Case 3: parent is None/empty - // this can happen with d != 2, need to create parent node - Entry::Vacant(pnode) => { - dataset.update(&mut new_node)?; - self.leaves.entry(c_pos).or_insert_with(|| dataset.into()); - final_pos = c_pos; - pnode.insert(new_node); - } - } - } - - let entry = &self.leaves[&final_pos]; - let data = entry.data.get().unwrap(); - - let mut parent_pos = parent_pos; - while let Some(ppos) = self.parent(parent_pos) { - if let Entry::Occupied(mut pnode) = self.nodes.entry(parent_pos) { - //TODO: use children for this node to update, instead of dragging - // dataset up to the root? It would be more generic, but this - // works for minhash, draff signatures and nodegraphs... - data.update(pnode.get_mut())?; - } - parent_pos = ppos; - } - - Ok(()) - } - - /* - fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { - self = scaffold(nodes, self.storage()); - Ok(()) - } - */ - - fn save>(&self, _path: P) -> Result<(), Error> { - unimplemented!(); - } - - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() - } - - fn signatures(&self) -> Vec { - self.leaves - .values() - .map(|x| x.data().unwrap().clone()) - .collect() - } - - fn signature_refs(&self) -> Vec<&Self::Item> { - self.leaves.values().map(|x| x.data().unwrap()).collect() - } - - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.leaves.values() - } - */ -} - -/* -#[derive(TypedBuilder, Clone, Default, Serialize, Deserialize)] -pub struct Factory { - class: String, - args: Vec, -} -*/ - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "class")] -pub enum Factory { - GraphFactory { args: (u64, f64, u64) }, -} - -#[derive(TypedBuilder, Default, Clone)] -pub struct Node { - #[builder(setter(into))] - filename: String, - - #[builder(setter(into))] - name: String, - - metadata: HashMap, - - #[builder(default)] - storage: Option, - - #[builder(setter(into), default)] - data: OnceCell, -} - -impl Node -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) - } else { - // TODO throw error, data was not initialized - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl PartialEq for Node -where - T: PartialEq, - Node: ReadData, -{ - fn eq(&self, other: &Node) -> bool { - self.data().unwrap() == other.data().unwrap() - } -} - -impl SigStore -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) - } else { - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl std::fmt::Debug for Node -where - T: Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Node [name={}, filename={}, metadata: {:?}, data: {:?}]", - self.name, - self.filename, - self.metadata, - self.data.get().is_some() - ) - } -} - -#[derive(Serialize, Deserialize, Debug)] -struct NodeInfo { - filename: String, - name: String, - metadata: HashMap, -} - -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum NodeInfoV4 { - Node(NodeInfo), - Leaf(DatasetInfo), -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV4 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV5 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - leaves: HashMap, -} - -#[derive(Serialize, Deserialize)] -struct SBTInfoV6 { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - signatures: HashMap, -} - -#[derive(Deserialize)] -#[serde(untagged)] -enum SBTInfo { - V6(SBTInfoV6), - V5(SBTInfoV5), - V4(SBTInfoV4), -} - -enum BinaryTree { - Empty, - Internal(Box>>>>), - Leaf(Box>>), -} - -struct TreeNode { - element: T, - left: BinaryTree, - right: BinaryTree, -} - -pub fn scaffold( - mut datasets: Vec>, - storage: Option, -) -> SBT, Signature> -where - N: Clone + Default, -{ - let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); - - let mut next_round = Vec::new(); - - // generate two bottom levels: - // - datasets - // - first level of internal nodes - info!("Start processing leaves"); - while let Some(next_leaf) = datasets.pop() { - let (simleaf_tree, in_common) = if datasets.is_empty() { - (BinaryTree::Empty, next_leaf.mins().into_iter().collect()) - } else { - let mut similar_leaf_pos = 0; - let mut current_max = 0; - for (pos, leaf) in datasets.iter().enumerate() { - let common = next_leaf.count_common(leaf); - if common > current_max { - current_max = common; - similar_leaf_pos = pos; - } - } - - let similar_leaf = datasets.remove(similar_leaf_pos); - - let in_common = next_leaf - .mins() - .into_iter() - .collect::>>>() - .union(&similar_leaf.mins().into_iter().collect()) - .cloned() - .collect(); - - let simleaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: similar_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - (simleaf_tree, in_common) - }; - - let leaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: next_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - - let tree = BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left: leaf_tree, - right: simleaf_tree, - })); - - next_round.push(tree); - - if next_round.len() % 100 == 0 { - info!("Processed {} leaves", next_round.len() * 2); - } - } - info!("Finished processing leaves"); - - // while we don't get to the root, generate intermediary levels - while next_round.len() != 1 { - next_round = BinaryTree::process_internal_level(next_round); - info!("Finished processing round {}", next_round.len()); - } - - // Convert from binary tree to nodes/leaves - let root = next_round.pop().unwrap(); - let mut visited = HashSet::new(); - let mut queue = vec![(0u64, root)]; - - while let Some((pos, cnode)) = queue.pop() { - if !visited.contains(&pos) { - visited.insert(pos); - - match cnode { - BinaryTree::Leaf(leaf) => { - leaves.insert(pos, leaf.element); - } - BinaryTree::Internal(mut node) => { - let left = std::mem::replace(&mut node.left, BinaryTree::Empty); - let right = std::mem::replace(&mut node.right, BinaryTree::Empty); - queue.push((2 * pos + 1, left)); - queue.push((2 * pos + 2, right)); - } - BinaryTree::Empty => (), - } - } - } - - SBT::builder() - .storage(storage) - .nodes(HashMap::default()) - .leaves(leaves) - .build() -} - -impl BinaryTree { - fn process_internal_level(mut current_round: Vec) -> Vec { - let mut next_round = Vec::with_capacity(current_round.len() + 1); - - while let Some(next_node) = current_round.pop() { - let similar_node = if current_round.is_empty() { - BinaryTree::Empty - } else { - let mut similar_node_pos = 0; - let mut current_max = 0; - for (pos, cmpe) in current_round.iter().enumerate() { - let common = BinaryTree::intersection_size(&next_node, cmpe); - if common > current_max { - current_max = common; - similar_node_pos = pos; - } - } - current_round.remove(similar_node_pos) - }; - - let tree = BinaryTree::new_tree(next_node, similar_node); - - next_round.push(tree); - } - next_round - } - - // Remove this when MSRV is >= 1.40 - #[allow(clippy::mem_replace_with_default)] - fn new_tree(mut left: BinaryTree, mut right: BinaryTree) -> BinaryTree { - let in_common = if let BinaryTree::Internal(ref mut el1) = left { - match right { - BinaryTree::Internal(ref mut el2) => { - let c1 = std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ); - let c2 = std::mem::replace( - &mut el2.element, - HashSet::>>::default(), - ); - c1.union(&c2).cloned().collect() - } - BinaryTree::Empty => std::mem::replace( - &mut el1.element, - HashSet::>>::default(), - ), - _ => panic!("Should not see a Leaf at this level"), - } - } else { - HashSet::>>::default() - }; - - BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left, - right, - })) - } - - fn intersection_size(n1: &BinaryTree, n2: &BinaryTree) -> usize { - if let BinaryTree::Internal(ref el1) = n1 { - if let BinaryTree::Internal(ref el2) = n2 { - return el1.element.intersection(&el2.element).count(); - } - }; - 0 - } -} - -/* -impl From> for SBT, Signature> -where - U: Default + Clone, -{ - fn from(other: LinearIndex) -> Self { - let storage = other.storage(); - scaffold(other.datasets, storage) - } -} -*/ diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index 66de82e6a0..5eebef3b0a 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -26,6 +26,8 @@ pub mod prelude; pub mod cmd; +pub mod manifest; +pub mod picklist; pub mod signature; pub mod sketch; pub mod storage; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs new file mode 100644 index 0000000000..ce740c638b --- /dev/null +++ b/src/core/src/manifest.rs @@ -0,0 +1,186 @@ +use std::convert::TryInto; +use std::io::Read; +use std::ops::Deref; +use std::path::PathBuf; + +use serde::de; +use serde::{Deserialize, Serialize}; + +use crate::encodings::HashFunctions; +use crate::index::Selection; +use crate::Error; + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct Record { + internal_location: String, + ksize: u32, + + #[serde(deserialize_with = "to_bool")] + with_abundance: bool, + + md5: String, + name: String, + moltype: String, + /* + md5short: String, + num: String, + scaled: String, + n_hashes: String, + filename: String, + */ +} + +fn to_bool<'de, D>(deserializer: D) -> Result +where + D: de::Deserializer<'de>, +{ + match String::deserialize(deserializer)? + .to_ascii_lowercase() + .as_ref() + { + "0" | "false" => Ok(false), + "1" | "true" => Ok(true), + other => Err(de::Error::invalid_value( + de::Unexpected::Str(other), + &"0/1 or true/false are the only supported values", + )), + } +} + +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +pub struct Manifest { + records: Vec, +} + +impl Record { + pub fn internal_location(&self) -> PathBuf { + self.internal_location.clone().into() + } + + pub fn ksize(&self) -> u32 { + self.ksize + } + + pub fn with_abundance(&self) -> bool { + self.with_abundance + } + + pub fn md5(&self) -> &str { + self.md5.as_ref() + } + + pub fn name(&self) -> &str { + self.name.as_ref() + } + + pub fn moltype(&self) -> HashFunctions { + self.moltype.as_str().try_into().unwrap() + } +} + +impl Manifest { + pub fn from_reader(rdr: R) -> Result { + let mut records = vec![]; + + let mut rdr = csv::ReaderBuilder::new() + .comment(Some(b'#')) + .from_reader(rdr); + for result in rdr.deserialize() { + let record: Record = result?; + records.push(record); + } + Ok(Manifest { records }) + } + + pub fn internal_locations(&self) -> impl Iterator { + self.records.iter().map(|r| r.internal_location.as_str()) + } + + pub fn iter(&self) -> impl Iterator { + self.records.iter() + } + + pub fn select_to_manifest(&self, selection: &Selection) -> Result { + let rows = self.records.iter().filter(|row| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + row.ksize == ksize + } else { + valid + }; + valid = if let Some(abund) = selection.abund() { + valid && row.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && row.moltype() == moltype + } else { + valid + }; + valid + }); + + Ok(Manifest { + records: rows.cloned().collect(), + }) + + /* + matching_rows = self.rows + if ksize: + matching_rows = ( row for row in matching_rows + if row['ksize'] == ksize ) + if moltype: + matching_rows = ( row for row in matching_rows + if row['moltype'] == moltype ) + if scaled or containment: + if containment and not scaled: + raise ValueError("'containment' requires 'scaled' in Index.select'") + + matching_rows = ( row for row in matching_rows + if row['scaled'] and not row['num'] ) + if num: + matching_rows = ( row for row in matching_rows + if row['num'] and not row['scaled'] ) + + if abund: + # only need to concern ourselves if abundance is _required_ + matching_rows = ( row for row in matching_rows + if row['with_abundance'] ) + + if picklist: + matching_rows = ( row for row in matching_rows + if picklist.matches_manifest_row(row) ) + + # return only the internal filenames! + for row in matching_rows: + yield row + */ + } +} + +impl From<&[PathBuf]> for Manifest { + fn from(v: &[PathBuf]) -> Self { + Manifest { + records: v + .iter() + .map(|p| Record { + internal_location: p.to_str().unwrap().into(), + ksize: 0, // FIXME + with_abundance: false, // FIXME + md5: "".into(), // FIXME + name: "".into(), // FIXME + moltype: "".into(), // FIXME + }) + .collect(), + } + } +} + +impl Deref for Manifest { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.records + } +} diff --git a/src/core/src/picklist.rs b/src/core/src/picklist.rs new file mode 100644 index 0000000000..a4e8b3815e --- /dev/null +++ b/src/core/src/picklist.rs @@ -0,0 +1,35 @@ +use getset::{CopyGetters, Getters, Setters}; +use typed_builder::TypedBuilder; + +#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone)] +pub struct Picklist { + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + coltype: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + pickfile: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + column_name: String, + + #[getset(get = "pub", set = "pub")] + #[builder] + pickstyle: PickStyle, +} + +#[derive(Clone)] +#[repr(u32)] +pub enum PickStyle { + Include = 1, + Exclude = 2, +} + +// TODO: remove with MSRV 1.62 and use derive(Default) instead +impl std::default::Default for PickStyle { + fn default() -> Self { + PickStyle::Include + } +} diff --git a/src/core/src/prelude.rs b/src/core/src/prelude.rs index ef7d4aa27b..eb265d42ee 100644 --- a/src/core/src/prelude.rs +++ b/src/core/src/prelude.rs @@ -5,6 +5,8 @@ use crate::Error; pub use crate::signature::Signature; pub use crate::storage::Storage; +pub use crate::sketch::minhash::{AbundMinHashOps, FracMinHashOps, MinHashOps}; + pub trait ToWriter { fn to_writer(&self, writer: &mut W) -> Result<(), Error> where diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index db2a85ea05..bacceb848d 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -2,6 +2,8 @@ //! //! A signature is a collection of sketches for a genomic dataset. +use core::iter::FusedIterator; + use std::fs::File; use std::io; use std::iter::Iterator; @@ -20,6 +22,8 @@ use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; +// TODO: this is the behavior expected from Sketch, but that name is already +// used. Sketchable? pub trait SigsTrait { fn size(&self) -> usize; fn to_vec(&self) -> Vec; @@ -28,6 +32,16 @@ pub trait SigsTrait { fn seed(&self) -> u64; fn hash_function(&self) -> HashFunctions; + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error>; + fn is_protein(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_protein + } + fn dayhoff(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_dayhoff + } + fn hp(&self) -> bool { + self.hash_function() == HashFunctions::murmur64_hp + } fn add_hash(&mut self, hash: HashIntoType); @@ -117,6 +131,14 @@ impl SigsTrait for Sketch { } } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + match *self { + Sketch::MinHash(ref mut mh) => mh.set_hash_function(h), + Sketch::LargeMinHash(ref mut mh) => mh.set_hash_function(h), + Sketch::HyperLogLog(ref mut hll) => hll.set_hash_function(h), + } + } + fn add_hash(&mut self, hash: HashIntoType) { match *self { Sketch::MinHash(ref mut mh) => mh.add_hash(hash), @@ -395,6 +417,10 @@ impl Iterator for SeqToHashes { } #[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct Signature { #[serde(default = "default_class")] #[builder(default = default_class())] @@ -654,6 +680,92 @@ impl Signature { Ok(()) } + + pub fn iter_mut(&mut self) -> IterMut<'_> { + let length = self.signatures.len(); + IterMut { + iter: self.signatures.iter_mut(), + length, + } + } + + pub fn iter<'a>(&'a mut self) -> Iter<'a> { + let length = self.signatures.len(); + Iter { + iter: self.signatures.iter(), + length, + } + } +} + +pub struct IterMut<'a> { + iter: std::slice::IterMut<'a, Sketch>, + length: usize, +} + +impl<'a> IntoIterator for &'a mut Signature { + type Item = &'a mut Sketch; + type IntoIter = IterMut<'a>; + + fn into_iter(self) -> IterMut<'a> { + self.iter_mut() + } +} + +impl<'a> Iterator for IterMut<'a> { + type Item = &'a mut Sketch; + + fn next(&mut self) -> Option<&'a mut Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +pub struct Iter<'a> { + iter: std::slice::Iter<'a, Sketch>, + length: usize, +} + +impl<'a> Iterator for Iter<'a> { + type Item = &'a Sketch; + + fn next(&mut self) -> Option<&'a Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +impl FusedIterator for Iter<'_> {} + +impl ExactSizeIterator for Iter<'_> { + fn len(&self) -> usize { + self.length + } +} + +impl Clone for Iter<'_> { + fn clone(&self) -> Self { + Iter { + iter: self.iter.clone(), + length: self.length, + } + } } impl ToWriter for Signature { @@ -683,6 +795,8 @@ impl Default for Signature { impl PartialEq for Signature { fn eq(&self, other: &Signature) -> bool { + use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; + let metadata = self.class == other.class && self.email == other.email && self.hash_function == other.hash_function @@ -691,14 +805,25 @@ impl PartialEq for Signature { // TODO: find the right signature // as long as we have a matching - if let Sketch::MinHash(mh) = &self.signatures[0] { - if let Sketch::MinHash(other_mh) = &other.signatures[0] { - return metadata && (mh == other_mh); - } - } else { - unimplemented!() + match &self.signatures[0] { + Sketch::MinHash(mh) => match &other.signatures[0] { + Sketch::MinHash(other_mh) => return metadata && (mh == other_mh), + Sketch::LargeMinHash(other_mh) => { + // TODO: avoid clone + metadata && (mh == &Into::::into(other_mh.clone())) + } + Sketch::HyperLogLog(_) => todo!(), + }, + Sketch::LargeMinHash(mh) => match &other.signatures[0] { + Sketch::LargeMinHash(other_mh) => return metadata && (mh == other_mh), + Sketch::MinHash(other_mh) => { + // TODO: avoid clone + metadata && (mh == &Into::::into(other_mh.clone())) + } + Sketch::HyperLogLog(_) => todo!(), + }, + Sketch::HyperLogLog(_) => todo!(), } - metadata } } diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index 409d2a2c44..85436ff52f 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -18,7 +18,7 @@ use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; use crate::prelude::*; use crate::signature::SigsTrait; -use crate::sketch::KmerMinHash; +use crate::sketch::{KmerMinHash, KmerMinHashBTree}; use crate::Error; use crate::HashIntoType; @@ -26,6 +26,10 @@ pub mod estimators; use estimators::CounterType; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct HyperLogLog { registers: Vec, p: usize, @@ -183,6 +187,16 @@ impl SigsTrait for HyperLogLog { HashFunctions::murmur64_DNA } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + //TODO support other hash functions + if h != HashFunctions::murmur64_DNA { + return Err(Error::InvalidHashFunction { + function: h.to_string(), + }); + } + Ok(()) + } + fn add_hash(&mut self, hash: HashIntoType) { let value = hash >> self.p; let index = (hash - (value << self.p)) as usize; @@ -208,6 +222,15 @@ impl SigsTrait for HyperLogLog { } } +impl Update for KmerMinHashBTree { + fn update(&self, other: &mut HyperLogLog) -> Result<(), Error> { + for h in self.mins() { + other.add_hash(h); + } + Ok(()) + } +} + impl Update for KmerMinHash { fn update(&self, other: &mut HyperLogLog) -> Result<(), Error> { for h in self.mins() { diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 5c5f1114f8..ea3ae911b9 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -11,13 +11,13 @@ use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::_hash_murmur; use crate::encodings::HashFunctions; use crate::signature::SigsTrait; use crate::sketch::hyperloglog::HyperLogLog; use crate::Error; +use crate::{HashIntoType, _hash_murmur}; -pub fn max_hash_for_scaled(scaled: u64) -> u64 { +pub fn max_hash_for_scaled(scaled: u64) -> HashIntoType { match scaled { 0 => 0, 1 => u64::max_value(), @@ -25,14 +25,158 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 { } } -pub fn scaled_for_max_hash(max_hash: u64) -> u64 { +pub fn scaled_for_max_hash(max_hash: HashIntoType) -> u64 { match max_hash { 0 => 0, _ => (u64::max_value() as f64 / max_hash as f64) as u64, } } +pub trait MinHashOps: SigsTrait { + fn clear(&mut self); + fn is_empty(&self) -> bool; + fn reset_md5sum(&self); + fn md5sum(&self) -> String; + + fn mins(&self) -> Vec; + + fn add_word(&mut self, word: &[u8]) { + let hash = _hash_murmur(word, self.seed()); + self.add_hash(hash); + } + + fn remove_hash(&mut self, hash: HashIntoType); + + fn remove_many(&mut self, hashes: &[HashIntoType]) -> Result<(), Error> { + for min in hashes { + self.remove_hash(*min); + } + Ok(()) + } + + fn add_many(&mut self, hashes: &[HashIntoType]) -> Result<(), Error> { + for min in hashes { + self.add_hash(*min); + } + Ok(()) + } + + /* TODO(lirber): these need to avoid KmerMinHash and be more generic + + // TODO: use iterator + fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn add_from(&mut self, other: &KmerMinHash) -> Result<(), Error>; + fn count_common(&self, other: &KmerMinHash, downsample: bool) -> Result; + fn intersection(&self, other: &KmerMinHash) -> Result<(Vec, u64), Error>; + + // FIXME: intersection_size and count_common should be the same? + // (for scaled minhashes) + fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error>; + + // calculate Jaccard similarity, ignoring abundance. + fn jaccard(&self, other: &KmerMinHash) -> Result; + + fn similarity( + &self, + other: &KmerMinHash, + ignore_abundance: bool, + downsample: bool, + ) -> Result; + */ + + fn as_hll(&self) -> HyperLogLog; +} + +pub trait AbundMinHashOps: MinHashOps { + fn track_abundance(&self) -> bool; + fn enable_abundance(&mut self) -> Result<(), Error>; + fn disable_abundance(&mut self); + fn add_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64); + fn set_hash_with_abundance(&mut self, hash: HashIntoType, abundance: u64); + fn add_many_with_abund(&mut self, hashes: &[(HashIntoType, u64)]) -> Result<(), Error> { + for item in hashes { + self.add_hash_with_abundance(item.0, item.1); + } + Ok(()) + } + + fn abunds(&self) -> Option>; + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)>; + + // compare two minhashes, with abundance; + // calculate their angular similarity. + fn angular_similarity(&self, other: &A) -> Result { + // TODO(lirber): bring back compat check once method sig changes + //self.check_compatible(other)?; + + if !self.track_abundance() || !other.track_abundance() { + return Err(Error::NeedsAbundanceTracking); + } + + // TODO: check which one is smaller, swap around if needed + // TODO(lirber): use iters here, instead of allocating new vecs! + let abunds = self.to_vec_abunds(); + let other_abunds = other.to_vec_abunds(); + + let mut prod = 0; + let mut other_iter = other_abunds.iter(); + let mut next_hash = other_iter.next(); + let a_sq: u64 = abunds.iter().map(|(_hash, abund)| (abund * abund)).sum(); + let b_sq: u64 = other_abunds + .iter() + .map(|(_hash, abund)| (abund * abund)) + .sum(); + + for (hash, abund) in abunds { + while let Some((k, other_abund)) = next_hash { + match k.cmp(&hash) { + Ordering::Less => next_hash = other_iter.next(), + Ordering::Equal => { + prod += abund * other_abund; + break; + } + Ordering::Greater => break, + } + } + } + + let norm_a = (a_sq as f64).sqrt(); + let norm_b = (b_sq as f64).sqrt(); + + if norm_a == 0. || norm_b == 0. { + return Ok(0.0); + } + let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); + let distance = 2. * prod.acos() / PI; + Ok(1. - distance) + } +} + +pub trait FracMinHashOps: MinHashOps { + fn max_hash(&self) -> HashIntoType; + fn scaled(&self) -> u64 { + scaled_for_max_hash(self.max_hash()) + } + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result + where + Self: Sized; + + // create a downsampled copy of self + fn downsample_scaled(&self, scaled: u64) -> Result + where + Self: Sized, + { + let max_hash = max_hash_for_scaled(scaled); + self.downsample_max_hash(max_hash) + } +} + #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHash { num: u32, ksize: u32, @@ -53,6 +197,8 @@ pub struct KmerMinHash { abunds: Option>, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } @@ -215,197 +361,12 @@ impl KmerMinHash { self.num } - pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein - } - - pub fn max_hash(&self) -> u64 { - self.max_hash - } - pub fn scaled(&self) -> u64 { scaled_for_max_hash(self.max_hash) } - pub fn clear(&mut self) { - self.mins.clear(); - if let Some(ref mut abunds) = self.abunds { - abunds.clear(); - } - } - - pub fn is_empty(&self) -> bool { - self.mins.is_empty() - } - - pub fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { - if self.hash_function == h { - return Ok(()); - } - - if !self.is_empty() { - return Err(Error::NonEmptyMinHash { - message: "hash_function".into(), - }); - } - - self.hash_function = h; - Ok(()) - } - - pub fn track_abundance(&self) -> bool { - self.abunds.is_some() - } - - pub fn enable_abundance(&mut self) -> Result<(), Error> { - if !self.mins.is_empty() { - return Err(Error::NonEmptyMinHash { - message: "track_abundance=True".into(), - }); - } - - self.abunds = Some(vec![]); - - Ok(()) - } - - pub fn disable_abundance(&mut self) { - self.abunds = None; - } - - fn reset_md5sum(&self) { - let mut data = self.md5sum.lock().unwrap(); - if data.is_some() { - *data = None; - } - } - - pub fn md5sum(&self) -> String { - let mut data = self.md5sum.lock().unwrap(); - if data.is_none() { - let mut buffer = String::with_capacity(20); - - let mut md5_ctx = md5::Context::new(); - write!(&mut buffer, "{}", self.ksize()).unwrap(); - md5_ctx.consume(&buffer); - buffer.clear(); - for x in &self.mins { - write!(&mut buffer, "{}", x).unwrap(); - md5_ctx.consume(&buffer); - buffer.clear(); - } - *data = Some(format!("{:x}", md5_ctx.compute())); - } - data.clone().unwrap() - } - - pub fn add_hash(&mut self, hash: u64) { - self.add_hash_with_abundance(hash, 1); - } - - pub fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { - let current_max = match self.mins.last() { - Some(&x) => x, - None => u64::max_value(), - }; - - if hash > self.max_hash && self.max_hash != 0 { - // This is a scaled minhash, and we don't need to add the new hash - return; - } - - if self.num == 0 && self.max_hash == 0 { - // why did you create this minhash? it will always be empty... - return; - } - - if abundance == 0 { - self.remove_hash(hash); - return; - } - - // From this point on, hash is within scaled (or no scaled specified). - - // empty mins? add it. - if self.mins.is_empty() { - self.mins.push(hash); - if let Some(ref mut abunds) = self.abunds { - abunds.push(abundance); - self.reset_md5sum(); - } - return; - } - - if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num { - // "good" hash - within range, smaller than current entry, or - // still have space available - let pos = match self.mins.binary_search(&hash) { - Ok(p) => p, - Err(p) => p, - }; - - if pos == self.mins.len() { - // at end - must still be growing, we know the list won't - // get too long - self.mins.push(hash); - self.reset_md5sum(); - if let Some(ref mut abunds) = self.abunds { - abunds.push(abundance); - } - } else if self.mins[pos] != hash { - // didn't find hash in mins, so inserting somewhere - // in the middle; shrink list if needed. - self.mins.insert(pos, hash); - if let Some(ref mut abunds) = self.abunds { - abunds.insert(pos, abundance); - } - - // is it too big now? - if self.num != 0 && self.mins.len() > (self.num as usize) { - self.mins.pop(); - if let Some(ref mut abunds) = self.abunds { - abunds.pop(); - } - } - self.reset_md5sum(); - } else if let Some(ref mut abunds) = self.abunds { - // pos == hash: hash value already in mins, inc count by abundance - abunds[pos] += abundance; - } - } - } - - pub fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { - let mut found = false; - if let Ok(pos) = self.mins.binary_search(&hash) { - if self.mins[pos] == hash { - found = true; - if let Some(ref mut abunds) = self.abunds { - abunds[pos] = abundance; - } - } - } - - if !found { - self.add_hash_with_abundance(hash, abundance); - } - } - - pub fn add_word(&mut self, word: &[u8]) { - let hash = _hash_murmur(word, self.seed); - self.add_hash(hash); - } - - pub fn remove_hash(&mut self, hash: u64) { - if let Ok(pos) = self.mins.binary_search(&hash) { - if self.mins[pos] == hash { - self.mins.remove(pos); - self.reset_md5sum(); - if let Some(ref mut abunds) = self.abunds { - abunds.remove(pos); - } - } - }; + pub fn iter_mins(&self) -> impl Iterator { + self.mins.iter() } pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { @@ -415,13 +376,6 @@ impl KmerMinHash { Ok(()) } - pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.remove_hash(*min); - } - Ok(()) - } - pub fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error> { self.check_compatible(other)?; let max_size = self.mins.len() + other.mins.len(); @@ -530,20 +484,6 @@ impl KmerMinHash { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { - for item in hashes { - self.add_hash_with_abundance(item.0, item.1); - } - Ok(()) - } - pub fn count_common(&self, other: &KmerMinHash, downsample: bool) -> Result { if downsample && self.max_hash != other.max_hash { let (first, second) = if self.max_hash < other.max_hash { @@ -638,55 +578,6 @@ impl KmerMinHash { } } - // compare two minhashes, with abundance; - // calculate their angular similarity. - pub fn angular_similarity(&self, other: &KmerMinHash) -> Result { - self.check_compatible(other)?; - - if self.abunds.is_none() || other.abunds.is_none() { - return Err(Error::NeedsAbundanceTracking); - } - - // TODO: check which one is smaller, swap around if needed - - let abunds = self.abunds.as_ref().unwrap(); - let other_abunds = other.abunds.as_ref().unwrap(); - - let mut prod = 0; - let mut other_iter = other.mins.iter().enumerate(); - let mut next_hash = other_iter.next(); - let a_sq: u64 = abunds.iter().map(|a| (a * a)).sum(); - let b_sq: u64 = other_abunds.iter().map(|a| (a * a)).sum(); - - for (i, hash) in self.mins.iter().enumerate() { - while let Some((j, k)) = next_hash { - match k.cmp(hash) { - Ordering::Less => next_hash = other_iter.next(), - Ordering::Equal => { - // Calling `get_unchecked` here is safe since - // both `i` and `j` are valid indices - // (`i` and `j` came from valid iterator calls) - unsafe { - prod += abunds.get_unchecked(i) * other_abunds.get_unchecked(j); - } - break; - } - Ordering::Greater => break, - } - } - } - - let norm_a = (a_sq as f64).sqrt(); - let norm_b = (b_sq as f64).sqrt(); - - if norm_a == 0. || norm_b == 0. { - return Ok(0.0); - } - let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); - let distance = 2. * prod.acos() / PI; - Ok(1. - distance) - } - pub fn similarity( &self, other: &KmerMinHash, @@ -700,36 +591,25 @@ impl KmerMinHash { (other, self) }; let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.check_compatible(&downsampled_mh)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { + self.check_compatible(other)?; self.jaccard(other) } else { + self.check_compatible(other)?; self.angular_similarity(other) } } +} - pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff - } - - pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp - } - - pub fn mins(&self) -> Vec { - self.mins.clone() - } - - pub fn iter_mins(&self) -> impl Iterator { - self.mins.iter() - } - - pub fn abunds(&self) -> Option> { - self.abunds.clone() +impl FracMinHashOps for KmerMinHash { + fn max_hash(&self) -> u64 { + self.max_hash } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { let scaled = scaled_for_max_hash(max_hash); let mut new_mh = KmerMinHash::new( @@ -747,8 +627,192 @@ impl KmerMinHash { } Ok(new_mh) } +} + +impl MinHashOps for KmerMinHash { + fn clear(&mut self) { + self.mins.clear(); + if let Some(ref mut abunds) = self.abunds { + abunds.clear(); + } + } + + fn is_empty(&self) -> bool { + self.mins.is_empty() + } + + fn reset_md5sum(&self) { + let mut data = self.md5sum.lock().unwrap(); + if data.is_some() { + *data = None; + } + } + + fn md5sum(&self) -> String { + let mut data = self.md5sum.lock().unwrap(); + if data.is_none() { + let mut buffer = String::with_capacity(20); + + let mut md5_ctx = md5::Context::new(); + write!(&mut buffer, "{}", self.ksize()).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + for x in &self.mins { + write!(&mut buffer, "{}", x).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + } + *data = Some(format!("{:x}", md5_ctx.compute())); + } + data.clone().unwrap() + } + + fn remove_hash(&mut self, hash: u64) { + if let Ok(pos) = self.mins.binary_search(&hash) { + if self.mins[pos] == hash { + self.mins.remove(pos); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(pos); + } + } + }; + } + + fn mins(&self) -> Vec { + self.mins.clone() + } + + fn as_hll(&self) -> HyperLogLog { + let mut hll = HyperLogLog::with_error_rate(0.01, self.ksize()).unwrap(); + + for h in &self.mins { + hll.add_hash(*h) + } + + hll + } +} + +impl AbundMinHashOps for KmerMinHash { + fn track_abundance(&self) -> bool { + self.abunds.is_some() + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + if !self.mins.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "track_abundance=True".into(), + }); + } + + self.abunds = Some(vec![]); + + Ok(()) + } + + fn disable_abundance(&mut self) { + self.abunds = None; + } + + fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + let current_max = match self.mins.last() { + Some(&x) => x, + None => u64::max_value(), + }; + + if hash > self.max_hash && self.max_hash != 0 { + // This is a scaled minhash, and we don't need to add the new hash + return; + } + + if self.num == 0 && self.max_hash == 0 { + // why did you create this minhash? it will always be empty... + return; + } + + if abundance == 0 { + self.remove_hash(hash); + return; + } + + // From this point on, hash is within scaled (or no scaled specified). + + // empty mins? add it. + if self.mins.is_empty() { + self.mins.push(hash); + if let Some(ref mut abunds) = self.abunds { + abunds.push(abundance); + self.reset_md5sum(); + } + return; + } + + if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num { + // "good" hash - within range, smaller than current entry, or + // still have space available + let pos = match self.mins.binary_search(&hash) { + Ok(p) => p, + Err(p) => p, + }; + + if pos == self.mins.len() { + // at end - must still be growing, we know the list won't + // get too long + self.mins.push(hash); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.push(abundance); + } + } else if self.mins[pos] != hash { + // didn't find hash in mins, so inserting somewhere + // in the middle; shrink list if needed. + self.mins.insert(pos, hash); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(pos, abundance); + } + + // is it too big now? + if self.num != 0 && self.mins.len() > (self.num as usize) { + self.mins.pop(); + if let Some(ref mut abunds) = self.abunds { + abunds.pop(); + } + } + self.reset_md5sum(); + } else if let Some(ref mut abunds) = self.abunds { + // pos == hash: hash value already in mins, inc count by abundance + abunds[pos] += abundance; + } + } + } + + fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if abundance == 0 { + self.remove_hash(hash); + return; + } + + let mut found = false; + if let Ok(pos) = self.mins.binary_search(&hash) { + if self.mins[pos] == hash { + found = true; + if let Some(ref mut abunds) = self.abunds { + abunds[pos] = abundance; + } + } + } + + if !found { + self.add_hash_with_abundance(hash, abundance); + } + } + + fn abunds(&self) -> Option> { + self.abunds.clone() + } - pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { + fn to_vec_abunds(&self) -> Vec<(HashIntoType, u64)> { if let Some(abunds) = &self.abunds { self.mins .iter() @@ -763,22 +827,6 @@ impl KmerMinHash { .collect() } } - - pub fn as_hll(&self) -> HyperLogLog { - let mut hll = HyperLogLog::with_error_rate(0.01, self.ksize()).unwrap(); - - for h in &self.mins { - hll.add_hash(*h) - } - - hll - } - - // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) - } } impl SigsTrait for KmerMinHash { @@ -802,6 +850,21 @@ impl SigsTrait for KmerMinHash { self.hash_function } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + + if !self.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "hash_function".into(), + }); + } + + self.hash_function = h; + Ok(()) + } + fn add_hash(&mut self, hash: u64) { self.add_hash_with_abundance(hash, 1); } @@ -927,6 +990,10 @@ mod test { // A MinHash implementation for low scaled or large cardinalities #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHashBTree { num: u32, ksize: u32, @@ -950,6 +1017,8 @@ pub struct KmerMinHashBTree { current_max: u64, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } @@ -1114,8 +1183,8 @@ impl KmerMinHashBTree { self.num } - pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein + pub fn iter_mins(&self) -> impl Iterator { + self.mins.iter() } pub fn max_hash(&self) -> u64 { @@ -1279,6 +1348,13 @@ impl KmerMinHashBTree { Ok(()) } + pub fn remove_from(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { + for min in &other.mins { + self.remove_hash(*min); + } + Ok(()) + } + pub fn merge(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { self.check_compatible(other)?; let union = self.mins.union(&other.mins); @@ -1316,20 +1392,6 @@ impl KmerMinHashBTree { Ok(()) } - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { - for item in hashes { - self.add_hash_with_abundance(item.0, item.1); - } - Ok(()) - } - pub fn count_common(&self, other: &KmerMinHashBTree, downsample: bool) -> Result { if downsample && self.max_hash != other.max_hash { let (first, second) = if self.max_hash < other.max_hash { @@ -1350,7 +1412,6 @@ impl KmerMinHashBTree { Ok(iter.count() as u64) } } - pub fn intersection(&self, other: &KmerMinHashBTree) -> Result<(Vec, u64), Error> { self.check_compatible(other)?; @@ -1423,39 +1484,6 @@ impl KmerMinHashBTree { } } - // compare two minhashes, with abundance; - // calculate their angular similarity. - pub fn angular_similarity(&self, other: &KmerMinHashBTree) -> Result { - self.check_compatible(other)?; - - if self.abunds.is_none() || other.abunds.is_none() { - return Err(Error::NeedsAbundanceTracking); - } - - let abunds = self.abunds.as_ref().unwrap(); - let other_abunds = other.abunds.as_ref().unwrap(); - - let mut prod = 0; - let a_sq: u64 = abunds.values().map(|a| (a * a)).sum(); - let b_sq: u64 = other_abunds.values().map(|a| (a * a)).sum(); - - for (hash, value) in abunds.iter() { - if let Some(oa) = other_abunds.get(hash) { - prod += value * oa - } - } - - let norm_a = (a_sq as f64).sqrt(); - let norm_b = (b_sq as f64).sqrt(); - - if norm_a == 0. || norm_b == 0. { - return Ok(0.0); - } - let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); - let distance = 2. * prod.acos() / PI; - Ok(1. - distance) - } - pub fn similarity( &self, other: &KmerMinHashBTree, @@ -1469,42 +1497,25 @@ impl KmerMinHashBTree { (other, self) }; let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.check_compatible(&downsampled_mh)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { + self.check_compatible(other)?; self.jaccard(other) } else { + self.check_compatible(other)?; self.angular_similarity(other) } } +} - pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff - } - - pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp - } - - pub fn hash_function(&self) -> HashFunctions { - self.hash_function - } - - pub fn mins(&self) -> Vec { - self.mins.iter().cloned().collect() - } - - pub fn iter_mins(&self) -> impl Iterator { - self.mins.iter() - } - - pub fn abunds(&self) -> Option> { - self.abunds - .as_ref() - .map(|abunds| abunds.values().cloned().collect()) +impl FracMinHashOps for KmerMinHashBTree { + fn max_hash(&self) -> u64 { + self.max_hash } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + fn downsample_max_hash(&self, max_hash: HashIntoType) -> Result { let scaled = scaled_for_max_hash(max_hash); let mut new_mh = KmerMinHashBTree::new( @@ -1522,14 +1533,176 @@ impl KmerMinHashBTree { } Ok(new_mh) } +} + +impl MinHashOps for KmerMinHashBTree { + fn clear(&mut self) { + self.mins.clear(); + if let Some(ref mut abunds) = self.abunds { + abunds.clear(); + } + self.current_max = 0; + } - // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) + fn is_empty(&self) -> bool { + self.mins.is_empty() + } + + fn reset_md5sum(&self) { + let mut data = self.md5sum.lock().unwrap(); + if data.is_some() { + *data = None; + } + } + + fn md5sum(&self) -> String { + let mut data = self.md5sum.lock().unwrap(); + if data.is_none() { + let mut buffer = String::with_capacity(20); + + let mut md5_ctx = md5::Context::new(); + write!(&mut buffer, "{}", self.ksize()).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + for x in &self.mins { + write!(&mut buffer, "{}", x).unwrap(); + md5_ctx.consume(&buffer); + buffer.clear(); + } + *data = Some(format!("{:x}", md5_ctx.compute())); + } + data.clone().unwrap() + } + + fn remove_hash(&mut self, hash: u64) { + if self.mins.remove(&hash) { + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&hash); + } + } + if hash == self.current_max { + self.current_max = *self.mins.iter().rev().next().unwrap_or(&0); + } + } + + fn mins(&self) -> Vec { + self.mins.iter().cloned().collect() + } + + fn as_hll(&self) -> HyperLogLog { + let mut hll = HyperLogLog::with_error_rate(0.01, self.ksize()).unwrap(); + + for h in &self.mins { + hll.add_hash(*h) + } + + hll + } +} + +impl AbundMinHashOps for KmerMinHashBTree { + fn track_abundance(&self) -> bool { + self.abunds.is_some() + } + + fn enable_abundance(&mut self) -> Result<(), Error> { + if !self.mins.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "track_abundance=True".into(), + }); + } + + self.abunds = Some(Default::default()); + + Ok(()) + } + + fn disable_abundance(&mut self) { + self.abunds = None; + } + + fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if hash > self.max_hash && self.max_hash != 0 { + // This is a scaled minhash, and we don't need to add the new hash + return; + } + + if self.num == 0 && self.max_hash == 0 { + // why did you create this minhash? it will always be empty... + return; + } + + if abundance == 0 { + self.remove_hash(hash); + return; + } + + // From this point on, hash is within scaled (or no scaled specified). + + // empty mins? add it. + if self.mins.is_empty() { + self.mins.insert(hash); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(hash, abundance); + } + self.current_max = hash; + return; + } + + if hash <= self.max_hash || hash <= self.current_max || (self.mins.len() as u32) < self.num + { + // "good" hash - within range, smaller than current entry, or + // still have space available + if self.mins.insert(hash) { + self.reset_md5sum(); + if hash > self.current_max { + self.current_max = hash; + } + } + if let Some(ref mut abunds) = self.abunds { + *abunds.entry(hash).or_insert(0) += abundance; + } + + // is it too big now? + if self.num != 0 && self.mins.len() > (self.num as usize) { + let last = *self.mins.iter().rev().next().unwrap(); + self.mins.remove(&last); + self.reset_md5sum(); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&last); + } + self.current_max = *self.mins.iter().rev().next().unwrap(); + } + } + } + + fn set_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if abundance == 0 { + self.remove_hash(hash); + return; + } + + if self.mins.contains(&hash) { + if let Some(ref mut abunds) = self.abunds { + abunds + .entry(hash) + .and_modify(|v| *v = abundance) + .or_insert_with(|| abundance); + } + } else { + self.add_hash_with_abundance(hash, abundance); + } + } + + fn abunds(&self) -> Option> { + self.abunds + .as_ref() + .map(|abunds| abunds.values().cloned().collect()) } - pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { + fn to_vec_abunds(&self) -> Vec<(u64, u64)> { if let Some(abunds) = &self.abunds { abunds.iter().map(|(a, b)| (*a, *b)).collect() } else { @@ -1563,6 +1736,21 @@ impl SigsTrait for KmerMinHashBTree { self.hash_function } + fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + + if !self.is_empty() { + return Err(Error::NonEmptyMinHash { + message: "hash_function".into(), + }); + } + + self.hash_function = h; + Ok(()) + } + fn add_hash(&mut self, hash: u64) { self.add_hash_with_abundance(hash, 1); } diff --git a/src/core/src/sketch/mod.rs b/src/core/src/sketch/mod.rs index 09bd51085c..3ef04e43df 100644 --- a/src/core/src/sketch/mod.rs +++ b/src/core/src/sketch/mod.rs @@ -10,6 +10,10 @@ use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub enum Sketch { MinHash(KmerMinHash), LargeMinHash(KmerMinHashBTree), diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index cbca8915ba..bbfef5cd0d 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -7,7 +7,7 @@ use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt} use fixedbitset::FixedBitSet; use crate::prelude::*; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; use crate::Error; use crate::HashIntoType; @@ -58,6 +58,15 @@ impl Update for KmerMinHash { } } +impl Update for KmerMinHashBTree { + fn update(&self, other: &mut Nodegraph) -> Result<(), Error> { + for h in self.mins() { + other.count(h); + } + Ok(()) + } +} + impl Nodegraph { pub fn new(tablesizes: &[usize], ksize: usize) -> Nodegraph { let mut bs = Vec::with_capacity(tablesizes.len()); diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index ec82464c5b..46d7114e4f 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -3,8 +3,7 @@ use std::ffi::OsStr; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; use std::path::{Path, PathBuf}; -use std::rc::Rc; -use std::sync::RwLock; +use std::sync::{Arc, RwLock}; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -25,11 +24,11 @@ pub trait Storage { } #[derive(Clone)] -pub struct InnerStorage(Rc>); +pub struct InnerStorage(Arc>); impl InnerStorage { - pub fn new(inner: impl Storage + 'static) -> InnerStorage { - InnerStorage(Rc::new(RwLock::new(inner))) + pub fn new(inner: impl Storage + Send + Sync + 'static) -> InnerStorage { + InnerStorage(Arc::new(RwLock::new(inner))) } } diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index bcb3fdb4fa..50c03870e0 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -6,6 +6,7 @@ use proptest::collection::vec; use proptest::num::u64; use proptest::proptest; use sourmash::encodings::HashFunctions; +use sourmash::prelude::*; use sourmash::signature::SeqToHashes; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{ diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 5a60e02fcc..a27fa27b14 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -42,3 +42,41 @@ fn zipstorage_list_sbts() -> Result<(), Box> { Ok(()) } + +#[cfg(feature = "parallel")] +#[test] +fn zipstorage_parallel_access() -> Result<(), Box> { + use std::io::BufReader; + + use rayon::prelude::*; + use sourmash::signature::{Signature, SigsTrait}; + use sourmash::sketch::minhash::KmerMinHash; + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/v6.sbt.zip"); + + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; + + let total_hashes: usize = [ + ".sbt.v3/f71e78178af9e45e6f1d87a0c53c465c", + ".sbt.v3/f0c834bc306651d2b9321fb21d3e8d8f", + ".sbt.v3/4e94e60265e04f0763142e20b52c0da1", + ".sbt.v3/6d6e87e1154e95b279e5e7db414bc37b", + ".sbt.v3/0107d767a345eff67ecdaed2ee5cd7ba", + ".sbt.v3/b59473c94ff2889eca5d7165936e64b3", + ".sbt.v3/60f7e23c24a8d94791cc7a8680c493f9", + ] + .par_iter() + .map(|path| { + let data = zs.load(path).unwrap(); + let sigs: Vec = serde_json::from_reader(&data[..]).expect("Loading error"); + sigs.iter() + .map(|v| v.sketches().iter().map(|mh| mh.size()).sum::()) + .sum::() + }) + .sum(); + + assert_eq!(total_hashes, 3500); + + Ok(()) +} diff --git a/src/sourmash/hll.py b/src/sourmash/hll.py index c98ded5e8b..8e593e0f51 100644 --- a/src/sourmash/hll.py +++ b/src/sourmash/hll.py @@ -43,8 +43,10 @@ def add(self, h): def update(self, other): if isinstance(other, HLL): return self._methodcall(lib.hll_merge, other._objptr) - elif isinstance(other, MinHash): + elif isinstance(other, FrozenMinHash): return self._methodcall(lib.hll_update_mh, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.hll_update_mh, other.to_frozen()._objptr) else: # FIXME: we could take sets here too (or anything that can be # converted to a list of ints...) diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 08068255e5..4aba54b630 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -34,10 +34,15 @@ CounterGather - an ancillary class returned by the 'counter_gather()' method. """ +from __future__ import annotations + import os import sourmash from abc import abstractmethod, ABC -from collections import namedtuple, Counter +from collections import Counter +from collections import defaultdict +from typing import NamedTuple, Optional, TypedDict, TYPE_CHECKING +import weakref from sourmash.search import (make_jaccard_search_query, make_containment_query, @@ -45,12 +50,79 @@ from sourmash.manifest import CollectionManifest from sourmash.logging import debug_literal from sourmash.signature import load_signatures, save_signatures +from sourmash._lowlevel import ffi, lib +from sourmash.utils import RustObject, rustcall, decode_str, encode_str +from sourmash import SourmashSignature +from sourmash.picklist import SignaturePicklist from sourmash.minhash import (flatten_and_downsample_scaled, flatten_and_downsample_num, flatten_and_intersect_scaled) -# generic return tuple for Index.search and Index.gather -IndexSearchResult = namedtuple('Result', 'score, signature, location') + +if TYPE_CHECKING: + from typing_extensions import Unpack + + +class IndexSearchResult(NamedTuple): + """generic return tuple for Index.search and Index.gather""" + score: float + signature: SourmashSignature + location: str + + +class Selection(TypedDict): + ksize: Optional[int] + moltype: Optional[str] + num: Optional[int] + scaled: Optional[int] + containment: Optional[bool] + abund: Optional[bool] + picklist: Optional[SignaturePicklist] + + +# TypedDict can't have methods (it is a dict in runtime) +def _selection_as_rust(selection: Selection): + ptr = lib.selection_new() + + for key, v in selection.items(): + if v is not None: + if key == "ksize": + rustcall(lib.selection_set_ksize, ptr, v) + + elif key == "moltype": + hash_function = None + if v.lower() == "dna": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA + elif v.lower() == "protein": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + elif v.lower() == "dayhoff": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + elif v.lower() == "hp": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP + + rustcall(lib.selection_set_moltype, ptr, hash_function) + + elif key == "num": + rustcall(lib.selection_set_num, ptr, v) + + elif key == "scaled": + rustcall(lib.selection_set_scaled, ptr, v) + + elif key == "containment": + rustcall(lib.selection_set_containment, ptr, v) + + elif key == "abund": + rustcall(lib.selection_set_abund, ptr, bool(v)) + + elif key == "picklist": + picklist_ptr = v._as_rust() + rustcall(lib.selection_set_picklist, ptr, picklist_ptr) + + else: + raise KeyError(f"Unsupported key {key} for Selection in rust") + + return ptr + class Index(ABC): # this will be removed soon; see sourmash#1894. @@ -307,8 +379,7 @@ def counter_gather(self, query, threshold_bp, **kwargs): return counter @abstractmethod - def select(self, ksize=None, moltype=None, scaled=None, num=None, - abund=None, containment=None): + def select(self, **kwargs: Unpack[Selection]): """Return Index containing only signatures that match requirements. Current arguments can be any or all of: @@ -326,9 +397,16 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, """ -def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): +def select_signature(ss, **kwargs: Unpack[Selection]): "Check that the given signature matches the specified requirements." + ksize = kwargs.get('ksize') + moltype = kwargs.get('moltype') + containment = kwargs.get('containment', False) + scaled = kwargs.get('scaled', 0) + num = kwargs.get('num', 0) + abund = kwargs.get('abund') + picklist = kwargs.get('picklist') + # ksize match? if ksize and ksize != ss.minhash.ksize: return False @@ -408,7 +486,7 @@ def load(cls, location, filename=None): lidx = LinearIndex(si, filename=filename) return lidx - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): """Return new LinearIndex containing only signatures that match req's. Does not raise ValueError, but may return an empty Index. @@ -479,7 +557,7 @@ def save(self, path): def load(cls, path): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): """Return new object yielding only signatures that match req's. Does not raise ValueError, but may return an empty Index. @@ -642,7 +720,7 @@ def signatures(self): if select(ss): yield ss - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Select signatures in zip file based on ksize/moltype/etc." # if we have a manifest, run 'select' on the manifest. @@ -1053,7 +1131,7 @@ def load_from_pathlist(cls, filename): def save(self, *args): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) return MultiIndex(new_manifest, self.parent, @@ -1162,8 +1240,135 @@ def save(self, *args): def insert(self, *args): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) return StandaloneManifestIndex(new_manifest, self._location, prefix=self.prefix) + +class RustLinearIndex(Index, RustObject): + """\ + A read-only collection of signatures in a zip file. + + Does not support `insert` or `save`. + + Concrete class; signatures dynamically loaded from disk; uses manifests. + """ + is_database = True + + __dealloc_func__ = lib.linearindex_free + + def __init__(self, storage, *, selection_dict=None, + traverse_yield_all=False, manifest=None, use_manifest=True): + + self._selection_dict = selection_dict + self._traverse_yield_all = traverse_yield_all + self._use_manifest = use_manifest + + # Taking ownership of the storage + storage_ptr = storage._take_objptr() + + manifest_ptr = ffi.NULL + # do we have a manifest already? if not, try loading. + if use_manifest: + if manifest is not None: + debug_literal('RustLinearIndex using passed-in manifest') + manifest_ptr = manifest._as_rust()._take_objptr() + + selection_ptr = ffi.NULL + + self._objptr = rustcall(lib.linearindex_new, storage_ptr, + manifest_ptr, selection_ptr, use_manifest) + + """ + if self.manifest is not None: + assert not self.selection_dict, self.selection_dict + if self.selection_dict: + assert self.manifest is None + """ + + @property + def manifest(self): + return CollectionManifest._from_rust(self._methodcall(lib.linearindex_manifest)) + + @manifest.setter + def manifest(self, value): + if value is None: + return # FIXME: can't unset manifest in a Rust Linear Index + self._methodcall(lib.linearindex_set_manifest, value._as_rust()._take_objptr()) + + def __bool__(self): + "Are there any matching signatures in this zipfile? Avoid calling len." + return self._methodcall(lib.linearindex_len) > 0 + + def __len__(self): + "calculate number of signatures." + return self._methodcall(lib.linearindex_len) + + @property + def location(self): + return decode_str(self._methodcall(lib.linearindex_location)) + + @property + def storage(self): + from ..sbt_storage import ZipStorage + + ptr = self._methodcall(lib.linearindex_storage) + return ZipStorage._from_objptr(ptr) + + def insert(self, signature): + raise NotImplementedError + + def save(self, path): + raise NotImplementedError + + @classmethod + def load(cls, location, traverse_yield_all=False, use_manifest=True): + "Class method to load a zipfile." + from ..sbt_storage import ZipStorage + + # we can only load from existing zipfiles in this method. + if not os.path.exists(location): + raise FileNotFoundError(location) + + storage = ZipStorage(location) + return cls(storage, traverse_yield_all=traverse_yield_all, + use_manifest=use_manifest) + + def _signatures_with_internal(self): + """Return an iterator of tuples (ss, internal_location). + + Note: does not limit signatures to subsets. + """ + # list all the files, without using the Storage interface; currently, + # 'Storage' does not provide a way to list all the files, so :shrug:. + for filename in self.storage._filenames(): + # should we load this file? if it ends in .sig OR we are forcing: + if filename.endswith('.sig') or \ + filename.endswith('.sig.gz') or \ + self._traverse_yield_all: + sig_data = self.storage.load(filename) + for ss in load_signatures(sig_data): + yield ss, filename + + def signatures(self): + "Load all signatures in the zip file." + attached_refs = weakref.WeakKeyDictionary() + iterator = self._methodcall(lib.linearindex_signatures) + + next_sig = rustcall(lib.signatures_iter_next, iterator) + while next_sig != ffi.NULL: + attached_refs[next_sig] = iterator + yield SourmashSignature._from_objptr(next_sig) + next_sig = rustcall(lib.signatures_iter_next, iterator) + + def select(self, **kwargs: Unpack[Selection]): + "Select signatures in zip file based on ksize/moltype/etc." + + selection = _selection_as_rust(kwargs) + + # select consumes the current index + ptr = self._take_objptr() + ptr = rustcall(lib.linearindex_select, ptr, selection) + + return RustLinearIndex._from_objptr(ptr) diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index bfd27eabb9..d2f78563cb 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -7,9 +7,13 @@ import os.path from abc import abstractmethod import itertools +from typing import TYPE_CHECKING from sourmash.picklist import SignaturePicklist +if TYPE_CHECKING: + from typing_extensions import Unpack + class BaseCollectionManifest: """ @@ -303,6 +307,7 @@ def _select(self, *, ksize=None, moltype=None, scaled=0, num=0, for row in matching_rows: yield row + #def select_to_manifest(self, **kwargs: Unpack[Selection]): def select_to_manifest(self, **kwargs): "Do a 'select' and return a new CollectionManifest object." new_rows = self._select(**kwargs) @@ -343,3 +348,34 @@ def to_picklist(self): picklist.pickset = set(self._md5_set) return picklist + + @staticmethod + def _from_rust(value): + from ._lowlevel import ffi, lib + from .utils import rustcall, decode_str + + iterator = rustcall(lib.manifest_rows, value) + + rows = [] + next_row = rustcall(lib.manifest_rows_iter_next, iterator) + while next_row != ffi.NULL: + + # TODO: extract row data from next_row + # FIXME: free mem from strings? + row = {} + row['md5'] = decode_str(next_row.md5) + row['md5short'] = row['md5'][:8] + row['ksize'] = next_row.ksize + row['moltype'] = decode_str(next_row.moltype) + row['num'] = 0 #ss.minhash.num + row['scaled'] = 0 #ss.minhash.scaled + row['n_hashes'] = 0 # len(ss.minhash) + row['with_abundance'] = next_row.with_abundance + row['name'] = decode_str(next_row.name) + row['filename'] = "" #ss.filename + row['internal_location'] = decode_str(next_row.internal_location) + rows.append(row) + + next_row = rustcall(lib.manifest_rows_iter_next, iterator) + + return CollectionManifest(rows) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 360ca6165b..1691c31828 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -644,7 +644,7 @@ def downsample(self, *, num=None, scaled=None): # acceptable num value? make sure to set max_hash to 0. max_hash = 0 - + elif scaled is not None: # cannot downsample a num MinHash with scaled if self.num: @@ -904,7 +904,7 @@ def set_abundances(self, values, clear=True): abunds = [] for h, v in values.items(): - hashes.append(h) + hashes.append(h) if v < 0: raise ValueError("Abundance cannot be set to a negative value.") abunds.append(v) @@ -937,9 +937,8 @@ def to_mutable(self): def to_frozen(self): "Return a frozen copy of this MinHash that cannot be changed." - new_mh = self.__copy__() - new_mh.into_frozen() - return new_mh + new_mh_ptr = self._methodcall(lib.kmerminhash_to_frozen) + return FrozenMinHash._from_objptr(new_mh_ptr) def into_frozen(self): "Freeze this MinHash, preventing any changes." @@ -1069,11 +1068,8 @@ def merge(self, *args, **kwargs): def to_mutable(self): "Return a copy of this MinHash that can be changed." - mut = MinHash.__new__(MinHash) - state_tup = self.__getstate__() - - mut.__setstate__(state_tup) - return mut + new_mh_ptr = self._methodcall(lib.kmerminhash_to_mutable) + return MinHash._from_objptr(new_mh_ptr) def to_frozen(self): "Return a frozen copy of this MinHash that cannot be changed." diff --git a/src/sourmash/nodegraph.py b/src/sourmash/nodegraph.py index 8faa2eb874..7986659156 100644 --- a/src/sourmash/nodegraph.py +++ b/src/sourmash/nodegraph.py @@ -5,7 +5,7 @@ from tempfile import NamedTemporaryFile from ._lowlevel import ffi, lib -from .minhash import to_bytes, MinHash +from .minhash import to_bytes, MinHash, FrozenMinHash from .utils import RustObject, rustcall, decode_str from .exceptions import SourmashError @@ -42,8 +42,10 @@ def to_bytes(self, compression=1): def update(self, other): if isinstance(other, Nodegraph): return self._methodcall(lib.nodegraph_update, other._objptr) - elif isinstance(other, MinHash): + elif isinstance(other, FrozenMinHash): return self._methodcall(lib.nodegraph_update_mh, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.nodegraph_update_mh, other.to_frozen()._objptr) else: # FIXME: we could take sets here too (or anything that can be # converted to a list of ints...) @@ -79,12 +81,15 @@ def expected_collisions(self): return self._methodcall(lib.nodegraph_expected_collisions) def matches(self, mh): - if not isinstance(mh, MinHash): + objptr = mh._objptr + if isinstance(mh, MinHash): + objptr = mh.to_frozen()._objptr + elif not isinstance(mh, FrozenMinHash): # FIXME: we could take sets here too (or anything that can be # converted to a list of ints...) - raise ValueError("mh must be a MinHash") + raise ValueError("mh must be a FrozenMinHash") - return self._methodcall(lib.nodegraph_matches, mh._objptr) + return self._methodcall(lib.nodegraph_matches, objptr) def to_khmer_nodegraph(self): import khmer diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index 30d5c84f90..af15df0990 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -252,6 +252,24 @@ def filter(self, it): if self.__contains__(ss): yield ss + def _as_rust(self): + from ._lowlevel import ffi, lib + from .utils import rustcall, decode_str + + ptr = lib.picklist_new() + + rustcall(lib.picklist_set_coltype, ptr, self.coltype.encode('utf-8'), len(self.coltype)) + rustcall(lib.picklist_set_pickfile, ptr, self.pickfile.encode('utf-8'), len(self.pickfile)) + rustcall(lib.picklist_set_column_name, ptr, self.column_name.encode('utf-8'), len(self.column_name)) + rustcall(lib.picklist_set_pickstyle, ptr, self.pickstyle.value) + + #self.preprocess_fn = preprocess[coltype] + #self.pickset = None + #self.found = set() + #self.n_queries = 0 + + return ptr + def passes_all_picklists(ss, picklists): "does the signature 'ss' pass all of the picklists?" diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index a22e782d69..42a4fceaa6 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -130,7 +130,7 @@ def subdir(self, value): self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value)) def _filenames(self): - if self.__inner: + if not self._objptr: return self.__inner._filenames() size = ffi.new("uintptr_t *") @@ -150,7 +150,7 @@ def save(self, path, content, *, overwrite=False, compress=False): raise NotImplementedError() def load(self, path): - if self.__inner: + if not self._objptr: return self.__inner.load(path) try: diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 1fd34d35e6..4077d655ed 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -43,9 +43,9 @@ def __init__(self, minhash, name="", filename=""): @property def minhash(self): - return FrozenMinHash._from_objptr( + return MinHash._from_objptr( self._methodcall(lib.signature_first_mh) - ) + ).to_frozen() @minhash.setter def minhash(self, value): @@ -66,18 +66,6 @@ def __repr__(self): else: # name != md5pref: return "SourmashSignature('{}', {})".format(name, md5pref) - #def minhashes(self): - # size = ffi.new("uintptr_t *") - # mhs_ptr = self._methodcall(lib.signature_get_mhs, size) - # size = ffi.unpack(size, 1)[0] - # - # mhs = [] - # for i in range(size): - # mh = MinHash._from_objptr(mhs_ptr[i]) - # mhs.append(mh) - # - # return mhs - def md5sum(self): "Calculate md5 hash of the bottom sketch, specifically." return decode_str(self.minhash._methodcall(lib.kmerminhash_md5sum)) diff --git a/src/sourmash/utils.py b/src/sourmash/utils.py index 71afc20261..acb4b73d7a 100644 --- a/src/sourmash/utils.py +++ b/src/sourmash/utils.py @@ -29,6 +29,13 @@ def _get_objptr(self): raise RuntimeError("Object is closed") return self._objptr + def _take_objptr(self): + if not self._objptr: + raise RuntimeError("Object is closed") + ret = self._objptr + self._objptr = None + return ret + def __del__(self): if self._objptr is None or self._shared: return diff --git a/tests/test_index.py b/tests/test_index.py index af0c1da890..1067422c5f 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1775,6 +1775,7 @@ def test_lazy_index_wraps_multi_index_location(): lazy2.signatures_with_location()): assert ss_tup == ss_lazy_tup +@pytest.mark.skip("no support for in-memory sigs yet") def test_revindex_index_search(): # confirm that RevIndex works sig2 = utils.get_test_data("2.fa.sig") @@ -1820,6 +1821,7 @@ def test_revindex_index_search(): assert sr[0][1] == ss63 +@pytest.mark.skip("no support for in-memory sigs yet") def test_revindex_gather(): # check that RevIndex.best_containment works. sig2 = utils.get_test_data("2.fa.sig") @@ -1846,6 +1848,7 @@ def test_revindex_gather(): assert match.signature == ss47 +@pytest.mark.skip("no support for in-memory sigs yet") def test_revindex_gather_ignore(): # check that RevIndex gather ignores things properly. sig2 = utils.get_test_data('2.fa.sig') diff --git a/tox.ini b/tox.ini index 589d07b8cb..70d11ea353 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,7 @@ [tox] -envlist = +requires = + tox>=4.2 +env_list = py310, py311, py39, @@ -11,20 +13,19 @@ envlist = hypothesis, khmer, khmer_master -minversion = 3.12 -isolated_build = true +min_version = 4.2 skip_missing_interpreters = true [testenv] +description = run the tests with pytest under {basepython} package = wheel wheel_build_env = .pkg -description = run the tests with pytest under {basepython} -setenv = +set_env = PIP_DISABLE_VERSION_CHECK = 1 COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} VIRTUALENV_NO_DOWNLOAD = 1 PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010 -passenv = +pass_env = TOXENV CURL_CA_BUNDLE http_proxy @@ -36,6 +37,8 @@ passenv = PIP_CACHE_DIR CI PYTHONDEVMODE + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS deps = pip >= 19.3.1 extras = @@ -48,6 +51,11 @@ commands = pytest \ --junitxml {toxworkdir}/junit.{envname}.xml \ {posargs:doc tests} +[testenv:.pkg] +pass_env = + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS + [testenv:pypy3] deps = pip >= 19.3.1 @@ -102,8 +110,8 @@ commands = description = invoke sphinx-build to build the HTML docs basepython = python3.10 extras = doc -whitelist_externals = pandoc -passenv = HOME +allowlist_externals = pandoc +pass_env = HOME changedir = {toxinidir} #commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -W -bhtml {posargs} commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -bhtml {posargs} @@ -125,7 +133,7 @@ commands = pip wheel -w {envtmpdir}/build --no-deps . [testenv:mypy] description = run mypy checker basepython = python3.8 -passenv = {[testenv]passenv} +pass_env = {[testenv]pass_env} # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error PROGRAMDATA deps = mypy @@ -134,7 +142,7 @@ commands = mypy src/sourmash [testenv:fix_lint] description = format the code base to adhere to our styles, and complain about what we cannot do automatically basepython = python3.8 -passenv = {[testenv]passenv} +pass_env = {[testenv]pass_env} # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error PROGRAMDATA PRE_COMMIT_HOME @@ -151,9 +159,9 @@ deps = {[testenv]deps} coverage >= 5.0.1 diff_cover skip_install = True -passenv = {[testenv]passenv} +pass_env = {[testenv]pass_env} DIFF_AGAINST -setenv = COVERAGE_FILE={toxworkdir}/.coverage +set_env = COVERAGE_FILE={toxworkdir}/.coverage commands = coverage combine coverage report -i -m coverage xml -i -o {toxworkdir}/coverage.xml