diff --git a/.ci/install_cargo.sh b/.ci/install_cargo.sh index d0696d7fdf..94ec2b9beb 100755 --- a/.ci/install_cargo.sh +++ b/.ci/install_cargo.sh @@ -2,3 +2,4 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable export PATH="$HOME/.cargo/bin:$PATH" rustc -V +rustup target add aarch64-apple-darwin diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 64b653f0c1..d15fb8a27e 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -20,6 +20,7 @@ jobs: linux-ppc64le, linux-s390x, macos-x86_64, + macos-arm64, ] include: - build: linux-x86_64 @@ -41,7 +42,11 @@ jobs: - build: macos-x86_64 os: macos-latest arch: x86_64 - macos_target: 'MACOSX_DEPLOYMENT_TARGET=10.11' + macos_target: 'MACOSX_DEPLOYMENT_TARGET=10.11 CARGO_BUILD_TARGET=x86_64-apple-darwin' + - build: macos-arm64 + os: macos-latest + arch: arm64 + macos_target: 'MACOSX_DEPLOYMENT_TARGET=11 CARGO_BUILD_TARGET=aarch64-apple-darwin' fail-fast: false steps: @@ -63,12 +68,7 @@ jobs: - name: Build wheels uses: pypa/cibuildwheel@v2.2.2 env: - CIBW_BUILD: "cp39-*" - CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux_ppc64le *-musllinux_s390x" - CIBW_BEFORE_BUILD: 'source .ci/install_cargo.sh' - CIBW_ENVIRONMENT: 'PATH="$HOME/.cargo/bin:$PATH"' CIBW_ENVIRONMENT_MACOS: ${{ matrix.macos_target }} - CIBW_BUILD_VERBOSITY: 3 CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_ARCHS_MACOS: ${{ matrix.arch }} diff --git a/.github/workflows/khmer.yml b/.github/workflows/khmer.yml deleted file mode 100644 index 9385647255..0000000000 --- a/.github/workflows/khmer.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: khmer compatibility tests - -on: - push: - branches: [latest] - pull_request: - schedule: - - cron: "0 0 * * *" # daily - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - - name: Set up Python 3.10 - uses: actions/setup-python@v1 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox - - - name: Run tests with latest released khmer - run: tox -e khmer,coverage - - - name: Run tests with khmer master branch - run: tox -e khmer_master,coverage diff --git a/Cargo.lock b/Cargo.lock index ef043fcddb..d634a4a0ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,24 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "Inflector" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" + [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "assert_matches" version = "1.5.0" @@ -635,6 +647,30 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "ouroboros" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f31a3b678685b150cba82b702dcdc5e155893f63610cf388d30cd988d4ca2bf" +dependencies = [ + "aliasable", + "ouroboros_macro", + "stable_deref_trait", +] + +[[package]] +name = "ouroboros_macro" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084fd65d5dd8b3772edccb5ffd1e4b7eba43897ecd0f9401e330e8c542959408" +dependencies = [ + "Inflector", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "piz" version = "0.4.0" @@ -998,6 +1034,7 @@ dependencies = [ "nohash-hasher", "num-iter", "once_cell", + "ouroboros", "piz", "primal-check", "proptest", @@ -1015,6 +1052,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" diff --git a/Makefile b/Makefile index 6e168e261d..10d8c1ec04 100644 --- a/Makefile +++ b/Makefile @@ -32,6 +32,7 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/nodegraph.rs \ src/core/src/ffi/index/mod.rs \ src/core/src/ffi/index/revindex.rs \ + src/core/src/ffi/storage.rs \ src/core/src/errors.rs cd src/core && \ RUSTC_BOOTSTRAP=1 cbindgen -c cbindgen.toml . 
-o ../../$@ diff --git a/asv.conf.json b/asv.conf.json index c188ad0c04..a036fd2298 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -6,7 +6,6 @@ "branches": ["latest"], "dvcs": "git", "environment_type": "virtualenv", - "pythons": ["3.10"], "env_dir": ".asv/env", "results_dir": ".asv/results", "html_dir": ".asv/html", diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 481cb76b43..375981299e 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -1,6 +1,10 @@ +import os import random +from pathlib import Path +from tempfile import NamedTemporaryFile +from sourmash.sbt_storage import ZipStorage from sourmash.minhash import MinHash @@ -139,3 +143,60 @@ class PeakmemMinAbundanceSuite(PeakmemMinHashSuite): def setup(self): PeakmemMinHashSuite.setup(self) self.mh = MinHash(500, 21, track_abundance=True) + +#################### + +class TimeZipStorageSuite: + + def setup(self): + import zipfile + self.zipfile = NamedTemporaryFile() + + with zipfile.ZipFile(self.zipfile, mode='w', + compression=zipfile.ZIP_STORED) as storage: + for i in range(100_000): + # just so we have lots of entries + storage.writestr(str(i), b"0") + # one big-ish entry + storage.writestr("sig1", b"9" * 1_000_000) + + def time_load_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("sig1") + + def time_load_small_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("99999") + + def teardown(self): + self.zipfile.close() + + +class PeakmemZipStorageSuite: + def setup(self): + import zipfile + self.zipfile = NamedTemporaryFile() + + with zipfile.ZipFile(self.zipfile, mode='w', + compression=zipfile.ZIP_STORED) as storage: + for i in range(100_000): + # just so we have lots of entries + storage.writestr(str(i), b"0") + # one big-ish entry + storage.writestr("sig1", b"9" * 1_000_000) + + + def peakmem_load_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("sig1") + + def peakmem_load_small_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("99999") + + def teardown(self): + self.zipfile.close() diff --git a/doc/environment.yml b/doc/environment.yml index 3864f514bd..f3840b2dce 100644 --- a/doc/environment.yml +++ b/doc/environment.yml @@ -3,4 +3,4 @@ channels: - defaults dependencies: - rust - - python =3.7 + - python =3.8 diff --git a/flake.lock b/flake.lock index f86273ccb8..8e41c96a5e 100644 --- a/flake.lock +++ b/flake.lock @@ -49,11 +49,11 @@ ] }, "locked": { - "lastModified": 1639947939, - "narHash": "sha256-pGsM8haJadVP80GFq4xhnSpNitYNQpaXk4cnA796Cso=", + "lastModified": 1648544490, + "narHash": "sha256-EoBDcccV70tfz2LAs5lK0BjC7en5mzUVlgLsd5E6DW4=", "owner": "nix-community", "repo": "naersk", - "rev": "2fc8ce9d3c025d59fee349c1f80be9785049d653", + "rev": "e30ef9a5ce9b3de8bb438f15829c50f9525ca730", "type": "github" }, "original": { @@ -64,11 +64,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1645937171, - "narHash": "sha256-n9f9GZBNMe8UMhcgmmaXNObkH01jjgp7INMrUgBgcy4=", + "lastModified": 1648219316, + "narHash": "sha256-Ctij+dOi0ZZIfX5eMhgwugfvB+WZSrvVNAyAuANOsnQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "22dc22f8cedc58fcb11afe1acb08e9999e78be9c", + "rev": "30d3d79b7d3607d56546dd2a6b49e156ba0ec634", "type": "github" }, "original": { @@ -150,11 +150,11 @@ ] }, "locked": { - "lastModified": 1645928338, - "narHash": 
"sha256-pNbkG19Nb4QTNRCIWwxv06JKKJNCUrDzgRrriEd7W1A=", + "lastModified": 1648866882, + "narHash": "sha256-yMs/RKA9pX47a03zupmQp8/RLRmKzdSDk+h5Yt7K9xU=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "4f6e6588b07427cd8ddc99b664bf0fab02799804", + "rev": "7c90e17cd7c0b9e81d5b23f78b482088ac9961d1", "type": "github" }, "original": { @@ -165,11 +165,11 @@ }, "utils": { "locked": { - "lastModified": 1644229661, - "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=", + "lastModified": 1648297722, + "narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=", "owner": "numtide", "repo": "flake-utils", - "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797", + "rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 9ec1cd8300..ba080af204 100644 --- a/flake.nix +++ b/flake.nix @@ -77,6 +77,20 @@ DYLD_LIBRARY_PATH = "${self.packages.${system}.lib}/lib"; NO_BUILD = "1"; }; + docker = + let + bin = self.defaultPackage.${system}; + in + pkgs.dockerTools.buildLayeredImage { + name = bin.pname; + tag = bin.version; + contents = [ bin ]; + + config = { + Cmd = [ "/bin/sourmash" ]; + WorkingDir = "/"; + }; + }; }; defaultPackage = self.packages.${system}.sourmash; diff --git a/include/sourmash.h b/include/sourmash.h index de422efcf5..1f116e681b 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -58,6 +58,8 @@ typedef struct SourmashSearchResult SourmashSearchResult; typedef struct SourmashSignature SourmashSignature; +typedef struct SourmashZipStorage SourmashZipStorage; + /** * Represents a string. */ @@ -456,4 +458,23 @@ SourmashStr sourmash_str_from_cstr(const char *s); char sourmash_translate_codon(const char *codon); +SourmashStr **zipstorage_filenames(const SourmashZipStorage *ptr, uintptr_t *size); + +void zipstorage_free(SourmashZipStorage *ptr); + +SourmashStr **zipstorage_list_sbts(const SourmashZipStorage *ptr, uintptr_t *size); + +const uint8_t *zipstorage_load(const SourmashZipStorage *ptr, + const char *path_ptr, + uintptr_t insize, + uintptr_t *size); + +SourmashZipStorage *zipstorage_new(const char *ptr, uintptr_t insize); + +SourmashStr zipstorage_path(const SourmashZipStorage *ptr); + +void zipstorage_set_subdir(SourmashZipStorage *ptr, const char *path_ptr, uintptr_t insize); + +SourmashStr zipstorage_subdir(const SourmashZipStorage *ptr); + #endif /* SOURMASH_H_INCLUDED */ diff --git a/pyproject.toml b/pyproject.toml index eafbab8578..abe31ec81e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,3 +19,10 @@ include_trailing_comma = true force_grid_wrap = 0 line_length = 88 known_first_party = ["sourmash"] + +[tool.cibuildwheel] +build = "cp39-*" +skip = "*-win32 *-manylinux_i686 *-musllinux_ppc64le *-musllinux_s390x" +before-build = "source .ci/install_cargo.sh" +environment = { PATH="$HOME/.cargo/bin:$PATH" } +build-verbosity = 3 diff --git a/setup.cfg b/setup.cfg index bd3aeea86d..c959dc4949 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,8 +20,9 @@ classifiers = Operating System :: POSIX :: Linux Operating System :: MacOS :: MacOS X Programming Language :: Rust - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Topic :: Scientific/Engineering :: Bio-Informatics project_urls = Documentation = https://sourmash.readthedocs.io @@ -42,7 +43,7 @@ install_requires = scipy deprecation>=2.0.6 cachetools>=4,<5 -python_requires = >=3.7 +python_requires = >=3.8 [bdist_wheel] universal = 1 
diff --git a/setup.py b/setup.py index 9384aba4b4..e31e09fd1f 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ NO_BUILD = os.environ.get("NO_BUILD") == "1" -def find_dylib(name, paths): +def find_dylib_no_build(name, paths): to_find = None if sys.platform == 'darwin': to_find = f'lib{name}.dylib' @@ -25,6 +25,15 @@ def find_dylib(name, paths): raise LookupError('dylib %r not found' % name) +def find_dylib(build, target): + cargo_target = os.environ.get("CARGO_BUILD_TARGET") + if cargo_target: + in_path = "target/%s/%s" % (cargo_target, target) + else: + in_path = "target/%s" % target + return build.find_dylib("sourmash", in_path=in_path) + + def build_native(spec): cmd = ["cargo", "build", "--manifest-path", "src/core/Cargo.toml", @@ -41,7 +50,7 @@ def build_native(spec): header_filename = lambda: "include/sourmash.h" else: build = spec.add_external_build(cmd=cmd, path=".") - dylib=lambda: build.find_dylib("sourmash", in_path="target/%s" % target) + dylib = lambda: find_dylib(build, target) header_filename=lambda: build.find_header("sourmash.h", in_path="include") rtld_flags = ["NOW"] diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 736e506c20..43ce78c0c8 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -47,6 +47,7 @@ twox-hash = "1.6.0" vec-collections = "0.3.4" piz = "0.4.0" memmap2 = "0.5.0" +ouroboros = "0.15.0" [dev-dependencies] assert_matches = "1.3.0" diff --git a/src/core/benches/compute.rs b/src/core/benches/compute.rs index 9083bd910c..3e1bee87d1 100644 --- a/src/core/benches/compute.rs +++ b/src/core/benches/compute.rs @@ -76,5 +76,75 @@ fn add_sequence(c: &mut Criterion) { }); } -criterion_group!(compute, add_sequence); +fn add_sequence_protein(c: &mut Criterion) { + let mut cp = ComputeParameters::default(); + cp.set_protein(true); + cp.set_dna(false); + cp.set_scaled(200); + cp.set_ksizes(vec![30]); + let template_sig = Signature::from_params(&cp); + + let mut data: Vec<u8> = vec![]; + let (mut f, _) = niffler::from_path("../../tests/test-data/genome-s10.fa.gz").unwrap(); + let _ = f.read_to_end(&mut data); + + let data = data.repeat(10); + + let data_upper = data.to_ascii_uppercase(); + let data_lower = data.to_ascii_lowercase(); + let data_errors: Vec<u8> = data + .iter() + .enumerate() + .map(|(i, x)| if i % 89 == 1 { b'N' } else { *x }) + .collect(); + + let mut group = c.benchmark_group("add_sequence_protein"); + group.sample_size(10); + + group.bench_function("valid", |b| { + b.iter(|| { + let fasta_data = Cursor::new(data_upper.clone()); + let mut sig = template_sig.clone(); + let mut parser = parse_fastx_reader(fasta_data).unwrap(); + while let Some(rec) = parser.next() { + sig.add_protein(&rec.unwrap().seq()).unwrap(); + } + }); + }); + + group.bench_function("lowercase", |b| { + b.iter(|| { + let fasta_data = Cursor::new(data_lower.clone()); + let mut sig = template_sig.clone(); + let mut parser = parse_fastx_reader(fasta_data).unwrap(); + while let Some(rec) = parser.next() { + sig.add_protein(&rec.unwrap().seq()).unwrap(); + } + }); + }); + + group.bench_function("invalid kmers", |b| { + b.iter(|| { + let fasta_data = Cursor::new(data_errors.clone()); + let mut sig = template_sig.clone(); + let mut parser = parse_fastx_reader(fasta_data).unwrap(); + while let Some(rec) = parser.next() { + sig.add_protein(&rec.unwrap().seq()).unwrap(); + } + }); + }); + + group.bench_function("force with valid kmers", |b| { + b.iter(|| { + let fasta_data = Cursor::new(data_upper.clone()); + let mut sig = template_sig.clone(); + let mut parser = 
parse_fastx_reader(fasta_data).unwrap(); + while let Some(rec) = parser.next() { + sig.add_protein(&rec.unwrap().seq()).unwrap(); + } + }); + }); +} + +criterion_group!(compute, add_sequence, add_sequence_protein); criterion_main!(compute); diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index e9f276d5e2..a67de37176 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -12,6 +12,7 @@ pub mod index; pub mod minhash; pub mod nodegraph; pub mod signature; +pub mod storage; use std::ffi::CStr; use std::os::raw::c_char; diff --git a/src/core/src/ffi/storage.rs b/src/core/src/ffi/storage.rs new file mode 100644 index 0000000000..98eca095b2 --- /dev/null +++ b/src/core/src/ffi/storage.rs @@ -0,0 +1,140 @@ +use std::os::raw::c_char; +use std::slice; + +use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::prelude::*; +use crate::storage::ZipStorage; + +pub struct SourmashZipStorage; + +impl ForeignObject for SourmashZipStorage { + type RustObject = ZipStorage; +} + +ffi_fn! { +unsafe fn zipstorage_new(ptr: *const c_char, insize: usize) -> Result<*mut SourmashZipStorage> { + let path = { + assert!(!ptr.is_null()); + let path = slice::from_raw_parts(ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + let zipstorage = ZipStorage::from_file(path)?; + + Ok(SourmashZipStorage::from_rust(zipstorage)) +} +} + +#[no_mangle] +pub unsafe extern "C" fn zipstorage_free(ptr: *mut SourmashZipStorage) { + SourmashZipStorage::drop(ptr); +} + +ffi_fn! { +unsafe fn zipstorage_load(ptr: *const SourmashZipStorage, + path_ptr: *const c_char, + insize: usize, + size: *mut usize) -> Result<*const u8> { + + let storage = SourmashZipStorage::as_rust(ptr); + + let path = { + assert!(!path_ptr.is_null()); + let path = slice::from_raw_parts(path_ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + + let buffer = storage.load(path)?; + + let b = buffer.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *const u8) +} +} + +ffi_fn! { +unsafe fn zipstorage_list_sbts( + ptr: *const SourmashZipStorage, + size: *mut usize, +) -> Result<*mut *mut SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + let sbts = storage.list_sbts()?; + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*mut SourmashStr> = sbts + .into_iter() + .map(|x| Box::into_raw(Box::new(SourmashStr::from_string(x))) as *mut SourmashStr) + .collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut SourmashStr) +} +} + +ffi_fn! { +unsafe fn zipstorage_filenames( + ptr: *const SourmashZipStorage, + size: *mut usize, +) -> Result<*mut *mut SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + let files = storage.filenames()?; + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*mut SourmashStr> = files + .into_iter() + .map(|x| Box::into_raw(Box::new(SourmashStr::from_string(x))) as *mut SourmashStr) + .collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut SourmashStr) +} +} + +ffi_fn! { +unsafe fn zipstorage_set_subdir( + ptr: *mut SourmashZipStorage, + path_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let storage = SourmashZipStorage::as_rust_mut(ptr); + + let path = { + assert!(!path_ptr.is_null()); + let path = slice::from_raw_parts(path_ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + + storage.set_subdir(path.to_string()); + Ok(()) +} +} + +ffi_fn! 
{ +unsafe fn zipstorage_path(ptr: *const SourmashZipStorage) -> Result<SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + if let Some(ref path) = storage.path() { + Ok(path.clone().into()) + } else { + Ok("".into()) + } +} +} + +ffi_fn! { +unsafe fn zipstorage_subdir(ptr: *const SourmashZipStorage) -> Result<SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + if let Some(ref path) = storage.subdir() { + Ok(path.clone().into()) + } else { + Ok("".into()) + } +} +} diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 3121537130..0c6a2676bf 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -179,6 +179,7 @@ pub struct SeqToHashes { prot_configured: bool, aa_seq: Vec<u8>, + translate_iter_step: usize, } impl SeqToHashes { @@ -222,17 +223,26 @@ impl SeqToHashes { dna_last_position_check: 0, prot_configured: false, aa_seq: Vec::new(), + translate_iter_step: 0, } } } +/* +Iterator that returns a kmer hash for all modes except translate. +In translate mode: + - all the frames are processed at once and converted to hashes. + - all the hashes are stored in `hashes_buffer`. + - after processing all the kmers, `translate_iter_step` is incremented + per iteration to iterate over all the indices of the `hashes_buffer`. + - the iterator is exhausted once `translate_iter_step` == `hashes_buffer.len()`. +More info: https://github.com/sourmash-bio/sourmash/pull/1946 +*/ + impl Iterator for SeqToHashes { type Item = Result<u64, Error>; fn next(&mut self) -> Option<Self::Item> { - // TODO: Remove the hashes buffer - // Priority for flushing the hashes buffer - if (self.kmer_index < self.max_index) || !self.hashes_buffer.is_empty() { // Processing DNA or Translated DNA if !self.is_protein { @@ -290,18 +300,17 @@ let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); self.kmer_index += 1; Some(Ok(hash)) - } else if self.hashes_buffer.is_empty() { + } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { // Processing protein by translating DNA - // TODO: make it a real iterator not a buffer + // TODO: Implement iterator over frames instead of hashes_buffer. 
- // Three frames - for i in 0..3 { + for frame_number in 0..3 { let substr: Vec<u8> = self .sequence .iter() .cloned() - .skip(i) - .take(self.sequence.len() - i) + .skip(frame_number) + .take(self.sequence.len() - frame_number) .collect(); let aa = to_aa( @@ -320,8 +329,8 @@ .dna_rc .iter() .cloned() - .skip(i) - .take(self.dna_rc.len() - i) + .skip(frame_number) + .take(self.dna_rc.len() - frame_number) .collect(); let aa_rc = to_aa( &rc_substr, @@ -335,11 +344,16 @@ self.hashes_buffer.push(hash); }); } - self.kmer_index = self.max_index; - Some(Ok(self.hashes_buffer.remove(0))) + Some(Ok(0)) } else { - let first_element: u64 = self.hashes_buffer.remove(0); - Some(Ok(first_element)) + if self.translate_iter_step == self.hashes_buffer.len() { + self.hashes_buffer.clear(); + self.kmer_index = self.max_index; + return Some(Ok(0)); + } + let curr_idx = self.translate_iter_step; + self.translate_iter_step += 1; + Some(Ok(self.hashes_buffer[curr_idx])) } } else { // Processing protein diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 598f47c7b6..a2269bd6e7 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -1,6 +1,8 @@ +use std::collections::BTreeMap; +use std::ffi::OsStr; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; @@ -46,6 +48,12 @@ impl Storage for InnerStorage { pub enum StorageError { #[error("Path can't be empty")] EmptyPathError, + + #[error("Path not found: {0}")] + PathNotFoundError(String), + + #[error("Error reading data from {0}")] + DataReadError(String), } #[derive(Serialize, Deserialize)] @@ -155,48 +163,80 @@ impl Storage for FSStorage { } } -pub struct ZipStorage<'a> { +#[ouroboros::self_referencing] +pub struct ZipStorage { mapping: Option<memmap2::Mmap>, - archive: Option<piz::ZipArchive<'a>>, - //metadata: piz::read::DirectoryContents<'a>, -} -fn load_from_archive<'a>(archive: &'a piz::ZipArchive<'a>, path: &str) -> Result<Vec<u8>, Error> { - use piz::read::FileTree; + #[borrows(mapping)] + #[covariant] + archive: piz::ZipArchive<'this>, - // FIXME error - let tree = piz::read::as_tree(archive.entries()).map_err(|_| StorageError::EmptyPathError)?; - // FIXME error - let entry = tree - .lookup(path) - .map_err(|_| StorageError::EmptyPathError)?; + subdir: Option<String>, + path: Option<String>, - // FIXME error - let mut reader = BufReader::new( - archive - .read(entry) - .map_err(|_| StorageError::EmptyPathError)?, - ); - let mut contents = Vec::new(); - reader.read_to_end(&mut contents)?; + #[borrows(archive)] + #[covariant] + metadata: Metadata<'this>, +} - Ok(contents) +pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; + +fn lookup<'a, P: AsRef<Path>>( + metadata: &'a Metadata, + path: P, +) -> Result<&'a piz::read::FileMetadata<'a>, Error> { + let path = path.as_ref(); + metadata + .get(&path.as_os_str()) + .ok_or_else(|| StorageError::PathNotFoundError(path.to_str().unwrap().into()).into()) + .map(|entry| *entry) +} + +fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result<Option<String>, Error> { + let subdirs: Vec<_> = archive + .entries() + .iter() + .filter(|entry| entry.is_dir()) + .collect(); + if subdirs.len() == 1 { + Ok(Some( + subdirs[0] + .path + .to_str() + .expect("Error converting path") + .into(), + )) + } else { + Ok(None) + } } -impl<'a> Storage for ZipStorage<'a> { +impl Storage for ZipStorage { fn save(&self, _path: &str, _content: 
&[u8]) -> Result { unimplemented!(); } fn load(&self, path: &str) -> Result, Error> { - if let Some(archive) = &self.archive { - load_from_archive(archive, path) - } else { - //FIXME - let archive = piz::ZipArchive::new((&self.mapping.as_ref()).unwrap()) - .map_err(|_| StorageError::EmptyPathError)?; - load_from_archive(&archive, path) - } + let metadata = self.borrow_metadata(); + + let entry = lookup(metadata, path).or_else(|_| { + if let Some(subdir) = self.borrow_subdir() { + lookup(metadata, subdir.to_owned() + path) + .map_err(|_| StorageError::PathNotFoundError(path.into())) + } else { + Err(StorageError::PathNotFoundError(path.into())) + } + })?; + + let mut reader = BufReader::new( + self.borrow_archive() + .read(entry) + .map_err(|_| StorageError::DataReadError(path.into()))?, + ); + let mut contents = Vec::new(); + reader.read_to_end(&mut contents)?; + + Ok(contents) } fn args(&self) -> StorageArgs { @@ -204,40 +244,68 @@ impl<'a> Storage for ZipStorage<'a> { } } -impl<'a> ZipStorage<'a> { - pub fn new(location: &str) -> Result { +impl ZipStorage { + pub fn from_file(location: &str) -> Result { let zip_file = File::open(location)?; let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; - //FIXME - //let archive = piz::ZipArchive::new(&mapping).map_err(|_| StorageError::EmptyPathError)?; + let mut storage = ZipStorageBuilder { + mapping: Some(mapping), + archive_builder: |mapping: &Option| { + piz::ZipArchive::new(mapping.as_ref().unwrap()).unwrap() + }, + metadata_builder: |archive: &piz::ZipArchive| { + archive + .entries() + .iter() + .map(|entry| (entry.path.as_os_str(), entry)) + .collect() + }, + subdir: None, + path: Some(location.to_owned()), + } + .build(); - //FIXME - // let tree = - // piz::read::as_tree(archive.entries()).map_err(|_| StorageError::EmptyPathError)?; + let subdir = find_subdirs(storage.borrow_archive())?; + storage.with_mut(|fields| *fields.subdir = subdir); - Ok(Self { - mapping: Some(mapping), - archive: None, - //metadata: tree, - }) - } - - pub fn from_slice(mapping: &'a [u8]) -> Result { - //FIXME - let archive = piz::ZipArchive::new(mapping).map_err(|_| StorageError::EmptyPathError)?; - - //FIXME - //let entries: Vec<_> = archive.entries().iter().map(|x| x.to_owned()).collect(); - //let tree = - // piz::read::as_tree(entries.as_slice()).map_err(|_| StorageError::EmptyPathError)?; - - Ok(Self { - archive: Some(archive), - mapping: None, - /* metadata: archive - .as_tree() - .map_err(|_| StorageError::EmptyPathError)?, */ - }) + Ok(storage) + } + + pub fn path(&self) -> Option { + self.borrow_path().clone() + } + + pub fn subdir(&self) -> Option { + self.borrow_subdir().clone() + } + + pub fn set_subdir(&mut self, path: String) { + self.with_mut(|fields| *fields.subdir = Some(path)) + } + + pub fn list_sbts(&self) -> Result, Error> { + Ok(self + .borrow_archive() + .entries() + .iter() + .filter_map(|entry| { + let path = entry.path.to_str().expect("Error converting path"); + if path.ends_with(".sbt.json") { + Some(path.into()) + } else { + None + } + }) + .collect()) + } + + pub fn filenames(&self) -> Result, Error> { + Ok(self + .borrow_archive() + .entries() + .iter() + .map(|entry| entry.path.to_str().expect("Error converting path").into()) + .collect()) } } diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 202ddd3fe3..5a60e02fcc 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -1,4 +1,3 @@ -use std::fs::File; use std::path::PathBuf; use sourmash::storage::{Storage, ZipStorage}; @@ -8,7 +7,7 @@ 
fn zipstorage_load_file() -> Result<(), Box<dyn std::error::Error>> { let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("../../tests/test-data/v6.sbt.zip"); - let zs = ZipStorage::new(filename.to_str().unwrap())?; + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; let data = zs.load("v6.sbt.json")?; @@ -19,19 +18,27 @@ fn zipstorage_load_file() -> Result<(), Box<dyn std::error::Error>> { } #[test] -fn zipstorage_load_slice() -> Result<(), Box<dyn std::error::Error>> { +fn zipstorage_load_manifest() -> Result<(), Box<dyn std::error::Error>> { let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v6.sbt.zip"); + filename.push("../../tests/test-data/prot/protein.sbt.zip"); - let zip_file = File::open(filename)?; - let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; - let zs = ZipStorage::from_slice(&mapping)?; + let _data = zs.load("protein.manifest.csv").expect("error loading file"); - let data = zs.load("v6.sbt.json")?; + Ok(()) +} - let description: serde_json::Value = serde_json::from_slice(&data[..])?; - assert_eq!(description["version"], 6); +#[test] +fn zipstorage_list_sbts() -> Result<(), Box<dyn std::error::Error>> { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/v6.sbt.zip"); + + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; + + let sbts = zs.list_sbts()?; + + assert_eq!(sbts.len(), 1); Ok(()) } diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 2d3941918c..2063fd09de 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -17,20 +17,20 @@ def add_moltype_args(parser): parser.add_argument( '--dayhoff', dest='dayhoff', action='store_true', - help='build Dayhoff-encoded amino acid signatures' + help='choose Dayhoff-encoded amino acid signatures' ) parser.add_argument( '--no-dayhoff', dest='dayhoff', action='store_false', - help='do not build Dayhoff-encoded amino acid signatures') + help='do not choose Dayhoff-encoded amino acid signatures') parser.set_defaults(dayhoff=False) parser.add_argument( '--hp', '--hydrophobic-polar', dest='hp', action='store_true', - help='build hydrophobic-polar-encoded amino acid signatures' + help='choose hydrophobic-polar-encoded amino acid signatures' ) parser.add_argument( '--no-hp', '--no-hydrophobic-polar', dest='hp', action='store_false', - help='do not build hydrophobic-polar-encoded amino acid signatures') + help='do not choose hydrophobic-polar-encoded amino acid signatures') parser.set_defaults(hp=False) parser.add_argument( diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index be0d87db00..b028664b53 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -137,10 +137,15 @@ def __init__(self, args): def __call__(self): args = self.args - params = ComputeParameters(args.ksizes, args.seed, args.protein, - args.dayhoff, args.hp, args.dna, - args.num_hashes, - args.track_abundance, args.scaled) + params = ComputeParameters(ksizes=args.ksizes, + seed=args.seed, + protein=args.protein, + dayhoff=args.dayhoff, + hp=args.hp, + dna=args.dna, + num_hashes=args.num_hashes, + track_abundance=args.track_abundance, + scaled=args.scaled) sig = SourmashSignature.from_params(params) return [sig] @@ -326,7 +331,17 @@ def save_sigs_to_location(siglist, save_sig): class ComputeParameters(RustObject): __dealloc_func__ = lib.computeparams_free - def __init__(self, ksizes, seed, protein, dayhoff, hp, dna, num_hashes, track_abundance, scaled): + def 
__init__(self, + *, + ksizes=(21, 31, 51), + seed=42, + protein=False, + dayhoff=False, + hp=False, + dna=True, + num_hashes=500, + track_abundance=False, + scaled=0): self._objptr = lib.computeparams_new() self.seed = seed @@ -359,8 +374,15 @@ def from_manifest_row(cls, row): else: ksize = row['ksize'] * 3 - p = cls([ksize], DEFAULT_MMHASH_SEED, is_protein, is_dayhoff, is_hp, is_dna, - row['num'], row['with_abundance'], row['scaled']) + p = cls(ksizes=[ksize], + seed=DEFAULT_MMHASH_SEED, + protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + dna=is_dna, + num_hashes=row['num'], + track_abundance=row['with_abundance'], + scaled=row['scaled']) return p @@ -405,7 +427,7 @@ def to_param_str(self): return ",".join(pi) def __repr__(self): - return f"ComputeParameters({self.ksizes}, {self.seed}, {self.protein}, {self.dayhoff}, {self.hp}, {self.dna}, {self.num_hashes}, {self.track_abundance}, {self.scaled})" + return f"ComputeParameters(ksizes={self.ksizes}, seed={self.seed}, protein={self.protein}, dayhoff={self.dayhoff}, hp={self.hp}, dna={self.dna}, num_hashes={self.num_hashes}, track_abundance={self.track_abundance}, scaled={self.scaled})" def __eq__(self, other): return (self.ksizes == other.ksizes and diff --git a/src/sourmash/command_sketch.py b/src/sourmash/command_sketch.py index dd02dcb8e2..db8aa5a87f 100644 --- a/src/sourmash/command_sketch.py +++ b/src/sourmash/command_sketch.py @@ -146,16 +146,17 @@ def get_compute_params(self, *, split_ksizes=False): if self.mult_ksize_by_3 and not def_dna: ksizes = [ k*3 for k in ksizes ] - make_param = lambda ksizes: ComputeParameters(ksizes, - params_d.get('seed', def_seed), - def_protein, - def_dayhoff, - def_hp, - def_dna, - params_d.get('num', def_num), - params_d.get('track_abundance', - def_abund), - params_d.get('scaled', def_scaled)) + make_param = lambda ksizes: ComputeParameters( + ksizes=ksizes, + seed=params_d.get('seed', def_seed), + protein=def_protein, + dayhoff=def_dayhoff, + hp=def_hp, + dna=def_dna, + num_hashes=params_d.get('num', def_num), + track_abundance=params_d.get('track_abundance', + def_abund), + scaled=params_d.get('scaled', def_scaled)) if split_ksizes: for ksize in ksizes: diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 5a4747615f..08ae99c771 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -501,12 +501,16 @@ def search(args): # do the actual search if query.minhash.track_abundance: - results = search_databases_with_abund_query(query, databases, - threshold=args.threshold, - do_containment=args.containment, - do_max_containment=args.max_containment, - best_only=args.best_only, - unload_data=True) + try: + results = search_databases_with_abund_query(query, databases, + threshold=args.threshold, + do_containment=args.containment, + do_max_containment=args.max_containment, + best_only=args.best_only, + unload_data=True) + except TypeError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) else: results = search_databases_with_flat_query(query, databases, threshold=args.threshold, diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py deleted file mode 100644 index a80952d343..0000000000 --- a/src/sourmash/distance_utils.py +++ /dev/null @@ -1,286 +0,0 @@ -""" -Utility functions for jaccard/containment --> distance estimates -From https://github.com/KoslickiLab/mutation-rate-ci-calculator -https://doi.org/10.1101/2022.01.11.475870 -""" -from scipy.optimize import brentq -from scipy.stats import norm as scipy_norm -from scipy.special import hyp2f1 
-from numpy import sqrt -from math import log, exp - -from .logging import notify - - -#FROM mrcc.kmer_mutation_formulas_thm5 -def r1_to_q(k,r1): - r1 = float(r1) - q = 1-(1-r1)**k - return float(q) - - -def var_n_mutated(L,k,r1,q=None): - # there are computational issues in the variance formula that we solve here - # by the use of higher-precision arithmetic; the problem occurs when r is - # very small; for example, with L=10,k=2,r1=1e-6 standard precision - # gives varN<0 which is nonsense; by using the mpf type, we get the correct - # answer which is about 0.000038. - if (r1 == 0): return 0.0 - r1 = float(r1) - if (q == None): # we assume that if q is provided, it is correct for r1 - q = r1_to_q(k,r1) - varN = L*(1-q)*(q*(2*k+(2/r1)-1)-2*k) \ - + k*(k-1)*(1-q)**2 \ - + (2*(1-q)/(r1**2))*((1+(k-1)*(1-q))*r1-q) - if (varN<0.0): # this seems to happen only with super tiny test data - raise ValueError('Error: varN <0.0!') - return float(varN) - - -def exp_n_mutated(L,k,r1): - q = r1_to_q(k,r1) - return L*q - - -# FROM mrcc.third_moment_calculator -def exp_n_mutated_squared(L, k, p): - return var_n_mutated(L, k, p) + exp_n_mutated(L, k, p) ** 2 - - -def third_moment_nmut_exact(L,k,p): - t1 = (L * (-2 + 3*L) * p**2 + 3 * (1 - p)**(2*k) * (2 + (-1 + k - L) * p * (2 + k * p - L * p)) - (1 - p)**k * (6 + p * (-6 + L * (-6 + p + 6 * L * p))))/(p**2) - t2 = (-2 + 2 * k - L) * (-1 + 2 * k - L) * (2 * k - L) * (-1 + (1 - p)**k)**3 - t3 = (1/(p**3))*(-6 * (-1 + k)**2 * (k - L) * p**3 + 6 * (1 - p)**(3 * k) * (2 + (-2 + 2 * k - L) * p) + (1 - p)**(2 * k) * (-12 + 6 * (2 * k + L) * p + 6 * (4 * k**2 + 2 * (1 + L) - 3 * k * (2 + L)) * p**2 - (-1 + k) * k * (-2 + 4 * k - 3 * L) * p**3) + 6 * (-1 + k) * (1 - p)**k * p * (-2 + p * (2 - k + 2 * L + (k * (-2 + 3 * k - 3 * L) + L) * p))) - t4 = 6 * (-1 + (1 - p)**k) * ((k + k**2 - 2 * k * L + (-1 + L) * L) * (-1 + 2 * (1 - p)**k) * hyp2f1(1, 2 + k - L, k - L, 1) + (k + k**2 - 2 * k * L + (-1 + L) * L) * (1 - p)**k * (-1 + p) * hyp2f1(1, 2 + k - L, k - L, 1 - p) - (-2 * k + 4 * k**2 + L - 4 * k * L + L**2) * ((-1 + 2 * (1 - p)**k) * hyp2f1(1, 1 + 2 * k - L, -1 + 2 * k - L, 1)- (-1 + p)**(2 * k) * hyp2f1(1, 1 + 2 * k - L, -1 + 2 * k - L, 1 - p))) - return t1+t2+t3+t4 - - -def exp_n_mutated_cubed(L, k, p): - return third_moment_nmut_exact(L, k, p) - - -#from mrcc.p_from_scaled_containment -def probit(p): - return scipy_norm.ppf(p) - - -def sequence_len_to_n_kmers(sequence_len_bp, ksize): - n_kmers = sequence_len_bp - (ksize-1) - return n_kmers - - -def distance_to_identity(dist,d_low=None,d_high=None): - """ - ANI = 1-distance - """ - if not 0 <= dist <= 1: - raise ValueError(f"Error: distance value {dist} is not between 0 and 1!") - ident = 1-dist - result = ident - id_low,id_high=None,None - if d_low is not None and d_high is not None: - if (0<=d_low<=1) and (0<=d_high<=1): - id_high = 1-d_low - id_low = 1-d_high - result = [ident, id_low, id_high] - return result - - -def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction): - '''helper function - Note that scaled here needs to be between 0 and 1 - (e.g. 
scaled 1000 --> scaled_fraction 0.001) - ''' - exp_nmut = exp_n_mutated(n_unique_kmers, ksize, mutation_rate) - try: - return (n_unique_kmers - exp_nmut) * log(1.0 - scaled_fraction) - except: - return float('-inf') - - -def get_exp_probability_nothing_common(mutation_rate, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None): - """ - Given parameters, calculate the expected probability that nothing will be common - between a fracminhash sketch of a original sequence and a fracminhash sketch of a mutated - sequence. If this is above a threshold, we should suspect that the two sketches may have - nothing in common. The threshold needs to be set with proper insights. - - Arguments: n_unique_kmers, ksize, mutation_rate, scaled - Returns: float - expected likelihood that nothing is common between sketches - """ - # NTP note: do we have any checks for ksize >=1 in sourmash_args? The rest should be taken care of. - # assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1 - if sequence_len_bp and not n_unique_kmers: - n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) - f_scaled = 1.0 / float(scaled) - if mutation_rate == 1.0: - return 1.0 - return exp( get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, f_scaled) ) - - -def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, return_ci = False, prob_threshold = 10.0**(-3)): - """ - Containment --> distance CI (one step) - """ - sol1,sol2,point_estimate=None,None,None - if sequence_len_bp and not n_unique_kmers: - n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) - if containment <= 0.0001: # changed from 0.0, to mirror jaccard_to_distance_CI_one_step - sol2 = sol1 = point_estimate = 1.0 - elif containment >= 0.9999: # changed from 1.0, to mirror jaccard_to_distance_CI_one_step - sol1 = sol2 = point_estimate = 0.0 - else: - point_estimate = 1.0-containment**(1.0/ksize) - if return_ci: - try: - alpha = 1 - confidence - z_alpha = probit(1-alpha/2) - f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 - - bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers - - term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2) - term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) - term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2) - - var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) - - f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment - f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment - - sol1 = brentq(f1, 0.0000001, 0.9999999) - sol2 = brentq(f2, 0.0000001, 0.9999999) - - except ValueError as exc: - # afaict, this only happens with extremely small test data - notify("WARNING: Cannot estimate ANI from containment. Do your sketches contain enough hashes?") - notify(str(exc)) - return None, None, None - - # Do this here, so that we don't need to reconvert distance <--> identity later. 
- prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers) - if prob_nothing_in_common >= prob_threshold: - # keep count, suggest mod decrease scaled, maybe decrease ksize - notify('WARNING: These sketches may have no hashes in common based on chance alone.') - - if return_identity: - point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) - - if return_ci: - return point_estimate, sol2, sol1, prob_nothing_in_common - - return point_estimate, prob_nothing_in_common - - -#from from mrcc.p_from_scaled_jaccard -def variance_scaled_jaccard(L, p, k, s): - exp_n_mut = exp_n_mutated(L, k, p) - exp_n_mut_squared = exp_n_mutated_squared(L, k, p) - exp_n_mut_cubed = exp_n_mutated_cubed(L, k, p) - bias_factor = 1 - (1 - s) ** ( int(L + exp_n_mut) ) - - factor1 = (1-s)/(s * bias_factor**2) - factor2 = (2 * L * exp_n_mut - 2 * exp_n_mut_squared) / (L ** 3 + 3*L*exp_n_mut_squared + 3*L*L*exp_n_mut + exp_n_mut_cubed) - term1 = factor1 * factor2 - term2 = (L**2 - 2 * L * exp_n_mut + exp_n_mut_squared) / (L**2 + 2 * L * exp_n_mut + exp_n_mut_squared) - term3 = ((L - exp_n_mut) / (L + exp_n_mut))**2 - - return term1 + term2 - term3 - - -def jaccard_to_distance_orig(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, return_ci=False): - """ - Scaled Jaccard to distance estimate and CI (one step) - """ - if sequence_len_bp and not n_unique_kmers: - n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) - if jaccard <= 0.0001: - sol2 = sol1 = point_estimate = 1.0 - elif jaccard >= 0.9999: - sol1 = sol2 = point_estimate = 0.0 - else: - point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) - if return_ci: - try: - alpha = 1 - confidence - z_alpha = probit(1-alpha/2) - f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 - - var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled) - - f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard - f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard - - sol1 = brentq(f1, 0.0000001, 0.9999999) - sol2 = brentq(f2, 0.0000001, 0.9999999) - - except ValueError as exc: - # afaict, this only happens with extremely small test data - notify("WARNING: Cannot estimate ANI from jaccard. Do your sketches contain enough hashes?") - notify(str(exc)) - return None, None, None - - # Do this here, so that we don't need to reconvert distance <--> identity later. - prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers) - if prob_nothing_in_common >= prob_threshold: - notify('WARNING: These sketches may have no hashes in common based on chance alone.') - - if return_identity: - point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) - - if return_ci: - return point_estimate, sol2, sol1, prob_nothing_in_common - - return point_estimate, prob_nothing_in_common - - -def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)): - """ - Given parameters, calculate point estimate for mutation rate from jaccard index. - First checks if parameters are valid (checks are not exhaustive). Then uses formulas - derived mathematically to compute the point estimate. The formula uses approximations, - therefore a tiny error is associated with it. 
A lower bound of that error is also returned. - A high error indicates that the point estimate cannot be trusted. Threshold of the error - is open to interpretation, but suggested that > 10^-4 should be handled with caution. - - Note that the error is NOT a mutation rate, and therefore cannot be considered in - something like mut.rate +/- error. - - Arguments: jaccard, ksize, scaled, n_unique_kmers - # Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error) - - # Returns: point_estimate_of_mutation_rate - """ - # NTP question: does this equation consider variance of jaccard due to scaled? - # Or was that only used for estimating the CI around the pt estimate? - error_lower_bound = None - if sequence_len_bp and not n_unique_kmers: - n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) - if jaccard <= 0.0001: - point_estimate = 1.0 - elif jaccard >= 0.9999: - point_estimate = 0.0 - else: - point_estimate = 1.0 - ( 2.0 * jaccard / float(1+jaccard) ) ** (1.0/float(ksize)) - - exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate) - var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate) - error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3 - - if error_lower_bound is not None and error_lower_bound > 10.0**(-4.0): # does this need to be modifiable? - #print(f"err: ({error_lower_bound})") - notify(f"WARNING: Error on Jaccard distance point estimate is too high ({error_lower_bound}).") - # @NTP: ruturn ANI estimate anyway, NA later instead? Probably want this value for `sig overlap, at least?` - notify(f"Returning 'NA' for this comparison and continuing.") - point_estimate = "NA" - - # get probability nothing in common - prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers) - if prob_nothing_in_common >= prob_threshold: - notify('WARNING: These sketches may have no hashes in common based on chance alone.') - - if return_identity: - point_estimate = distance_to_identity(point_estimate) - - return point_estimate, prob_nothing_in_common, error_lower_bound - diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 60f1940c36..5fa521db66 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -554,7 +554,7 @@ def _load_manifest(self): "Load a manifest if one exists" try: manifest_data = self.storage.load('SOURMASH-MANIFEST.csv') - except KeyError: + except (KeyError, FileNotFoundError): self.manifest = None else: debug_literal(f'found manifest on load for {self.storage.path}') @@ -616,18 +616,16 @@ def _signatures_with_internal(self): Note: does not limit signatures to subsets. """ - zf = self.storage.zipfile - # list all the files, without using the Storage interface; currently, # 'Storage' does not provide a way to list all the files, so :shrug:. - for zipinfo in zf.infolist(): + for filename in self.storage._filenames(): # should we load this file? if it ends in .sig OR we are forcing: - if zipinfo.filename.endswith('.sig') or \ - zipinfo.filename.endswith('.sig.gz') or \ + if filename.endswith('.sig') or \ + filename.endswith('.sig.gz') or \ self.traverse_yield_all: - fp = zf.open(zipinfo) - for ss in load_signatures(fp): - yield ss, zipinfo.filename + sig_data = self.storage.load(filename) + for ss in load_signatures(sig_data): + yield ss, filename def signatures(self): "Load all signatures in the zip file." 
@@ -652,10 +650,10 @@ def signatures(self): # if no manifest here, break Storage class encapsulation # and go for all the files. (This is necessary to support # ad-hoc zipfiles that have no manifests.) - for zipinfo in storage.zipfile.infolist(): + for filename in storage._filenames(): # should we load this file? if it ends in .sig OR force: - if zipinfo.filename.endswith('.sig') or \ - zipinfo.filename.endswith('.sig.gz') or \ + if filename.endswith('.sig') or \ + filename.endswith('.sig.gz') or \ self.traverse_yield_all: if selection_dict: select = lambda x: select_signature(x, @@ -663,7 +661,7 @@ def signatures(self): else: select = lambda x: True - data = self.storage.load(zipinfo.filename) + data = self.storage.load(filename) for ss in load_signatures(data): if select(ss): yield ss diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index fb9119def4..8f88d0c11f 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -510,10 +510,13 @@ def find(self, search_fn, query, **kwargs): score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) - # note to self: even with JaccardSearchBestOnly, this will + # CTB note to self: even with JaccardSearchBestOnly, this will # still iterate over & score all signatures. We should come # up with a protocol by which the JaccardSearch object can # signal that it is done, or something. + # For example, see test_lca_jaccard_ordering, where + # for containment we could be done early, but for Jaccard we + # cannot. if search_fn.passes(score): if search_fn.collect(score, subj): if passes_all_picklists(subj, self.picklists): diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 5e5a14c4cf..d2323ae8ab 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -647,7 +647,7 @@ def jaccard(self, other, downsample=False): raise TypeError(err) return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample) - def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): + def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1e-3, err_threshold=1e-4): "Calculate Jaccard --> ANI of two MinHash objects." self_mh = self other_mh = other @@ -659,11 +659,12 @@ def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): if jaccard is None: jaccard = self_mh.similarity(other_mh, ignore_abundance=True) avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2) - avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self_mh.ksize, scaled, - n_unique_kmers=avg_n_kmers, confidence=confidence, - return_identity=True) - return j_ani, ani_low, ani_high + avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate - see #1798 + j_aniresult = jaccard_to_distance(jaccard, self_mh.ksize, scaled, + n_unique_kmers=avg_n_kmers, + prob_threshold = prob_threshold, + err_threshold = err_threshold) + return j_aniresult def similarity(self, other, ignore_abundance=False, downsample=False): """Calculate similarity of two sketches. @@ -702,7 +703,7 @@ def contained_by(self, other, downsample=False): return self.count_common(other, downsample) / len(self) - def containment_ani(self, other, downsample=False, containment=None, confidence=0.95): + def containment_ani(self, other, *, downsample=False, containment=None, confidence=0.95, estimate_ci = False): "Estimate ANI from containment with the other MinHash." 
self_mh = self other_mh = other @@ -713,11 +714,13 @@ def containment_ani(self, other, downsample=False, containment=None, confidence= other_mh = other.downsample(scaled=scaled) if containment is None: containment = self_mh.contained_by(other_mh) - n_kmers = len(self_mh) * scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, self_mh.ksize, self_mh.scaled, + n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798 + + c_aniresult = containment_to_distance(containment, self_mh.ksize, self_mh.scaled, n_unique_kmers=n_kmers, confidence=confidence, - return_identity=True) - return c_ani, ani_low, ani_high + estimate_ci = estimate_ci) + return c_aniresult + def max_containment(self, other, downsample=False): """ @@ -731,7 +734,7 @@ def max_containment(self, other, downsample=False): return self.count_common(other, downsample) / min_denom - def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95): + def max_containment_ani(self, other, *, downsample=False, max_containment=None, confidence=0.95, estimate_ci=False): "Estimate ANI from containment with the other MinHash." self_mh = self other_mh = other @@ -743,11 +746,12 @@ def max_containment_ani(self, other, downsample=False, max_containment=None, con if max_containment is None: max_containment = self_mh.max_containment(other_mh) min_n_kmers = min(len(self_mh), len(other_mh)) - n_kmers = min_n_kmers * scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, self_mh.ksize, scaled, - n_unique_kmers=n_kmers,confidence=confidence, - return_identity=True) - return c_ani, ani_low, ani_high + n_kmers = min_n_kmers * scaled # would be better if hll estimate - see #1798 + + c_aniresult = containment_to_distance(max_containment, self_mh.ksize, scaled, + n_unique_kmers=n_kmers,confidence=confidence, + estimate_ci = estimate_ci) + return c_aniresult def __add__(self, other): if not isinstance(other, MinHash): diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index fc306782b6..f48377e4cb 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -1,5 +1,6 @@ "Picklist code for extracting subsets of signatures." import csv +import os from enum import Enum # set up preprocessing functions for column stuff @@ -143,18 +144,23 @@ def load(self, pickfile, column_name): "load pickset, return num empty vals, and set of duplicate vals." 
pickset = self.init() + if not os.path.exists(pickfile) or not os.path.isfile(pickfile): + raise ValueError(f"pickfile '{pickfile}' must exist and be a regular file") + n_empty_val = 0 dup_vals = set() with open(pickfile, newline='') as csvfile: x = csvfile.readline() # skip leading comment line in case there's a manifest header - if x[0] == '#': + if not x or x[0] == '#': pass else: csvfile.seek(0) r = csv.DictReader(csvfile) + if not r.fieldnames: + raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'") if column_name not in r.fieldnames: raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'") diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index d974c23c91..bb001fd940 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -625,7 +625,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): kind = "Zip" if not path.endswith('.sbt.zip'): path += '.sbt.zip' - storage = ZipStorage(path) + storage = ZipStorage(path, mode="w") backend = "FSStorage" assert path[-8:] == '.sbt.zip' @@ -1440,6 +1440,8 @@ def convert_cmd(name, backend): backend = options.pop(0) backend = backend.lower().strip("'") + kwargs = {} + if options: print(options) options = options[0].split(')') @@ -1454,6 +1456,7 @@ def convert_cmd(name, backend): backend = RedisStorage elif backend.lower() in ('zip', 'zipstorage'): backend = ZipStorage + kwargs['mode'] = 'w' elif backend.lower() in ('fs', 'fsstorage'): backend = FSStorage if options: @@ -1469,6 +1472,6 @@ def convert_cmd(name, backend): else: error('backend not recognized: {}'.format(backend)) - with backend(*options) as storage: + with backend(*options, **kwargs) as storage: sbt = SBT.load(name, leaf_loader=SigLeaf.load) sbt.save(name, storage=storage) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 695832fa18..398e11c877 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -9,6 +9,11 @@ from abc import ABC from pathlib import Path +from ._lowlevel import ffi, lib +from .utils import RustObject, rustcall, decode_str +from .minhash import to_bytes + + class Storage(ABC): @abc.abstractmethod @@ -89,7 +94,110 @@ def load(self, path): return path.read_bytes() -class ZipStorage(Storage): +class ZipStorage(RustObject, Storage): + + __dealloc_func__ = lib.zipstorage_free + + def __init__(self, path, *, mode="r"): + if mode == "w": + self.__inner = _RwZipStorage(path) + else: + self.__inner = None + path = os.path.abspath(path) + self._objptr = rustcall(lib.zipstorage_new, to_bytes(path), len(path)) + + @staticmethod + def can_open(location): + return zipfile.is_zipfile(location) + + @property + def path(self): + if self.__inner: + return self.__inner.path + return decode_str(self._methodcall(lib.zipstorage_path)) + + @property + def subdir(self): + if self.__inner: + return self.__inner.subdir + return decode_str(self._methodcall(lib.zipstorage_subdir)) + + @subdir.setter + def subdir(self, value): + if self.__inner: + self.__inner.subdir = value + else: + self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value)) + + def _filenames(self): + if self.__inner: + return self.__inner._filenames() + + size = ffi.new("uintptr_t *") + paths_ptr = self._methodcall(lib.zipstorage_filenames, size) + size = size[0] + + paths = [] + for i in range(size): + path = decode_str(paths_ptr[i][0]) + paths.append(path) + + return paths + + def save(self, path, content, *, overwrite=False, compress=False): + if self.__inner: + return self.__inner.save(path, 
diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py
index 695832fa18..398e11c877 100644
--- a/src/sourmash/sbt_storage.py
+++ b/src/sourmash/sbt_storage.py
@@ -9,6 +9,11 @@ from abc import ABC
 from pathlib import Path
 
+from ._lowlevel import ffi, lib
+from .utils import RustObject, rustcall, decode_str
+from .minhash import to_bytes
+
+
 class Storage(ABC):
 
     @abc.abstractmethod
@@ -89,7 +94,106 @@ def load(self, path):
         return path.read_bytes()
 
-class ZipStorage(Storage):
+class ZipStorage(RustObject, Storage):
+
+    __dealloc_func__ = lib.zipstorage_free
+
+    def __init__(self, path, *, mode="r"):
+        if mode == "w":
+            self.__inner = _RwZipStorage(path)
+        else:
+            self.__inner = None
+            path = os.path.abspath(path)
+            self._objptr = rustcall(lib.zipstorage_new, to_bytes(path), len(path))
+
+    @staticmethod
+    def can_open(location):
+        return zipfile.is_zipfile(location)
+
+    @property
+    def path(self):
+        if self.__inner:
+            return self.__inner.path
+        return decode_str(self._methodcall(lib.zipstorage_path))
+
+    @property
+    def subdir(self):
+        if self.__inner:
+            return self.__inner.subdir
+        return decode_str(self._methodcall(lib.zipstorage_subdir))
+
+    @subdir.setter
+    def subdir(self, value):
+        if self.__inner:
+            self.__inner.subdir = value
+        else:
+            self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value))
+
+    def _filenames(self):
+        if self.__inner:
+            return self.__inner._filenames()
+
+        size = ffi.new("uintptr_t *")
+        paths_ptr = self._methodcall(lib.zipstorage_filenames, size)
+        size = size[0]
+
+        paths = []
+        for i in range(size):
+            path = decode_str(paths_ptr[i][0])
+            paths.append(path)
+
+        return paths
+
+    def save(self, path, content, *, overwrite=False, compress=False):
+        if self.__inner:
+            return self.__inner.save(path, content, overwrite=overwrite, compress=compress)
+        raise NotImplementedError()
+
+    def load(self, path):
+        if self.__inner:
+            return self.__inner.load(path)
+
+        try:
+            size = ffi.new("uintptr_t *")
+            rawbuf = self._methodcall(lib.zipstorage_load, to_bytes(path), len(path), size)
+            size = size[0]
+
+            rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, size), size)
+            buf = ffi.buffer(rawbuf, size)
+
+            # TODO: maybe avoid the [:] here, it triggers a copy...
+            return buf[:]
+        except ValueError:
+            raise FileNotFoundError(path)
+
+    def list_sbts(self):
+        if self.__inner:
+            return self.__inner.list_sbts()
+
+        size = ffi.new("uintptr_t *")
+        paths_ptr = self._methodcall(lib.zipstorage_list_sbts, size)
+        size = size[0]
+
+        paths = []
+        for i in range(size):
+            path = decode_str(paths_ptr[i][0])
+            paths.append(path)
+
+        return paths
+
+    def init_args(self):
+        return {'path': self.path}
+
+    def flush(self):
+        if self.__inner:
+            self.__inner.flush()
+
+    def close(self):
+        if self.__inner:
+            self.__inner.close()
+
+
+class _RwZipStorage(Storage):
 
     def __init__(self, path):
         self.path = os.path.abspath(path)
@@ -119,6 +227,9 @@ def __init__(self, path):
         if len(subdirs) == 1:
             self.subdir = subdirs[0]
 
+    def _filenames(self):
+        return [info.filename for info in self.zipfile.infolist()]
+
     def _content_matches(self, zf, path, content):
         info = zf.getinfo(path)
         entry_content = zf.read(info)
@@ -209,9 +320,6 @@ def load(self, path):
         else:
             raise FileNotFoundError(path)
 
-    def init_args(self):
-        return {'path': self.path}
-
     def close(self):
         # TODO: this is not ideal; checking for zipfile.fp is looking at
         # internal implementation details from CPython...
@@ -285,10 +393,6 @@ def flush(self, *, keep_closed=False):
             self.bufferzip.close()
             self.bufferzip = None
 
-    @staticmethod
-    def can_open(location):
-        return zipfile.is_zipfile(location)
-
     def list_sbts(self):
         return [f for f in self.zipfile.namelist() if f.endswith(".sbt.json")]
diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py
index dc001013e3..60cba7f43f 100644
--- a/src/sourmash/sourmash_args.py
+++ b/src/sourmash/sourmash_args.py
@@ -126,15 +126,15 @@ def load_picklist(args):
     if args.picklist:
         try:
             picklist = SignaturePicklist.from_picklist_args(args.picklist)
+
+            notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")
+
+            n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
         except ValueError as exc:
             error("ERROR: could not load picklist.")
             error(str(exc))
             sys.exit(-1)
 
-        notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")
-
-        n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
-
         notify(f"loaded {len(picklist.pickset)} distinct values into picklist.")
         if n_empty_val:
             notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file")
@@ -966,7 +966,7 @@ def open(self):
         if os.path.exists(self.location):
             do_create = False
 
-        storage = ZipStorage(self.location)
+        storage = ZipStorage(self.location, mode="w")
         if not storage.subdir:
             storage.subdir = 'signatures'
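# --- Behavior sketch (illustrative, not part of the patch) ---
# With the checks added in picklist.py, a missing, non-regular, or empty
# pickfile now surfaces as a ValueError from load(), and load_picklist()
# above catches it inside the try block so the CLI reports a clean error
# instead of a traceback. 'pick.csv' is a hypothetical path; the argument
# string follows the pickfile:colname:coltype form used by --picklist.
from sourmash.picklist import SignaturePicklist

picklist = SignaturePicklist.from_picklist_args("pick.csv:md5full:md5")
try:
    n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
except ValueError as exc:
    print("could not load picklist:", exc)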
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index b09d05076b..cd93349f7e 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -1627,6 +1627,45 @@ def test_sig_extract_7_no_ksize(c):
     assert len(siglist) == 3
 
 
+def test_sig_extract_8_empty_picklist_fail(runtmp):
+    # what happens with an empty picklist?
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    # make empty picklist
+    picklist_csv = runtmp.output('pick.csv')
+    with open(picklist_csv, 'w', newline='') as csvfp:
+        pass
+
+    picklist_arg = f"{picklist_csv}:md5full:md5"
+
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+    err = runtmp.last_result.err
+    print(err)
+
+    assert "empty or improperly formatted pickfile" in err
+
+
+def test_sig_extract_8_nofile_picklist_fail(runtmp):
+    # what happens when picklist file does not exist?
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    # picklist file does not exist
+    picklist_csv = runtmp.output('pick.csv')
+    picklist_arg = f"{picklist_csv}:md5full:md5"
+
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+    err = runtmp.last_result.err
+    print(err)
+
+    assert "must exist and be a regular file" in err
+
+
 def test_sig_extract_8_picklist_md5(runtmp):
     # extract 47 from 47, using a picklist w/full md5
     sig47 = utils.get_test_data('47.fa.sig')
diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py
index c2ed4613c1..ebc4dd56d8 100644
--- a/tests/test_distance_utils.py
+++ b/tests/test_distance_utils.py
@@ -1,37 +1,70 @@
 """
 Tests for distance utils.
 """
-
 import pytest
-from sourmash.distance_utils import containment_to_distance, jaccard_to_distance, distance_to_identity, sequence_len_to_n_kmers, jaccard_to_distance_point_estimate
+from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common,
+                                     handle_seqlen_nkmers, jaccard_to_distance,
+                                     ANIResult, ciANIResult, jaccardANIResult, var_n_mutated)
+
+def test_aniresult():
+    res = ANIResult(0.4, 0.1)
+    assert res.dist == 0.4
+    assert res.ani == 0.6
+    assert res.p_nothing_in_common == 0.1
+    assert res.p_exceeds_threshold == True
+    # check that they're equivalent
+    res2 = ANIResult(0.4, 0.1)
+    assert res == res2
+    res3 = ANIResult(0.5, 0)
+    assert res != res3
+    assert res3.p_exceeds_threshold == False
+
+def test_aniresult_bad_distance():
+    """
+    Fail if distance is not between 0 and 1.
+    """
+    with pytest.raises(Exception) as exc:
+        ANIResult(1.1, 0.1)
+    print("\n", str(exc.value))
+    assert "distance value 1.1000 is not between 0 and 1!" in str(exc.value)
+    with pytest.raises(Exception) as exc:
+        ANIResult(-0.1, 0.1)
+    print("\n", str(exc.value))
+    assert "distance value -0.1000 is not between 0 and 1!" in str(exc.value)
 
-def test_distance_to_identity():
-    ident,id_low,id_high = distance_to_identity(0.5,0.4,0.6)
-    assert ident == 0.5
-    assert id_low == 0.4
-    assert id_high ==0.6
 
+def test_jaccard_aniresult():
+    res = jaccardANIResult(0.4, 0.1, jaccard_error=0.03)
+    assert res.dist == 0.4
+    assert res.ani == 0.6
+    assert res.p_nothing_in_common == 0.1
+    assert res.jaccard_error == 0.03
+    assert res.p_exceeds_threshold == True
+    assert res.je_exceeds_threshold == True
+    res2 = jaccardANIResult(0.4, 0.1, jaccard_error=0.03, je_threshold=0.1)
+    assert res2.je_exceeds_threshold == False
 
-def test_distance_to_identity_just_point_estimate():
-    ident = distance_to_identity(0.4)
-    assert ident == 0.6
 
+def test_jaccard_aniresult_nojaccarderror():
+    # jaccard error is None
+    with pytest.raises(Exception) as exc:
+        jaccardANIResult(0.4, 0.1, None)
+    print("\n", str(exc.value))
+    assert "Error: jaccard_error cannot be None." in str(exc.value)
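# --- Relationship sketch (illustrative, not part of the patch) ---
# As the assertions above show, a result's ani is one minus its dist, and
# p_exceeds_threshold appears to compare p_nothing_in_common against a
# probability threshold that defaults to 0.001 (made explicit in the
# ciANIResult(..., p_threshold=0.001) comparison further down).
from sourmash.distance_utils import ANIResult

res = ANIResult(0.4, 0.1)        # dist=0.4, p_nothing_in_common=0.1
assert res.ani == 1 - res.dist   # 0.6
assert res.p_exceeds_threshold   # since 0.1 > 0.001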
 
-def test_distance_to_identity_just_point_estimate_extra_input():
-    """
-    Ignore 2nd input, unless put both dist_low and dist_high
-    """
-    ident = distance_to_identity(0.4,0.5)
-    assert ident == 0.6
-
-def test_distance_to_identity_fail():
-    with pytest.raises(Exception) as exc:
-        ident,id_low,id_high = distance_to_identity(1.1,0.4,0.6)
-    assert "distance value 1.1 is not between 0 and 1!" in str(exc.value)
-    with pytest.raises(Exception) as exc:
-        ident,id_low,id_high = distance_to_identity(-0.1,0.4,0.6)
-    assert "distance value -0.1 is not between 0 and 1!" in str(exc.value)
 
+def test_ci_aniresult():
+    res = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5)
+    print(res)
+    assert res.dist == 0.4
+    assert res.ani == 0.6
+    assert res.p_nothing_in_common == 0.1
+    assert res.ani_low == 0.5
+    assert res.ani_high == 0.7
+    res2 = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5)
+    assert res == res2
+    res3 = ciANIResult(0.4, 0.2, dist_low=0.3, dist_high=0.5)
+    assert res != res3
 
 
 def test_containment_to_distance_zero():
@@ -39,20 +72,23 @@
     scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = containment_to_distance(contain,ksize,scaled, n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
+    res = containment_to_distance(contain,ksize,scaled, n_unique_kmers=nkmers, estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 1.0,1.0,1.0
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-    # return identity instead
-    exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0
-    ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True)
-    assert ident == exp_id
-    assert low == exp_idlow
-    assert high == exp_idhigh
+    exp_dist,exp_low,exp_high,pnc = 1.0,None,None,1.0
+    exp_id, exp_idlow,exp_idhigh,pnc = 0.0,None,None,1.0
+    assert res.dist == exp_dist
+    assert res.dist_low == exp_low
+    assert res.dist_high == exp_high
+    assert res.p_nothing_in_common == pnc
+    assert res.ani == exp_id
+    assert res.ani_low == exp_idlow
+    assert res.ani_high == exp_idhigh
+    # check without returning ci
+    res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
+    print(res2)
+    exp_res = ciANIResult(dist=1.0, p_nothing_in_common=1.0, p_threshold=0.001)
+    assert res2 == exp_res
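# --- Numeric sketch (illustrative, not part of the patch) ---
# The point estimates checked in these tests are consistent with
# dist = 1 - containment ** (1/ksize), i.e. ani = containment ** (1/ksize).
# Worked example matching test_containment_to_distance_scaled1 below
# (containment=0.5, ksize=21):
contain, ksize = 0.5, 21
dist = 1 - contain ** (1 / ksize)
print(dist)        # ~0.032468221476108394
print(1 - dist)    # ~0.9675317785238916, the expected ani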
 
 
 def test_containment_to_distance_one():
@@ -60,20 +96,25 @@
     scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    # check results
-    exp_dist, exp_low,exp_high = 0.0,0.0,0.0
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-    # return identity instead
-    exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0
-    ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True)
-    assert ident == exp_id
-    assert low == exp_idlow
-    assert high == exp_idhigh
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True)
+    print(res)
+    exp_dist, exp_low,exp_high,pnc = 0.0,None,None,0.0
+    exp_id, exp_idlow,exp_idhigh,pnc = 1.0,None,None,0.0
+    assert res.dist == exp_dist
+    assert res.dist_low == exp_low
+    assert res.dist_high == exp_high
+    assert res.p_nothing_in_common == pnc
+    assert res.ani == exp_id
+    assert res.ani_low == exp_idlow
+    assert res.ani_high == exp_idhigh
+
+    # check without returning ci
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
+    assert res.dist == exp_dist
+    assert res.ani == exp_id
+    assert res.p_nothing_in_common == pnc
+    assert res.ani_low == None
+    assert res.ani_high == None
 
 
 def test_containment_to_distance_scaled1():
@@ -81,70 +122,64 @@
     scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 0.032468221476108394,0.028709912966405623,0.03647860197289783
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-    # return identity instead
-    exp_id, exp_idlow,exp_idhigh = 0.9675317785238916,0.9635213980271021,0.9712900870335944
-    ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True)
-    print(f"{ident},{low},{high}")
-    assert ident == exp_id
-    assert low == exp_idlow
-    assert high == exp_idhigh
-
-
-def test_containment_to_distance_2():
+    assert res.dist == 0.032468221476108394
+    assert res.ani == 0.9675317785238916
+    assert res.dist_low == 0.028709912966405623
+    assert res.ani_high == 0.9712900870335944
+    assert res.dist_high == 0.03647860197289783
+    assert res.ani_low == 0.9635213980271021
+    assert res.p_nothing_in_common == 0.0
+    # without returning ci
+    res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
+    assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (0.032468221476108394, 0.9675317785238916, 0.0)
+    assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (res.dist, res.ani, res.p_nothing_in_common)
+
+
+def test_containment_to_distance_scaled100():
     contain = 0.1
     scaled = 100
     nkmers = 10000
     ksize=31
-    dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 0.07158545548052564,0.05320779238601372,0.09055547672455365
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
+    assert res.dist == 0.07158545548052564
+    assert res.dist_low == 0.05320779238601372
+    assert res.dist_high == 0.09055547672455365
+    assert res.p_nothing_in_common == 4.3171247410658655e-05
+    assert res.p_exceeds_threshold == False
 
 
-def test_containment_to_distance_scaled100():
+def test_containment_to_distance_scaled100_2():
     contain = 0.5
     scaled = 100
     nkmers = 10000
     ksize=21
-    dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 0.032468221476108394,0.023712063916639017,0.04309960543965866
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
+    assert res.dist == 0.032468221476108394
+    assert res.dist_low == 0.023712063916639017
+    assert res.dist_high == 0.04309960543965866
+    assert res.p_exceeds_threshold == False
 
 
 def test_containment_to_distance_k10():
-    jaccard = 0.5
+    contain = 0.5
     scaled = 100
     nkmers = 10000
     ksize=10
-    dist,low,high = containment_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 0.06696700846319259,0.04982777541057476,0.08745108232411622
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
+    assert res.dist == 0.06696700846319259
+    assert res.dist_low == 0.04982777541057476
+    assert res.dist_high == 0.08745108232411622
+    assert res.p_exceeds_threshold == False
+
 
 def test_containment_to_distance_confidence():
@@ -152,26 +187,21 @@
     contain = 0.1
     scaled = 100
     nkmers = 10000
     ksize=31
     confidence=0.99
-    dist,low,high = containment_to_distance(contain,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = containment_to_distance(contain,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers, estimate_ci=True)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 0.07158545548052564,0.04802880300938562,0.09619930040790341
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-
+    assert res.dist == 0.07158545548052564
+    assert res.dist_low == 0.04802880300938562
+    assert res.dist_high == 0.09619930040790341
+    assert res.p_exceeds_threshold == False
     confidence=0.90
-    dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,confidence=confidence)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,confidence=confidence, estimate_ci=True)
+    print(res2)
     # check results
-    exp_dist, exp_low,exp_high = 0.07158545548052564,0.05599435479247415,0.08758718871990222
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
+    assert res2.dist == res.dist
+    assert res2.dist_low == 0.05599435479247415
+    assert res2.dist_high == 0.08758718871990222
+    assert res.p_exceeds_threshold == False
 
 
 def test_nkmers_to_bp_containment():
@@ -179,17 +209,18 @@
     scaled = 100
     bp_len = 10030
     ksize=31
-    nkmers = sequence_len_to_n_kmers(bp_len,ksize)
-    print("nkmers_from_bp:", bp_len)
+    nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len)
+    print("nkmers_from_bp:", nkmers)
     confidence=0.99
-    kmer_dist = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    bp_dist = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len)
-    print(f"\nkDIST:", kmer_dist)
-    print(f"\nbpDIST:",bp_dist)
+    kmer_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers,estimate_ci=True)
+    bp_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len,estimate_ci=True)
+    print(f"\nkDIST: {kmer_res}")
+    print(f"\nbpDIST: {bp_res}")
     # check results
-    assert kmer_dist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
-    assert bp_dist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
-    assert kmer_dist==bp_dist
+    assert kmer_res==bp_res
+    assert kmer_res.dist == 0.07158545548052564
+    assert kmer_res.dist_low == 0.04802880300938562
+    assert kmer_res.dist_high == 0.09619930040790341
 
 
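# --- Conversion sketch (illustrative, not part of the patch) ---
# The bp -> k-mer conversion exercised above (and in test_handle_seqlen_nkmers
# further down) is the standard nkmers = sequence_len_bp - (ksize - 1):
# a 10030 bp sequence has 10030 - 30 = 10000 k-mers at k=31, which is why
# n_unique_kmers=10000 and sequence_len_bp=10030 give identical results.
bp_len, ksize = 10030, 31
nkmers = bp_len - (ksize - 1)
assert nkmers == 10000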
 def test_jaccard_to_distance_zero():
@@ -197,22 +228,13 @@ def test_jaccard_to_distance_zero():
     scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res)
     # check results
-    exp_dist, exp_low,exp_high = 1.0,1.0,1.0
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-    # return identity instead
-    exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0
-    ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers,return_identity=True)
-    print(f"{ident},{low},{high}")
-    assert ident == exp_id
-    assert low == exp_idlow
-    assert high == exp_idhigh
+    assert res.dist == 1.0
+    assert res.ani == 0.0
+    assert res.p_nothing_in_common == 1.0
+    assert res.jaccard_error == 0.0
 
 
 def test_jaccard_to_distance_one():
@@ -220,52 +242,36 @@
     scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
-    # check results
-    exp_dist, exp_low,exp_high = 0.0,0.0,0.0
-    assert dist == exp_dist
-    assert low == exp_low
-    assert high == exp_high
-    # return identity instead
-    exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0
-    ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,return_identity=True,n_unique_kmers=nkmers)
-    print(f"{ident},{low},{high}")
-    assert ident == exp_id
-    assert low == exp_idlow
-    assert high == exp_idhigh
-
-
-def test_jaccard_to_distance_scaled1():
-    jaccard = 0.5
-    scaled = 1
-    nkmers = 10000
-    ksize=21
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"({dist},{low},{high})")
+    res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res)
     # check results
-    assert (dist,low,high) == (0.019122659390482077,0.017549816764593173,0.020857520093330525)
-    # return identity instead
-    ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers,return_identity=True)
-    print(f"({ident},{low},{high})")
-    assert (ident,low, high) == (0.9808773406095179,0.9791424799066695,0.9824501832354068)
+    assert res.dist == 0.0
+    assert res.ani == 1.0
+    assert res.p_nothing_in_common == 0.0
+    assert res.jaccard_error == 0.0
 
 
-def test_jaccard_to_distance_scaled100():
+def test_jaccard_to_distance_scaled():
+    # scaled value doesn't impact point estimate or jaccard error, just p_nothing_in_common
     jaccard = 0.5
-    scaled = 100
+    scaled = 1
     nkmers = 10000
     ksize=21
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"{dist},{low},{high}")
+    res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res)
     # check results
-    (dist,low,high) = (0.019122659390482077,0.014156518319161603,0.02509547110001303)
+    assert res.dist == 0.019122659390482077
+    assert res.ani == 0.9808773406095179
+    assert res.p_exceeds_threshold == False
+    assert res.jaccard_error == 0.00018351337045518042
+    assert res.je_exceeds_threshold == True
+    scaled = 100
+    res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res2)
+    assert res2.dist == res.dist
+    assert res2.jaccard_error == res.jaccard_error
+    assert res2.p_nothing_in_common != res.p_nothing_in_common
+    assert res2.p_exceeds_threshold == False
 
 
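# --- Numeric sketch (illustrative, not part of the patch) ---
# The jaccard point estimate in these tests is consistent with first
# converting jaccard to the containment-like quantity 2j/(1+j) and then
# taking the k-th root: ani = (2j/(1+j)) ** (1/ksize). Worked example
# matching test_jaccard_to_distance_scaled above (jaccard=0.5, ksize=21):
jaccard, ksize = 0.5, 21
ani = (2 * jaccard / (1 + jaccard)) ** (1 / ksize)
print(ani)      # ~0.9808773406095179
print(1 - ani)  # ~0.019122659390482077, the expected dist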
 def test_jaccard_to_distance_k31():
@@ -273,12 +279,15 @@ def test_jaccard_to_distance_k31():
     scaled = 100
     nkmers = 10000
     ksize=31
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"({dist},{low},{high})")
+    res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res)
     # check results
-    assert (dist, low, high) == (0.012994354410710174,0.009559840649526755,0.017172145712325677)
+    assert res.ani == 0.9870056455892898
+    assert res.p_exceeds_threshold == False
+    assert res.je_exceeds_threshold == True
+    res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers, err_threshold=0.1)
+    assert res2.ani == res.ani
+    assert res2.je_exceeds_threshold == False
 
 
 def test_jaccard_to_distance_k31_2():
@@ -286,85 +295,101 @@
     scaled = 100
     nkmers = 10000
     ksize=31
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"({dist},{low},{high})")
+    res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    print(res)
     # check results
-    assert (dist, low, high) == (0.0535071608231702,0.04073963222782166,0.0667374639111566)
+    assert res.ani == 0.9464928391768298
+    assert res.p_exceeds_threshold == False
+    assert res.je_exceeds_threshold == False
 
 
-def test_jaccard_to_distance_confidence():
+def test_nkmers_to_bp_jaccard():
     jaccard = 0.1
     scaled = 100
-    nkmers = 10000
+    bp_len = 10030
     ksize=31
-    confidence=0.99
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"({dist},{low},{high})")
+    nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len)
+    print("nkmers_from_bp:", nkmers)
+    kmer_res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers)
+    bp_res = jaccard_to_distance(jaccard,ksize,scaled,sequence_len_bp=bp_len)
+    print(f"\nkmer_res: {kmer_res}")
+    print(f"\nbp_res: {bp_res}")
     # check results
-    assert (dist,low,high) == (0.0535071608231702,0.03702518586582811,0.07080999238232542)
+    assert kmer_res == bp_res
+    assert kmer_res.dist == 0.0535071608231702
+    assert kmer_res.p_exceeds_threshold == False
+    assert kmer_res.je_exceeds_threshold == False
 
-    confidence=0.90
-    dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    print("\nDIST:", dist)
-    print("CI:", low, " - ", high)
-    print(f"({dist},{low},{high})")
-    # check results
-    assert(dist,low,high) == (0.0535071608231702,0.04270836172610141,0.06462806500239282)
 
+def test_exp_prob_nothing_common():
+    dist = 0.25
+    ksize = 31
+    scaled = 10
+    bp_len = 1000030
+    nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len)
+    print("nkmers_from_bp:", nkmers)
 
-def test_nkmers_to_bp():
-    jaccard = containment = 0.1
-    scaled = 100
-    bp_len = 10030
-    ksize=31
-    nkmers = sequence_len_to_n_kmers(bp_len,ksize)
-    print("nkmers_from_bp:", bp_len)
-    confidence=0.99
-    kmer_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    bp_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len)
-    print(f"\nk_jDIST:", kmer_dist)
-    print(f"\nbp_jDIST:",bp_dist)
-    # check results
-    assert kmer_dist == (0.0535071608231702, 0.03702518586582811, 0.07080999238232542)
-    assert bp_dist == (0.0535071608231702, 0.03702518586582811, 0.07080999238232542)
-    assert kmer_dist==bp_dist
-
-    kmer_cdist = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers)
-    bp_cdist = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len)
-    print(f"\nk_cDIST:", kmer_cdist)
-    print(f"\nbp_cDIST:",bp_cdist)
-    # check results
-    assert kmer_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
-    assert bp_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
-    assert kmer_cdist==bp_cdist
+    nkmers_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,n_unique_kmers=nkmers)
+    print(f"prob nothing in common: {nkmers_pnc}")
+    bp_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,sequence_len_bp=bp_len)
+    assert nkmers_pnc == bp_pnc == 7.437016945722123e-07
 
 
-def test_jaccard_to_distance_point_estimate():
-    jaccard = 0.9
-    ksize = 21
-    scaled = 10
-    n_unique_kmers = 100000
-    # jaccard_to_distance_point_estimate usage
-    print(n_unique_kmers)
-    mut_rate, err = jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers)
-    print('Point estimate is: ' + str(mut_rate))
-    print('Error is: ' + str(err))
-#    if err > 10.0**(-4.0):
-#        print('Cannot trust this point estimate!')
-
-#    # get_exp_probability_nothing_common usage
-#    ksize = 21
-#    scaled = 1000
-#    n_unique_kmers = 10000000
-#    mutation_rate = 0.3
-#    threshold = 10.0**(-3)
-
-#    exp_probability_no_common = get_exp_probability_nothing_common(n_unique_kmers,
-#                                                                   ksize, mutation_rate, scaled)
-#    print(exp_probability_no_common)
-#    if (exp_probability_no_common >= threshold):
-#        print('There could be cases where nothing common between sketches may happen!')
\ No newline at end of file
+def test_containment_to_distance_tinytestdata_var0():
+    """
+    tiny test data to trigger the following:
+    WARNING: Cannot estimate ANI confidence intervals from containment. Do your sketches contain enough hashes?
+    Error: varN <0.0!
+    """
+    contain = 0.9
+    scaled = 1
+    nkmers = 4
+    ksize=31
+    res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers, estimate_ci=True)
+    print(res)
+    # check results
+    assert res.dist == 0.003392957179023992
+    assert res.dist_low == None
+    assert res.dist_high == None
+    assert res.ani_low == None
+    assert res.ani_high == None
+    assert res.p_exceeds_threshold == False
+
+
+def test_var_n_mutated():
+    # check 0
+    r = 0
+    ksize = 31
+    nkmers = 200
+    var_n_mut = var_n_mutated(nkmers,ksize,r)
+    print(f"var_n_mutated: {var_n_mut}")
+    assert var_n_mut == 0
+    # check var 0.0 ValueError
+    r = 10
+    ksize = 31
+    nkmers = 200
+    with pytest.raises(ValueError) as exc:
+        var_n_mut = var_n_mutated(nkmers,ksize,r)
+    assert "Error: varN <0.0!" in str(exc)
+    # check successful
+    r = 0.4
+    ksize = 31
+    nkmers = 200000
+    var_n_mut = var_n_mutated(nkmers,ksize,r)
+    print(f"var_n_mutated: {var_n_mut}")
+    assert var_n_mut == 0.10611425440741508
+
+
+def test_handle_seqlen_nkmers():
+    bp_len = 10030
+    ksize=31
+    # convert seqlen to nkmers
+    nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len)
+    assert nkmers == 10000
+    # if nkmers is provided, just use that
+    nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len, n_unique_kmers=bp_len)
+    assert nkmers == 10030
+    # if neither seqlen nor nkmers is provided, complain
+    with pytest.raises(ValueError) as exc:
+        nkmers = handle_seqlen_nkmers(ksize)
+    assert("Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'") in str(exc)
diff --git a/tests/test_index.py b/tests/test_index.py
index 35e31e0714..c0ee430ea2 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -941,8 +941,7 @@ def test_zipfile_API_signatures_traverse_yield_all(use_manifest):
     assert len(zipidx) == 8
 
     # confirm that there are 12 files in there total, incl build.sh and dirs
-    zf = zipidx.storage.zipfile
-    allfiles = [ zi.filename for zi in zf.infolist() ]
+    allfiles = zipidx.storage._filenames()
     print(allfiles)
     assert len(allfiles) == 13
diff --git a/tests/test_lca.py b/tests/test_lca.py
index 03a4ff6650..990de48864 100644
--- a/tests/test_lca.py
+++ b/tests/test_lca.py
@@ -2670,3 +2670,47 @@ def test_lca_index_with_picklist_exclude(runtmp):
     assert len(siglist) == 9
     for ss in siglist:
         assert 'Thermotoga' not in ss.name
+
+
+def test_lca_jaccard_ordering():
+    # this tests a tricky situation where for three sketches A, B, C,
+    # |A intersect B| is greater than |A intersect C|
+    # _but_
+    # |A jaccard B| is less than |A jaccard C|
+    a = sourmash.MinHash(ksize=31, n=0, scaled=2)
+    b = a.copy_and_clear()
+    c = a.copy_and_clear()
+
+    a.add_many([1, 2, 3, 4])
+    b.add_many([1, 2, 3] + list(range(10, 30)))
+    c.add_many([1, 5])
+
+    def _intersect(x, y):
+        return x.intersection_and_union_size(y)[0]
+
+    print('a intersect b:', _intersect(a, b))
+    print('a intersect c:', _intersect(a, c))
+    print('a jaccard b:', a.jaccard(b))
+    print('a jaccard c:', a.jaccard(c))
+    assert _intersect(a, b) > _intersect(a, c)
+    assert a.jaccard(b) < a.jaccard(c)
+
+    # thresholds to use:
+    assert a.jaccard(b) < 0.15
+    assert a.jaccard(c) > 0.15
+
+    # now - make signatures, try out :)
+    ss_a = sourmash.SourmashSignature(a, name='A')
+    ss_b = sourmash.SourmashSignature(b, name='B')
+    ss_c = sourmash.SourmashSignature(c, name='C')
+
+    db = sourmash.lca.LCA_Database(ksize=31, scaled=2)
+    db.insert(ss_a)
+    db.insert(ss_b)
+    db.insert(ss_c)
+
+    sr = db.search(ss_a, threshold=0.15)
+    print(sr)
+    assert len(sr) == 2
+    assert sr[0].signature == ss_a
+    assert sr[1].signature == ss_c
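# --- Arithmetic sketch (illustrative, not part of the patch) ---
# Why the ordering in test_lca_jaccard_ordering flips: intersections compare
# 3 vs 1 (B wins), but jaccard divides by the union, and B's large union
# drags its score below the 0.15 threshold while C stays above it.
a = {1, 2, 3, 4}
b = {1, 2, 3} | set(range(10, 30))
c = {1, 5}
print(len(a & b), len(a & c))      # 3 > 1
print(len(a & b) / len(a | b))     # 3/24 ~ 0.125 < 0.15
print(len(a & c) / len(a | c))     # 1/5  = 0.2   > 0.15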
diff --git a/tests/test_minhash.py b/tests/test_minhash.py
index 2bac12751a..4d9124c501 100644
--- a/tests/test_minhash.py
+++ b/tests/test_minhash.py
@@ -2655,62 +2655,39 @@ def test_containment(track_abundance):
 
 def test_containment_ANI():
     f1 = utils.get_test_data('2.fa.sig')
     f2 = utils.get_test_data('2+63.fa.sig')
-    f3 = utils.get_test_data('47+63.fa.sig')
     mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
     mh2 = sourmash.load_one_signature(f2, ksize=31).minhash
-    mh3 = sourmash.load_one_signature(f3, ksize=31).minhash
-
-    print("\nmh1 contained by mh2", mh1.containment_ani(mh2))
-    print("mh2 contained by mh1",mh2.containment_ani(mh1))
-    print("mh1 max containment", mh1.max_containment_ani(mh2))
-    print("mh2 max containment", mh2.max_containment_ani(mh1))
-
-    print("\nmh2 contained by mh3", mh2.containment_ani(mh3))
-    print("mh3 contained by mh2",mh3.containment_ani(mh2))
-
-    print("\nmh2 contained by mh3, CI 90%", mh2.containment_ani(mh3, confidence=0.9))
-    print("mh3 contained by mh2, CI 99%",mh3.containment_ani(mh2, confidence=0.99))
-    assert mh1.containment_ani(mh2) == (1.0, 1.0, 1.0)
-    assert mh2.containment_ani(mh1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207)
-    assert mh1.max_containment_ani(mh2) == (1.0, 1.0, 1.0)
-    assert mh2.max_containment_ani(mh1) == (1.0, 1.0, 1.0)
+    m1_cont_m2 = mh1.containment_ani(mh2, estimate_ci=True)
+    m2_cont_m1 = mh2.containment_ani(mh1, estimate_ci=True)
+    print("\nmh1 contained by mh2", m1_cont_m2)
+    print("mh2 contained by mh1", m2_cont_m1)
 
-    # containment 1 is special case. check max containment for non 0/1 values:
-    assert mh2.containment_ani(mh3) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368)
-    assert mh3.containment_ani(mh2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
-    assert mh2.max_containment_ani(mh3) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
-    assert mh3.max_containment_ani(mh2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
-    assert mh2.max_containment_ani(mh3)[0] == max(mh2.containment_ani(mh3)[0], mh3.containment_ani(mh2)[0])
-
-    # check confidence
-    assert mh2.containment_ani(mh3, confidence=0.9) == (0.9866751346467802, 0.986241913154108, 0.9870974232542815)
-    assert mh3.containment_ani(mh2, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798)
+    assert (m1_cont_m2.ani, m1_cont_m2.ani_low, m1_cont_m2.ani_high, m1_cont_m2.p_nothing_in_common) == (1.0, None, None, 0.0)
+    assert (round(m2_cont_m1.ani,3), round(m2_cont_m1.ani_low,3), round(m2_cont_m1.ani_high,3)) == (0.966, 0.965, 0.967)
+    m1_mc_m2 = mh1.max_containment_ani(mh2, estimate_ci=True)
+    m2_mc_m1 = mh2.max_containment_ani(mh1, estimate_ci=True)
+    print("mh1 max containment", m1_mc_m2)
+    print("mh2 max containment", m2_mc_m1)
+    assert m1_mc_m2 == m2_mc_m1
+    assert (m1_mc_m2.ani, m1_mc_m2.ani_low, m1_mc_m2.ani_high) == (1.0,None,None)
+
 
 def test_containment_ANI_precalc_containment():
     f1 = utils.get_test_data('2.fa.sig')
     f2 = utils.get_test_data('2+63.fa.sig')
-    f3 = utils.get_test_data('47+63.fa.sig')
     mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
     mh2 = sourmash.load_one_signature(f2, ksize=31).minhash
-    mh3 = sourmash.load_one_signature(f3, ksize=31).minhash
 
     # precalc containments and assert same results
     s1c = mh1.contained_by(mh2)
     s2c = mh2.contained_by(mh1)
-    s3c = mh2.contained_by(mh3)
-    s4c = mh3.contained_by(mh2)
     mc = max(s1c, s2c)
-    assert mh1.containment_ani(mh2, containment=s1c) == (1.0, 1.0, 1.0)
-    assert mh2.containment_ani(mh1, containment=s2c) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207)
-    assert mh1.max_containment_ani(mh2, max_containment=mc) == (1.0, 1.0, 1.0)
-    assert mh2.max_containment_ani(mh1, max_containment=mc) == (1.0, 1.0, 1.0)
-    assert mh2.containment_ani(mh3, containment=s3c) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368)
-    assert mh3.containment_ani(mh2, containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
-    assert mh3.containment_ani(mh2, containment=s4c, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798)
-    assert mh2.max_containment_ani(mh3, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
-    assert mh3.max_containment_ani(mh2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516)
+    assert mh1.containment_ani(mh2, estimate_ci=True) == mh1.containment_ani(mh2, containment=s1c, estimate_ci=True)
+    assert mh2.containment_ani(mh1) == mh2.containment_ani(mh1, containment=s2c)
+    assert mh1.max_containment_ani(mh2) == mh1.max_containment_ani(mh2, max_containment=mc)
+    assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1, max_containment=mc)
 
 
 def test_containment_ANI_downsample():
@@ -2752,12 +2729,12 @@ def test_jaccard_ANI():
     mh2 = sourmash.load_one_signature(f2).minhash
 
     print("\nJACCARD_ANI", mh1.jaccard_ani(mh2))
-    print("\nJACCARD_ANI 90% CI", mh1.jaccard_ani(mh2, confidence=0.9))
-    print("\nJACCARD_ANI 99% CI", mh1.jaccard_ani(mh2, confidence=0.99))
-    assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974)
-    assert mh1.jaccard_ani(mh2, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189)
-    assert mh1.jaccard_ani(mh2, confidence=0.99) == (0.9783711630110239, 0.9774056164150094, 0.9793173653983231)
+    m1_jani_m2 = mh1.jaccard_ani(mh2)
+    m2_jani_m1 = mh2.jaccard_ani(mh1)
+
+    assert m1_jani_m2 == m2_jani_m1
+    assert (m1_jani_m2.ani, m1_jani_m2.p_nothing_in_common, m1_jani_m2.jaccard_error) == (0.9783711630110239, 0.0, 3.891666770716877e-07)
 
 
 def test_jaccard_ANI_precalc_jaccard():
@@ -2769,8 +2746,9 @@
     jaccard = mh1.jaccard(mh2)
 
     print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard))
-    assert mh1.jaccard_ani(mh2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974)
-    assert mh1.jaccard_ani(mh2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189)
+    assert mh1.jaccard_ani(mh2) == mh1.jaccard_ani(mh2, jaccard=jaccard) == mh2.jaccard_ani(mh1, jaccard=jaccard)
+    wrong_jaccard = jaccard - 0.1
+    assert mh1.jaccard_ani(mh2) != mh1.jaccard_ani(mh2, jaccard=wrong_jaccard)
 
 
 def test_jaccard_ANI_downsample():
@@ -2793,3 +2771,21 @@
     assert mh1.scaled == mh2.scaled
     ds_j_manual = mh1.jaccard_ani(mh2)
     assert ds_s1c == ds_s2c == ds_j_manual
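# --- Guard sketch (illustrative, not part of the patch) ---
# The test added below exercises the degenerate case where sketches hold so
# few hashes that the variance estimate for the number of mutated k-mers
# goes negative ("Error: varN <0.0!"); the CI bounds then come back as None
# while the point estimate is still set, so callers should guard on them.
from sourmash.minhash import MinHash

tiny1 = MinHash(0, 21, scaled=1)
tiny2 = MinHash(0, 21, scaled=1)
tiny1.add_many((1, 3, 4))
tiny2.add_many((1, 2, 3, 4))

res = tiny2.containment_ani(tiny1, estimate_ci=True)
if res.ani_low is None or res.ani_high is None:
    print("CI unavailable for tiny sketches; point estimate:", res.ani)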
+ """ + mh1 = MinHash(0, 21, scaled=1, track_abundance=False) + mh2 = MinHash(0, 21, scaled=1, track_abundance=False) + + mh1.add_many((1, 3, 4)) + mh2.add_many((1, 2, 3, 4)) + + m2_cani_m1 = mh2.containment_ani(mh1, estimate_ci=True) + print(m2_cani_m1) + assert m2_cani_m1.ani == 0.986394259982259 + assert m2_cani_m1.ani_low == None + assert m2_cani_m1.ani_high == None diff --git a/tests/test_sbt.py b/tests/test_sbt.py index cb5b043c91..f31aa8c77c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -367,7 +367,7 @@ def test_sbt_zipstorage(tmpdir): old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} print(*old_result, sep='\n') - with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: + with ZipStorage(str(tmpdir.join("tree.sbt.zip")), mode="w") as storage: tree.save(str(tmpdir.join("tree.sbt.json")), storage=storage) with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: @@ -921,7 +921,7 @@ def test_gather_single_return(c): sig47 = load_one_signature(sig47file, ksize=31) sig63 = load_one_signature(sig63file, ksize=31) - # construct LCA Database + # construct SBT Database factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) @@ -937,6 +937,51 @@ def test_gather_single_return(c): assert results[0][0] == 1.0 +def test_sbt_jaccard_ordering(runtmp): + # this tests a tricky situation where for three sketches A, B, C, + # |A intersect B| is greater than |A intersect C| + # _but_ + # |A jaccard B| is less than |A intersect B| + a = sourmash.MinHash(ksize=31, n=0, scaled=2) + b = a.copy_and_clear() + c = a.copy_and_clear() + + a.add_many([1, 2, 3, 4]) + b.add_many([1, 2, 3] + list(range(10, 30))) + c.add_many([1, 5]) + + def _intersect(x, y): + return x.intersection_and_union_size(y)[0] + + print('a intersect b:', _intersect(a, b)) + print('a intersect c:', _intersect(a, c)) + print('a jaccard b:', a.jaccard(b)) + print('a jaccard c:', a.jaccard(c)) + assert _intersect(a, b) > _intersect(a, c) + assert a.jaccard(b) < a.jaccard(c) + + # thresholds to use: + assert a.jaccard(b) < 0.15 + assert a.jaccard(c) > 0.15 + + # now - make signatures, try out :) + ss_a = sourmash.SourmashSignature(a, name='A') + ss_b = sourmash.SourmashSignature(b, name='B') + ss_c = sourmash.SourmashSignature(c, name='C') + + factory = GraphFactory(31, 1e5, 4) + db = SBT(factory, d=2) + db.insert(ss_a) + db.insert(ss_b) + db.insert(ss_c) + + sr = db.search(ss_a, threshold=0.15) + print(sr) + assert len(sr) == 2 + assert sr[0].signature == ss_a + assert sr[1].signature == ss_c + + def test_sbt_protein_command_index(runtmp): c = runtmp diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 38ba835cb4..955efc7301 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -982,6 +982,17 @@ def test_search_ignore_abundance(runtmp): assert out1 != out2 +def test_search_abund_subj_flat(runtmp): + # test Index.search_abund requires an abund subj + sig47 = utils.get_test_data('track_abund/47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.sourmash('search', sig47, sig63) + + assert "'search_abund' requires subject signatures with abundance information" in str(exc.value) + + def test_search_abund_csv(runtmp): # test search with abundance signatures, look at CSV output testdata1 = utils.get_test_data('short.fa') diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 9f7912ade9..ba4aeb949a 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ 
diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py
index 9f7912ade9..ba4aeb949a 100644
--- a/tests/test_sourmash_sketch.py
+++ b/tests/test_sourmash_sketch.py
@@ -511,7 +511,8 @@ def test_manifest_row_to_compute_parameters_4():
 
 
 def test_bad_compute_parameters():
-    p = ComputeParameters([31], 42, 0, 0, 0, 0, 0, True, 1000)
+    p = ComputeParameters(ksizes=[31], seed=42, dna=0, protein=0, dayhoff=0,
+                          hp=0, num_hashes=0, track_abundance=True, scaled=1000)
     with pytest.raises(AssertionError):
         p.moltype
diff --git a/tox.ini b/tox.ini
index ec4576ea10..af5250fbee 100644
--- a/tox.ini
+++ b/tox.ini
@@ -95,7 +95,7 @@ deps =
 changedir = {toxinidir}
 commands =
     asv machine --yes
-    asv continuous latest HEAD
+    asv continuous latest HEAD {posargs}
 
 [testenv:docs]
 description = invoke sphinx-build to build the HTML docs
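# --- Usage sketch (illustrative, not part of the patch) ---
# The test change above moves ComputeParameters to keyword arguments, making
# the nine positional fields self-documenting at call sites; the test itself
# passes all-zero moltypes on purpose, to assert failure. The import path
# below is an assumption. The tox change similarly forwards extra command
# line arguments ({posargs}) to `asv continuous`.
from sourmash.command_compute import ComputeParameters  # assumed import path

p = ComputeParameters(ksizes=[31], seed=42, dna=1, protein=0, dayhoff=0,
                      hp=0, num_hashes=0, track_abundance=True, scaled=1000)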