From c0ca449051cbd5a2792fbaa3c908d17bffaf5e4a Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Fri, 2 Feb 2024 15:29:56 -0800 Subject: [PATCH 01/14] MRG: update readme with maintainers & sourmash comparison info (#2965) Closes #2911. Since the world of k-mer software is pretty fast-evolving, I thought it might be better to just discuss some of our special functionality here rather than try to compare against a litany of programs. This PR also updates README.md and `pyproject.toml` with @bluegenes as maintainer! --------- Co-authored-by: Luiz Irber Co-authored-by: C. Titus Brown --- README.md | 8 +++++++- pyproject.toml | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 085c07309e..f12d6a65ce 100644 --- a/README.md +++ b/README.md @@ -36,12 +36,18 @@ to upgrade! ---- +sourmash is a k-mer analysis multitool, and we aim to provide stable, robust programmatic and command-line APIs for a variety of sequence comparisons. Some of our special sauce includes: +- `FracMinHash` sketching, which enables accurate comparisons (including ANI) between data sets of different sizes +- `sourmash gather`, a combinatorial k-mer approach for more accurate metagenomic profiling + +Please see the [sourmash publications](https://sourmash.readthedocs.io/en/latest/publications.html#sourmash-fundamentals) for details. + The name is a riff off of [Mash](https://github.com/marbl/Mash), combined with @ctb's love of whiskey. ([Sour mash](https://en.wikipedia.org/wiki/Sour_mash) is used in making whiskey.) -Primary authors: [C. Titus Brown](mailto:titus@idyll.org) ([@ctb](http://github.com/ctb)) and [Luiz C. Irber, Jr](mailto:sourmash@luizirber.org) ([@luizirber](http://github.com/luizirber)). +Maintainers: [C. Titus Brown](mailto:titus@idyll.org) ([@ctb](http://github.com/ctb)), [Luiz C. Irber, Jr](mailto:luiz@sourmash.bio) ([@luizirber](http://github.com/luizirber)), and [N. Tessa Pierce-Ward](mailto:tessa@sourmash.bio) ([@bluegenes](http://github.com/bluegenes)). sourmash was initially developed by the [Lab for Data-Intensive Biology](http://ivory.idyll.org/lab/) at the diff --git a/pyproject.toml b/pyproject.toml index 038c7c3637..d1de447e72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,9 @@ authors = [ ] maintainers = [ - { name="Luiz Irber", email="luiz@sourmash.bio", orcid="0000-0003-4371-9659" }, + { name="Luiz C. Irber, Jr", email="luiz@sourmash.bio", orcid="0000-0003-4371-9659" }, { name="C. Titus Brown", email="titus@idyll.org", orcid="0000-0001-6001-2677" }, + { name="N. Tessa Pierce-Ward", email="ntpierce@ucdavis.edu", orcid="0000-0002-2942-5331" }, ] classifiers = [ From 1692eb729f6c84fc559518b8a8bf6c7dc0c23c1e Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Fri, 2 Feb 2024 16:10:20 -0800 Subject: [PATCH 02/14] MRG: update JOSS paper per pyopensci review (#2964) Fixes https://github.com/sourmash-bio/sourmash/issues/2912 - adds DOIs to references where they were missing - updates publication information for the Rahman Hera (ANI) paper - adds taxonomic and functional profiling references - sets line length to 80 per markdown specifications Note: citations are >450 if we include all papers (364 with just original JOSS + F1000). --------- Co-authored-by: C.
Titus Brown --- paper.bib | 39 +++++++++++++++++++++++++++------ paper.md | 64 +++++++++++++++++++++++++++++++++---------------------- 2 files changed, 71 insertions(+), 32 deletions(-) diff --git a/paper.bib b/paper.bib index 03f1ffd46d..c04d791b43 100644 --- a/paper.bib +++ b/paper.bib @@ -34,7 +34,10 @@ @article{Pierce:2019 title = {Large-scale sequence comparisons with sourmash}, journal = {F1000Research} } + @article{gather, + doi = {10.1101/2022.01.11.475838}, + url = {https://doi.org/10.1101/2022.01.11.475838}, title={Lightweight compositional analysis of metagenomes with FracMinHash and minimum metagenome covers}, author={Irber, Luiz Carlos and Brooks, Phillip T and Reiter, Taylor E and Pierce-Ward, N Tessa and Hera, Mahmudur Rahman and Koslicki, David and Brown, C Titus}, journal={bioRxiv}, @@ -43,6 +46,8 @@ @article{gather } @article{branchwater, + doi = {10.1101/2022.11.02.514947}, + url={https://doi.org/10.1101/2022.11.02.514947}, title={Sourmash Branchwater Enables Lightweight Petabyte-Scale Sequence Search}, author={Irber, Luiz Carlos and Pierce-Ward, N Tessa and Brown, C Titus}, journal={bioRxiv}, @@ -51,6 +56,8 @@ @article{branchwater } @article{koslicki2019improving, + doi={10.1016/j.amc.2019.02.018}, + url={https://doi.org/10.1016/j.amc.2019.02.018}, title={Improving minhash via the containment index with applications to metagenomic analysis}, author={Koslicki, David and Zabeti, Hooman}, journal={Applied Mathematics and Computation}, @@ -60,10 +67,30 @@ @article{koslicki2019improving publisher={Elsevier} } -@article{hera2022debiasing, - title={Debiasing FracMinHash and deriving confidence intervals for mutation rates across a wide range of evolutionary distances}, - author={Hera, Mahmudur Rahman and Pierce-Ward, N Tessa and Koslicki, David}, - journal={bioRxiv}, - year={2022}, - publisher={Cold Spring Harbor Laboratory} +@article{hera2023deriving, + doi={10.1101/gr.277651.123}, + url={https://doi.org/10.1101/gr.277651.123}, + title={Deriving confidence intervals for mutation rates across a wide range of evolutionary distances using FracMinHash}, + author={Rahman Hera, Mahmudur and Pierce-Ward, N Tessa and Koslicki, David}, + journal={Genome Research}, + pages={gr--277651}, + year={2023}, + publisher={Cold Spring Harbor Lab} +} + +@article{liu2023fast, + doi={10.1101/2023.11.06.565843}, + url={https://doi.org/10.1101/2023.11.06.565843}, + title={Fast, lightweight, and accurate metagenomic functional profiling using FracMinHash sketches}, + author={Liu, S and Wei, W and Ma, C and Koslicki, D and others}, + year={2023} +} + +@article{portik2022evaluation, + doi={10.1186/s12859-022-05103-0}, + url={https://doi.org/10.1186/s12859-022-05103-0}, + title={Evaluation of taxonomic profiling methods for long-read shotgun metagenomic sequencing datasets}, + author={Portik, Daniel M and Brown, C Titus and Pierce-Ward, N Tessa}, + journal={BMC Bioinformatics}, + year={2022} } diff --git a/paper.md b/paper.md index 2a04b4575a..84fe6fb825 100644 --- a/paper.md +++ b/paper.md @@ -1,5 +1,6 @@ --- -title: 'sourmash: a tool to quickly search, compare, and analyze genomic and metagenomic data sets' +title: 'sourmash: a tool to quickly search, compare, and analyze genomic +and metagenomic data sets' tags: - FracMinHash - MinHash @@ -114,38 +115,49 @@ affiliations: - name: No affiliation index: 9 -date: 27 Mar 2023 +date: 31 Jan 2024 bibliography: paper.bib --- # Summary -sourmash is a command line tool and Python library for sketching -collections of DNA, RNA, and amino acid k-mers for biological
sequence -search, comparison, and analysis [@Pierce:2019]. sourmash's FracMinHash sketching supports fast and accurate sequence comparisons between datasets of different sizes [@gather], including petabase-scale database search [@branchwater]. From release 4.x, sourmash is built on top of Rust and provides an experimental Rust interface. +sourmash is a command line tool and Python library for sketching collections +of DNA, RNA, and amino acid k-mers for biological sequence search, comparison, +and analysis [@Pierce:2019]. sourmash's FracMinHash sketching supports fast and +accurate sequence comparisons between datasets of different sizes [@gather], +including taxonomic profiling [@portik2022evaluation], functional profiling +[@liu2023fast], and petabase-scale sequence search [@branchwater]. From +release 4.x, sourmash is built on top of Rust and provides an experimental +Rust interface. -FracMinHash sketching is a lossy compression approach that represents -data sets using a "fractional" sketch containing $1/S$ of the original -k-mers. Like other sequence sketching techniques (e.g. MinHash, [@Ondov:2015]), FracMinHash provides a lightweight way to store representations of large DNA or RNA sequence collections for comparison and search. Sketches can be used to identify samples, find similar samples, identify data sets with shared sequences, and build phylogenetic trees. FracMinHash sketching supports estimation of overlap, bidirectional containment, and Jaccard similarity between data sets and is accurate even for data sets of very different sizes. +FracMinHash sketching is a lossy compression approach that represents data +sets using a "fractional" sketch containing $1/S$ of the original k-mers. Like +other sequence sketching techniques (e.g. MinHash, [@Ondov:2015]), FracMinHash +provides a lightweight way to store representations of large DNA or RNA +sequence collections for comparison and search. Sketches can be used to +identify samples, find similar samples, identify data sets with shared +sequences, and build phylogenetic trees. FracMinHash sketching supports +estimation of overlap, bidirectional containment, and Jaccard similarity +between data sets and is accurate even for data sets of very different sizes. Since sourmash v1 was released in 2016 [@Brown:2016], sourmash has expanded to support new database types and many more command line functions. In particular, sourmash now has robust support for both Jaccard similarity -and containment calculations, which enables analysis and comparison of data sets -of different sizes, including large metagenomic samples. As of v4.4, +and containment calculations, which enables analysis and comparison of data +sets of different sizes, including large metagenomic samples. As of v4.4, sourmash can convert these to estimated Average Nucleotide Identity (ANI) -values, which can provide improved biological context to sketch comparisons [@hera2022debiasing]. +values, which can provide improved biological context to sketch comparisons +[@hera2023deriving]. # Statement of Need -Large collections of genomes, transcriptomes, and raw sequencing data -sets are readily available in biology, and the field needs lightweight -computational methods for searching and summarizing the content of -both public and private collections. sourmash provides a flexible set -of programmatic functionality for this purpose, together with a robust -and well-tested command-line interface.
It has been used in well over 200 -publications (based on citations of @Brown:2016 and @Pierce:2019) and it continues -to expand in functionality. +Large collections of genomes, transcriptomes, and raw sequencing data sets are +readily available in biology, and the field needs lightweight computational +methods for searching and summarizing the content of both public and private +collections. sourmash provides a flexible set of programmatic functionality +for this purpose, together with a robust and well-tested command-line +interface. It has been used in over 350 publications (based on citations of +@Brown:2016 and @Pierce:2019) and it continues to expand in functionality. # Acknowledgements @@ -153,12 +165,12 @@ This work is funded in part by the Gordon and Betty Moore Foundation’s Data-Driven Discovery Initiative [GBMF4551 to CTB]. Notice: This manuscript has been authored by BNBI under Contract -No. HSHQDC-15-C-00064 with the DHS. The US Government retains -and the publisher, by accepting the article for publication, acknowledges -that the USG retains a non-exclusive, paid-up, irrevocable, world-wide -license to publish or reproduce the published form of this manuscript, -or allow others to do so, for USG purposes. Views and conclusions -contained herein are those of the authors and should not be interpreted -to represent policies, expressed or implied, of the DHS. +No. HSHQDC-15-C-00064 with the DHS. The US Government retains and the +publisher, by accepting the article for publication, acknowledges that the USG +retains a non-exclusive, paid-up, irrevocable, world-wide license to publish +or reproduce the published form of this manuscript, or allow others to do +so, for USG purposes. Views and conclusions contained herein are those of +the authors and should not be interpreted to represent policies, expressed +or implied, of the DHS. # References From 7b1830289a936b50120c5504d994e780e2b46c88 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 4 Feb 2024 19:47:23 -0800 Subject: [PATCH 03/14] Clean up and refactor `KmerMinHash::merge` in core (#2973) Use `Option::iter` to simplify code in `KmerMinHash::merge`, as well as rewriting some checks so they are less redundant. This wasn't quite the change I was going for, but I also noticed this lowered the Python test suite runtime from 58s to 49s on my dev machine, mostly because `similarity` is 17% faster, so might as well merge it. --- src/core/src/ffi/utils.rs | 2 +- src/core/src/sketch/minhash.rs | 141 +++++++++++++++------------------ src/core/src/storage.rs | 2 +- 3 files changed, 65 insertions(+), 80 deletions(-) diff --git a/src/core/src/ffi/utils.rs b/src/core/src/ffi/utils.rs index 01f2221690..f075adac38 100644 --- a/src/core/src/ffi/utils.rs +++ b/src/core/src/ffi/utils.rs @@ -15,7 +15,7 @@ use crate::errors::SourmashErrorCode; use crate::Error; thread_local!
{ - pub static LAST_ERROR: RefCell> = RefCell::new(None); + pub static LAST_ERROR: RefCell> = const { RefCell::new(None) }; } #[allow(clippy::wrong_self_convention)] diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 36f11a589e..85690a271f 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -431,99 +431,84 @@ impl KmerMinHash { pub fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error> { self.check_compatible(other)?; let max_size = self.mins.len() + other.mins.len(); - let mut merged: Vec = Vec::with_capacity(max_size); - let mut merged_abunds: Vec = Vec::with_capacity(max_size); + let mut merged: Vec = Vec::with_capacity(max_size); + let mut merged_abunds: Option> = if self.abunds.is_some() && other.abunds.is_some() { - let mut self_iter = self.mins.iter(); - let mut other_iter = other.mins.iter(); - - let mut self_abunds_iter = self.abunds.as_mut().map(|a| a.iter()); - let mut other_abunds_iter = other.abunds.as_ref().map(|a| a.iter()); - - let mut self_value = self_iter.next(); - let mut other_value = other_iter.next(); - while self_value.is_some() { - let value = self_value.unwrap(); - match other_value { - None => { - merged.push(*value); - merged.extend(self_iter); - if let Some(sai) = self_abunds_iter { - merged_abunds.extend(sai); - } - break; + Some(Vec::with_capacity(max_size)) + } else { + None + }; + + let mut self_iter = self.mins.iter(); + let mut other_iter = other.mins.iter(); + + let mut self_abunds_iter = self.abunds.iter().flatten(); + let mut other_abunds_iter = other.abunds.iter().flatten(); + + let mut self_value = self_iter.next(); + let mut other_value = other_iter.next(); + while self_value.is_some() { + let value = self_value.unwrap(); + match other_value { + None => { + merged.push(*value); + merged.extend(self_iter); + if let Some(v) = merged_abunds.as_mut() { + v.extend(self_abunds_iter) } - Some(x) if x < value => { - merged.push(*x); - other_value = other_iter.next(); - - if let Some(ref mut oai) = other_abunds_iter { - if let Some(v) = oai.next() { - merged_abunds.push(*v) - } + break; + } + Some(x) if x < value => { + merged.push(*x); + other_value = other_iter.next(); + if let Some(v) = other_abunds_iter.next() { + if let Some(n) = merged_abunds.as_mut() { + n.push(*v) } } - Some(x) if x == value => { - merged.push(*x); - other_value = other_iter.next(); - self_value = self_iter.next(); - - if let Some(ref mut oai) = other_abunds_iter { - if let Some(v) = oai.next() { - if let Some(ref mut sai) = self_abunds_iter { - if let Some(s) = sai.next() { - merged_abunds.push(*v + *s) - } - } - } + } + Some(x) if x == value => { + merged.push(*x); + other_value = other_iter.next(); + self_value = self_iter.next(); + + if let (Some(v), Some(s)) = (other_abunds_iter.next(), self_abunds_iter.next()) + { + if let Some(n) = merged_abunds.as_mut() { + n.push(*v + *s) } } - Some(x) if x > value => { - merged.push(*value); - self_value = self_iter.next(); - - if let Some(ref mut sai) = self_abunds_iter { - if let Some(v) = sai.next() { - merged_abunds.push(*v) - } + } + Some(x) if x > value => { + merged.push(*value); + self_value = self_iter.next(); + + if let Some(v) = self_abunds_iter.next() { + if let Some(n) = merged_abunds.as_mut() { + n.push(*v) } } - Some(_) => {} } - } - if let Some(value) = other_value { - merged.push(*value); - } - merged.extend(other_iter); - if let Some(oai) = other_abunds_iter { - merged_abunds.extend(oai); + Some(_) => {} } } + if let Some(value) = other_value { + 
merged.push(*value); + } + merged.extend(other_iter); + if let Some(n) = merged_abunds.as_mut() { + n.extend(other_abunds_iter) + } - if merged.len() < (self.num as usize) || (self.num as usize) == 0 { - self.mins = merged; - self.abunds = if merged_abunds.is_empty() { - if self.abunds.is_some() { - Some(vec![]) - } else { - None - } - } else { - Some(merged_abunds) - }; - } else { - self.mins = merged.into_iter().take(self.num as usize).collect(); - self.abunds = if merged_abunds.is_empty() { - if self.abunds.is_some() { - Some(vec![]) - } else { - None - } - } else { - Some(merged_abunds.into_iter().take(self.num as usize).collect()) + if merged.len() > (self.num as usize) && (self.num as usize) != 0 { + merged.truncate(self.num as usize); + if let Some(v) = merged_abunds.as_mut() { + v.truncate(self.num as usize) } } + self.mins = merged; + self.abunds = merged_abunds; self.reset_md5sum(); Ok(()) diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index ad017e65a7..17cbb7701c 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -285,7 +285,7 @@ fn lookup<'a, P: AsRef>( metadata .get(&path.as_os_str()) .ok_or_else(|| StorageError::PathNotFoundError(path.to_string()).into()) - .map(|entry| *entry) + .copied() } fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result> { From 3af9a04cdb841d33a584401b66b8c04c3f963d68 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 4 Feb 2024 20:25:41 -0800 Subject: [PATCH 04/14] MRG: update tutorial to remove bioconda & use sourmash-minimal (#2972) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tested this series on farm in an empty account 🎉 --- doc/tutorial-install.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/doc/tutorial-install.md b/doc/tutorial-install.md index 36b0215254..a3bd258890 100644 --- a/doc/tutorial-install.md +++ b/doc/tutorial-install.md @@ -22,23 +22,12 @@ source ~/.bash_profile ``` otherwise, follow the instructions [here](https://github.com/conda-forge/miniforge#install). -## Add bioconda - -sourmash is installed from the [bioconda software channel](https://bioconda.github.io/), so you'll need to add that to your config: - -``` -conda config --add channels defaults -conda config --add channels bioconda -conda config --add channels conda-forge -conda config --set channel_priority strict -``` - ## Install sourmash To install sourmash, create a new environment named `smash` and install sourmash: ``` -mamba create -y -n smash sourmash +mamba create -y -n smash sourmash-minimal ``` and then activate: From 5e6fdb9c5a527c866b371a230dfe49c7d1811011 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 4 Feb 2024 21:07:21 -0800 Subject: [PATCH 05/14] MRG: update the CLI docs and help for `search --containment` and `prefetch` (#2971) Adds useful information about the order of containment searches: * `search --containment A B` reports A contained in B; * `prefetch A B` reports B contained in A; Fixes https://github.com/sourmash-bio/sourmash/issues/2968. 
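To make the order concrete, here is a minimal sketch with hypothetical file names, where `metag.sig` is a composite (metagenome) sketch and `genome.sig` is a single-genome sketch:

```
# A = metag.sig, B = genome.sig: reports containment of A in B
sourmash search --containment metag.sig genome.sig

# opposite order: reports matches from B contained in A (the composite query)
sourmash prefetch metag.sig genome.sig
```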
--- doc/command-line.md | 12 +++++++++++- src/sourmash/cli/search.py | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/command-line.md b/doc/command-line.md index ee5421a6be..4697797e1b 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -325,6 +325,13 @@ Match information can be saved to a CSV file with `-o/--output`; with `-o`, all matches above the threshold will be saved, not just those printed to stdout (which are limited to `-n/--num-results`). +The `--containment` flag calculates the containment of the query in +database matches; this is an asymmetric order-dependent measure, +unlike Jaccard. Here, `search --containment Q A B C D` will report the +containment of `Q` in each of `A`, `B`, `C`, and `D`. This is opposite +to the order used by `prefetch`, where the composite sketch (e.g. metagenomes) +is the query, and the matches are contained items (e.g. genomes). + As of sourmash 4.2.0, `search` supports `--picklist`, to [select a subset of signatures to search, based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). This can be used to search only a small subset of a large collection, or to @@ -477,7 +484,10 @@ The `prefetch` subcommand searches a collection of scaled signatures for matches in a large database, using containment. It is similar to `search --containment`, while taking a `--threshold-bp` argument like `gather` does for thresholding matches (instead of using Jaccard -similarity or containment). +similarity or containment). Note that `prefetch` uses the composite +sketch (e.g. a metagenome) as the query, and finds all matching +subjects (e.g. genomes) from the database - the arguments are in the +opposite order from `search --containment`. `sourmash prefetch` is intended to select a subset of a large database for further processing. As such, it can search very large collections diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index fc37367d2e..2c11873963 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -35,6 +35,9 @@ [1] https://en.wikipedia.org/wiki/Jaccard_index +When `--containment` is provided, the containment of the query in each +of the search signatures or databases is reported. + --- """ From b26541505a9ba183f4f88298024e1bf195fdb21a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 5 Feb 2024 05:32:40 -0800 Subject: [PATCH 06/14] MRG: fix upload wheel CI (#2974) Update paths for upload of wheels on release, based on https://github.com/sourmash-bio/sourmash/pull/2887. 
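Both workflows below also gain `permissions: write-all`, since the release step passes `GITHUB_TOKEN` to the asset-upload action and needs write access to attach wheels to the release.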
--- .github/workflows/build_wheel.yml | 3 ++- .github/workflows/build_wheel_all_archs.yml | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index e4834ff44d..faf0ccd8af 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -1,4 +1,5 @@ name: cibuildwheel +permissions: write-all on: push: @@ -104,5 +105,5 @@ jobs: - name: Release uses: fnkr/github-action-ghr@v1 env: - GHR_PATH: ${{steps.fetch_artifacts.outputs.download-path}}/artifact + GHR_PATH: ${{steps.fetch_artifacts.outputs.download-path}} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build_wheel_all_archs.yml b/.github/workflows/build_wheel_all_archs.yml index 6f88c4c822..faf0ad881e 100644 --- a/.github/workflows/build_wheel_all_archs.yml +++ b/.github/workflows/build_wheel_all_archs.yml @@ -1,4 +1,5 @@ name: maturin wheels +permissions: write-all on: pull_request: # use for testing modifications to this action @@ -9,9 +10,6 @@ on: schedule: - cron: "0 0 * * *" # daily -permissions: - contents: read - jobs: linux: @@ -85,5 +83,5 @@ jobs: - name: Release uses: fnkr/github-action-ghr@v1 env: - GHR_PATH: ${{steps.fetch_artifacts.outputs.download-path}}/artifact + GHR_PATH: ${{steps.fetch_artifacts.outputs.download-path}} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From fee62922d8857ce93f1d4e90fd7240629d606997 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 5 Feb 2024 08:26:15 -0800 Subject: [PATCH 07/14] Pre-commit updates (#2427) Ref #2421. Fix #2908. Updates older pre-commit hooks (initial experiment: https://github.com/sourmash-bio/sourmash/pull/680), mostly based (again) on [tox configs](https://github.com/tox-dev/tox/blob/main/.pre-commit-config.yaml). Use ruff instead of pyupgrade/isort/black/flake8. This PR has a couple of commits: the first ones update configs, the last one runs `tox -e fix_lint` to apply pre-commit. Most of the effort went into updating the first commits without breaking tests on the last commit.
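The updated hooks can be run locally; a minimal sketch, assuming `pre-commit` is installed:

```
# run all configured hooks (ruff-format, ruff --fix, etc.) across the repo
pre-commit run --all-files

# or apply the same fixes via the tox environment used in the last commit
tox -e fix_lint
```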
--- .pre-commit-config.yaml | 95 +- benchmarks/benchmarks.py | 65 +- doc/conf.py | 191 +- pyproject.toml | 29 +- src/sourmash/__init__.py | 55 +- src/sourmash/__main__.py | 9 +- src/sourmash/cli/__init__.py | 75 +- src/sourmash/cli/categorize.py | 28 +- src/sourmash/cli/compare.py | 93 +- src/sourmash/cli/compute.py | 107 +- src/sourmash/cli/gather.py | 131 +- src/sourmash/cli/import_csv.py | 12 +- src/sourmash/cli/index.py | 64 +- src/sourmash/cli/info.py | 23 +- src/sourmash/cli/lca/__init__.py | 19 +- src/sourmash/cli/lca/classify.py | 51 +- src/sourmash/cli/lca/compare_csv.py | 34 +- src/sourmash/cli/lca/index.py | 80 +- src/sourmash/cli/lca/rankinfo.py | 20 +- src/sourmash/cli/lca/summarize.py | 59 +- src/sourmash/cli/migrate.py | 6 +- src/sourmash/cli/multigather.py | 58 +- src/sourmash/cli/plot.py | 76 +- src/sourmash/cli/prefetch.py | 76 +- src/sourmash/cli/sbt_combine.py | 15 +- src/sourmash/cli/scripts/__init__.py | 24 +- src/sourmash/cli/search.py | 99 +- src/sourmash/cli/sig/__init__.py | 22 +- src/sourmash/cli/sig/cat.py | 41 +- src/sourmash/cli/sig/check.py | 57 +- src/sourmash/cli/sig/collect.py | 55 +- src/sourmash/cli/sig/describe.py | 31 +- src/sourmash/cli/sig/downsample.py | 36 +- src/sourmash/cli/sig/export.py | 19 +- src/sourmash/cli/sig/extract.py | 43 +- src/sourmash/cli/sig/fileinfo.py | 24 +- src/sourmash/cli/sig/filter.py | 44 +- src/sourmash/cli/sig/flatten.py | 38 +- src/sourmash/cli/sig/grep.py | 65 +- src/sourmash/cli/sig/inflate.py | 23 +- src/sourmash/cli/sig/ingest.py | 22 +- src/sourmash/cli/sig/intersect.py | 33 +- src/sourmash/cli/sig/kmers.py | 50 +- src/sourmash/cli/sig/manifest.py | 36 +- src/sourmash/cli/sig/merge.py | 35 +- src/sourmash/cli/sig/overlap.py | 12 +- src/sourmash/cli/sig/rename.py | 38 +- src/sourmash/cli/sig/split.py | 32 +- src/sourmash/cli/sig/subtract.py | 32 +- src/sourmash/cli/sketch/__init__.py | 19 +- src/sourmash/cli/sketch/dna.py | 79 +- src/sourmash/cli/sketch/fromfile.py | 63 +- src/sourmash/cli/sketch/protein.py | 83 +- src/sourmash/cli/sketch/translate.py | 87 +- src/sourmash/cli/storage/__init__.py | 19 +- src/sourmash/cli/storage/convert.py | 13 +- src/sourmash/cli/tax/__init__.py | 22 +- src/sourmash/cli/tax/annotate.py | 71 +- src/sourmash/cli/tax/genome.py | 105 +- src/sourmash/cli/tax/grep.py | 76 +- src/sourmash/cli/tax/metagenome.py | 113 +- src/sourmash/cli/tax/prepare.py | 55 +- src/sourmash/cli/tax/summarize.py | 47 +- src/sourmash/cli/utils.py | 190 +- src/sourmash/cli/watch.py | 29 +- src/sourmash/command_compute.py | 257 +- src/sourmash/command_sketch.py | 309 +- src/sourmash/commands.py | 756 ++- src/sourmash/compare.py | 119 +- src/sourmash/distance_utils.py | 114 +- src/sourmash/exceptions.py | 11 +- src/sourmash/fig.py | 43 +- src/sourmash/hll.py | 2 +- src/sourmash/index/__init__.py | 245 +- src/sourmash/index/revindex.py | 154 +- src/sourmash/index/sqlite_index.py | 416 +- src/sourmash/lca/__init__.py | 13 +- src/sourmash/lca/__main__.py | 9 +- src/sourmash/lca/command_classify.py | 55 +- src/sourmash/lca/command_compare_csv.py | 54 +- src/sourmash/lca/command_index.py | 237 +- src/sourmash/lca/command_rankinfo.py | 7 +- src/sourmash/lca/command_summarize.py | 79 +- src/sourmash/lca/lca_db.py | 220 +- src/sourmash/lca/lca_utils.py | 90 +- src/sourmash/logging.py | 54 +- src/sourmash/manifest.py | 142 +- src/sourmash/minhash.py | 442 +- src/sourmash/nodegraph.py | 46 +- src/sourmash/np_utils.py | 6 +- src/sourmash/picklist.py | 108 +- src/sourmash/plugins.py | 80 +- src/sourmash/save_load.py | 105 +- 
src/sourmash/sbt.py | 510 +- src/sourmash/sbt_storage.py | 76 +- src/sourmash/sbtmh.py | 24 +- src/sourmash/search.py | 416 +- src/sourmash/sig/__init__.py | 2 +- src/sourmash/sig/__main__.py | 693 ++- src/sourmash/sig/grep.py | 31 +- src/sourmash/signature.py | 124 +- src/sourmash/sketchcomparison.py | 106 +- src/sourmash/sourmash_args.py | 259 +- src/sourmash/sqlite_utils.py | 23 +- src/sourmash/tax/__main__.py | 387 +- src/sourmash/tax/tax_utils.py | 1167 ++-- src/sourmash/utils.py | 2 +- tests/conftest.py | 36 +- tests/sourmash_tst_utils.py | 84 +- tests/test__minhash_hypothesis.py | 16 +- tests/test_api.py | 35 +- tests/test_bugs.py | 9 +- tests/test_cmd_signature.py | 3280 ++++++----- tests/test_cmd_signature_collect.py | 410 +- tests/test_cmd_signature_fileinfo.py | 210 +- tests/test_cmd_signature_grep.py | 268 +- tests/test_compare.py | 150 +- tests/test_deprecated.py | 7 +- tests/test_distance_utils.py | 270 +- tests/test_hll.py | 36 +- tests/test_index.py | 746 +-- tests/test_index_protocol.py | 323 +- tests/test_jaccard.py | 44 +- tests/test_lca.py | 2520 +++++---- tests/test_lca_db_protocol.py | 56 +- tests/test_lca_functions.py | 356 +- tests/test_manifest.py | 57 +- tests/test_manifest_protocol.py | 119 +- tests/test_minhash.py | 731 ++- tests/test_nodegraph.py | 22 +- tests/test_np_utils.py | 1 - tests/test_picklist.py | 34 +- tests/test_plugin_framework.py | 185 +- tests/test_prefetch.py | 799 ++- tests/test_sbt.py | 572 +- tests/test_search.py | 413 +- tests/test_signature.py | 246 +- tests/test_sketchcomparison.py | 506 +- tests/test_sourmash.py | 6435 +++++++++++++--------- tests/test_sourmash_args.py | 298 +- tests/test_sourmash_compute.py | 874 +-- tests/test_sourmash_sketch.py | 1497 +++-- tests/test_sqlite_index.py | 380 +- tests/test_tax.py | 4702 +++++++++++----- tests/test_tax_utils.py | 3945 +++++++++---- tests/test_test_framework.py | 2 +- tox.ini | 259 +- utils/cardinality_estimate_confidence.py | 86 +- utils/check-tree.py | 10 +- utils/compute-dna-mh-another-way.py | 19 +- utils/compute-input-prot-another-way.py | 108 +- utils/compute-prot-mh-another-way.py | 108 +- 152 files changed, 26467 insertions(+), 16243 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bca7329143..50ab4e2c26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,71 +1,26 @@ -default_language_version: - python: python3 repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 - hooks: - - id: check-ast -# - id: check-builtin-literals - - id: check-docstring-first - - id: check-merge-conflict - - id: check-yaml - - id: check-toml - - id: debug-statements -# - id: end-of-file-fixer -# exclude: 'tests/test-data' -# - id: trailing-whitespace -# exclude: 'tests/test-data' -#- repo: https://github.com/asottile/pyupgrade -# rev: v2.7.2 -# hooks: -# - id: pyupgrade -#- repo: https://github.com/pre-commit/mirrors-isort -# rev: v5.4.2 -# hooks: -# - id: isort -# additional_dependencies: [toml] - -# format using black -# when the full codebase is black, use it directly; -# while it isn't, let's use darker to format new/changed code -- repo: https://github.com/akaihola/darker - rev: 1.7.1 - hooks: - - id: darker -#- repo: https://github.com/psf/black -# rev: 20.8b1 -# hooks: -# - id: black -# args: -# - --safe -# language_version: python3.8 -#- repo: https://github.com/asottile/blacken-docs -# rev: v1.8.0 -# hooks: -# - id: blacken-docs -# additional_dependencies: -# - black==19.10b0 -# language_version: python3.8 - -#- repo: 
https://github.com/asottile/add-trailing-comma -# rev: v2.0.1 -# hooks: -# - id: add-trailing-comma -#- repo: https://github.com/pre-commit/pygrep-hooks -# rev: v1.6.0 -# hooks: -# - id: rst-backticks -#- repo: https://github.com/asottile/setup-cfg-fmt -# rev: v1.11.0 -# hooks: -# - id: setup-cfg-fmt -# args: -# - --min-py3-version -# - '3.7' -#- repo: https://gitlab.com/pycqa/flake8 -# rev: 3.8.3 -# hooks: -# - id: flake8 -# additional_dependencies: -# - flake8-bugbear == 20.1.2 -# language_version: python3.8 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-ast + - id: check-builtin-literals + - id: check-docstring-first + - id: check-merge-conflict + - id: check-yaml + - id: check-toml + - id: debug-statements + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.0 + hooks: + - id: ruff-format + - id: ruff + args: ["--fix", "--unsafe-fixes", "--exit-non-zero-on-fix"] + - repo: https://github.com/tox-dev/tox-ini-fmt + rev: "0.5.2" + hooks: + - id: tox-ini-fmt + args: ["-p", "fix_lint"] + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index b2b3d7180b..d517bf7b2f 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -4,30 +4,31 @@ from sourmash.sbt_storage import ZipStorage from sourmash.minhash import MinHash -RANDOM_SEQ_SIZE=3000 -RANDOM_SEQ_NUMBER=300 +RANDOM_SEQ_SIZE = 3000 +RANDOM_SEQ_NUMBER = 300 -MINHASH_NUM=500 -MINHASH_K=21 +MINHASH_NUM = 500 +MINHASH_K = 21 -GET_MINS_RANGE=500 -ADD_HASH_RANGE=10_000 -ADD_MANY_RANGE=1000 -SIMILARITY_TIMES=500 -COUNT_COMMON_TIMES=500 -MERGE_TIMES=500 -COPY_TIMES=500 -CONCAT_TIMES=500 -SET_ABUNDANCES_RANGE=500 -ZIP_STORAGE_WRITE=100_000 -ZIP_STORAGE_LOAD=20 +GET_MINS_RANGE = 500 +ADD_HASH_RANGE = 10_000 +ADD_MANY_RANGE = 1000 +SIMILARITY_TIMES = 500 +COUNT_COMMON_TIMES = 500 +MERGE_TIMES = 500 +COPY_TIMES = 500 +CONCAT_TIMES = 500 +SET_ABUNDANCES_RANGE = 500 +ZIP_STORAGE_WRITE = 100_000 +ZIP_STORAGE_LOAD = 20 def load_sequences(): sequences = [] for i in range(10): - random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE, - RANDOM_SEQ_NUMBER) + random_seq = random.sample( + "A,C,G,T".split(",") * RANDOM_SEQ_SIZE, RANDOM_SEQ_NUMBER + ) sequences.append("".join(random_seq)) return sequences @@ -35,12 +36,12 @@ def load_sequences(): class TimeMinHashSuite: def setup(self): self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False) - self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True, - track_abundance=False) + self.protein_mh = MinHash( + MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False + ) self.sequences = load_sequences() - self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, - track_abundance=False) + self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False) for seq in self.sequences: self.populated_mh.add_sequence(seq) @@ -103,8 +104,9 @@ def time_concat(self): class PeakmemMinHashSuite: def setup(self): self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) - self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, - is_protein=True, track_abundance=True) + self.protein_mh = MinHash( + MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=True + ) self.sequences = load_sequences() def peakmem_add_sequence(self): @@ -158,21 +160,25 @@ def time_set_abundances_noclear(self): for i in range(SET_ABUNDANCES_RANGE): mh.set_abundances(mins, clear=False) + class PeakmemMinAbundanceSuite(PeakmemMinHashSuite): def 
setup(self): PeakmemMinHashSuite.setup(self) self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) + #################### -class TimeZipStorageSuite: +class TimeZipStorageSuite: def setup(self): import zipfile + self.zipfile = NamedTemporaryFile() - with zipfile.ZipFile(self.zipfile, mode='w', - compression=zipfile.ZIP_STORED) as storage: + with zipfile.ZipFile( + self.zipfile, mode="w", compression=zipfile.ZIP_STORED + ) as storage: for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") @@ -196,17 +202,18 @@ def teardown(self): class PeakmemZipStorageSuite: def setup(self): import zipfile + self.zipfile = NamedTemporaryFile() - with zipfile.ZipFile(self.zipfile, mode='w', - compression=zipfile.ZIP_STORED) as storage: + with zipfile.ZipFile( + self.zipfile, mode="w", compression=zipfile.ZIP_STORED + ) as storage: for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") # one big-ish entry storage.writestr("sig1", b"9" * 1_000_000) - def peakmem_load_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: for i in range(ZIP_STORAGE_LOAD): diff --git a/doc/conf.py b/doc/conf.py index fdd819b93a..43623fcfc5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # sourmash documentation build configuration file, created by # sphinx-quickstart on Sat Jun 4 16:35:43 2016. @@ -17,57 +16,59 @@ import os import sourmash -print('sourmash at:', sourmash) + +print("sourmash at:", sourmash) # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', -# 'sphinx.ext.napoleon', - 'nbsphinx', - 'IPython.sphinxext.ipython_console_highlighting', - 'myst_parser' + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.viewcode", + # 'sphinx.ext.napoleon', + "nbsphinx", + "IPython.sphinxext.ipython_console_highlighting", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'sourmash' -copyright = '2016-2023, C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward' -author = 'C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward' +project = "sourmash" +copyright = "2016-2023, C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward" +author = "C. Titus Brown, Luiz Irber, and N. 
Tessa Pierce-Ward" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. from importlib.metadata import version -release = version('sourmash') -version = '.'.join(release.split('.')[:2]) + +release = version("sourmash") +version = ".".join(release.split(".")[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -78,208 +79,208 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # CTB: suppress warnings about circularity in ToC. # see https://github.com/sphinx-doc/sphinx/issues/7410. -suppress_warnings = ['toc.circular'] +suppress_warnings = ["toc.circular"] # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { - 'logo': 'logo.png', - 'logo_name': True, - 'description': 'Quickly search, compare, and analyze genomic and metagenomic data sets', - 'sidebar_collapse': False, + "logo": "logo.png", + "logo_name": True, + "description": "Quickly search, compare, and analyze genomic and metagenomic data sets", + "sidebar_collapse": False, } # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. # " v documentation" by default. -#html_title = 'sourmash v1.0' +# html_title = 'sourmash v1.0' # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. 
-#html_logo = None +# html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not None, a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. # The empty string is equivalent to '%b %d, %Y'. -#html_last_updated_fmt = None +# html_last_updated_fmt = None # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = {'*': ['about.html', 'navigation.html', 'relations.html', - 'sourcelink.html', 'searchbox.html']} +html_sidebars = { + "*": [ + "about.html", + "navigation.html", + "relations.html", + "sourcelink.html", + "searchbox.html", + ] +} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # 'ja' uses this config value. # 'zh' user can custom change `jieba` dictionary path. -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
-htmlhelp_basename = 'sourmashdoc' +htmlhelp_basename = "sourmashdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'sourmash.tex', 'sourmash Documentation', - 'C. Titus Brown', 'manual'), + (master_doc, "sourmash.tex", "sourmash Documentation", "C. Titus Brown", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'sourmash', 'sourmash Documentation', - [author], 1) -] +man_pages = [(master_doc, "sourmash", "sourmash Documentation", [author], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -288,22 +289,28 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'sourmash', 'sourmash Documentation', - author, 'sourmash', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "sourmash", + "sourmash Documentation", + author, + "sourmash", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False autodoc_mock_imports = ["sourmash.minhash"] myst_heading_anchors = 3 diff --git a/pyproject.toml b/pyproject.toml index d1de447e72..3f2331b97c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,9 +100,11 @@ license = { text = "BSD 3-Clause License" } [project.optional-dependencies] test = [ "pytest>=6.2.4,<8.1.0", - "pytest-cov>=2.12,<5.0", - "pytest-xdist", + "pytest-cov>=4,<5.0", + "pytest-xdist>=3.1", "pyyaml>=6,<7", + "diff-cover>=7.3", + "covdefaults>=2.2.2", "recommonmark", "hypothesis", "build", @@ -155,6 +157,12 @@ macos-deployment-target = "10.14" [tool.maturin.target.aarch64-apple-darwin] macos-deployment-target = "11.0" +[tool.ruff.lint] +extend-select = [ + "UP", # pyupgrade +] +ignore = ["F401", "E712", "E402", "F821", "E722", "E741", "F811", "F403", "F822"] + [tool.isort] known_third_party = ["deprecation", "hypothesis", "mmh3", "numpy", "pytest", "screed", "sourmash_tst_utils"] multi_line_output = 3 @@ -212,3 +220,20 @@ testpaths = [ "tests", "doc", ] + +[tool.coverage] +html.show_contexts = true +html.skip_covered = false +paths.source = [ + "src", + ".tox*/*/lib/python*/site-packages", + ".tox*/pypy*/site-packages", + ".tox*\\*\\Lib\\site-packages", + "*/src", + "*\\src", + "*/tests", + "*\tests", +] +report.fail_under = 88 +run.parallel = true +run.plugins = ["covdefaults"] diff --git a/src/sourmash/__init__.py b/src/sourmash/__init__.py index 33170edcd8..53ee6e4803 100644 --- a/src/sourmash/__init__.py +++ b/src/sourmash/__init__.py @@ -18,17 +18,19 @@ class MinHash - hash sketch class from deprecation import deprecated from importlib.metadata import version -__all__ = ['MinHash', 'SourmashSignature', - 'load_one_signature', - 'SourmashSignature', - 'load_file_as_index', - 'load_file_as_signatures', - 'save_signatures', - 'create_sbt_index', - 'load_signatures', # deprecated - remove in 5.0 - 'load_sbt_index', # deprecated - remove in 5.0 - 'search_sbt_index', # deprecated - remove in 5.0 - ] +__all__ = [ + "MinHash", + "SourmashSignature", + "load_one_signature", + "SourmashSignature", + "load_file_as_index", + "load_file_as_signatures", + "save_signatures", + "create_sbt_index", + "load_signatures", # deprecated - remove in 5.0 + "load_sbt_index", # deprecated - remove in 5.0 + "search_sbt_index", # deprecated - remove in 5.0 +] from ._lowlevel import ffi, lib @@ -48,9 +50,13 @@ class MinHash - hash sketch class save_signatures, ) -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use load_file_as_signatures instead.') + +@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use load_file_as_signatures instead.", +) def load_signatures(*args, **kwargs): """Load a JSON string with signatures into classes. @@ -65,12 +71,17 @@ def load_signatures(*args, **kwargs): """ return load_signatures_private(*args, **kwargs) + from .sbtmh import load_sbt_index as load_sbt_index_private from .sbtmh import search_sbt_index as search_sbt_index_private -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use load_file_as_index instead.') + +@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use load_file_as_index instead.", +) def load_sbt_index(*args, **kwargs): """Load and return an SBT index. 
@@ -80,9 +91,12 @@ def load_sbt_index(*args, **kwargs): return load_sbt_index_private(*args, **kwargs) -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use the new Index API instead.') +@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use the new Index API instead.", +) def search_sbt_index(*args, **kwargs): """\ Search an SBT index `tree` with signature `query` for matches above @@ -98,6 +112,7 @@ def search_sbt_index(*args, **kwargs): """ return search_sbt_index_private(*args, **kwargs) + from .sbtmh import create_sbt_index from . import lca from . import tax diff --git a/src/sourmash/__main__.py b/src/sourmash/__main__.py index 74fdf270c0..a8c70878fa 100644 --- a/src/sourmash/__main__.py +++ b/src/sourmash/__main__.py @@ -7,18 +7,19 @@ def main(arglist=None): import sourmash + args = sourmash.cli.parse_args(arglist) - if hasattr(args, 'subcmd'): + if hasattr(args, "subcmd"): mod = getattr(sourmash.cli, args.cmd) submod = getattr(mod, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") else: mod = getattr(sourmash.cli, args.cmd) - mainmethod = getattr(mod, 'main') + mainmethod = getattr(mod, "main") retval = mainmethod(args) raise SystemExit(retval) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/sourmash/cli/__init__.py b/src/sourmash/cli/__init__.py index 575bbdb0f5..a02487f4fd 100644 --- a/src/sourmash/cli/__init__.py +++ b/src/sourmash/cli/__init__.py @@ -45,7 +45,7 @@ class SourmashParser(ArgumentParser): _citation_printed = False def __init__(self, citation=True, **kwargs): - super(SourmashParser, self).__init__(**kwargs) + super().__init__(**kwargs) self.citation = citation @classmethod @@ -53,6 +53,7 @@ def print_citation(cls): if cls._citation_printed: return from sourmash.logging import notify + notify(f"\n== This is sourmash version {sourmash.VERSION}. ==") notify("== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. 
==\n") cls._citation_printed = True @@ -70,53 +71,56 @@ def _subparser_from_name(self, name): def print_help(self): self.print_citation() - super(SourmashParser, self).print_help() - + super().print_help() def parse_args(self, args=None, namespace=None): - if (args is None and len(sys.argv) == 1) or (args is not None and len(args) == 0): + if (args is None and len(sys.argv) == 1) or ( + args is not None and len(args) == 0 + ): self.print_help() raise SystemExit(1) - args = super(SourmashParser, self).parse_args(args=args, namespace=namespace) - if ('quiet' not in args or not args.quiet) and self.citation: + args = super().parse_args(args=args, namespace=namespace) + if ("quiet" not in args or not args.quiet) and self.citation: self.print_citation() - if 'subcmd' in args and args.subcmd is None: + if "subcmd" in args and args.subcmd is None: self._subparser_from_name(args.cmd).print_help() raise SystemExit(1) # BEGIN: dirty hacks to simultaneously support new and previous interface - if hasattr(args, 'subcmd') and args.subcmd == 'import': - args.subcmd = 'ingest' + if hasattr(args, "subcmd") and args.subcmd == "import": + args.subcmd = "ingest" # END: dirty hacks to simultaneously support new and previous interface return args def get_parser(): module_descs = { - 'tax': 'Integrate taxonomy information based on "gather" results', - 'lca': 'Taxonomic operations', - 'sketch': 'Create signatures', - 'sig': 'Manipulate signature files', - 'storage': 'Operations on storage', - 'scripts': "Plug-ins", + "tax": 'Integrate taxonomy information based on "gather" results', + "lca": "Taxonomic operations", + "sketch": "Create signatures", + "sig": "Manipulate signature files", + "storage": "Operations on storage", + "scripts": "Plug-ins", } alias = { "sig": "signature", "ext": "scripts", } - expert = set(['categorize', 'import_csv', 'migrate', 'multigather', 'sbt_combine', 'watch']) + expert = set( + ["categorize", "import_csv", "migrate", "multigather", "sbt_combine", "watch"] + ) clidir = os.path.dirname(__file__) basic_ops = utils.command_list(clidir) # provide a list of the basic operations - not expert, not submodules. user_ops = [op for op in basic_ops if op not in expert and op not in module_descs] - usage = ' Basic operations\n' + usage = " Basic operations\n" for op in user_ops: docstring = getattr(sys.modules[__name__], op).__doc__ - helpstring = 'sourmash {op:s} --help'.format(op=op) - usage += ' {hs:25s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash {op:s} --help" + usage += f" {helpstring:25s} {docstring:s}\n" # next, all the subcommand ones - dive into subdirectories. 
cmd_group_dirs = next(os.walk(clidir))[1] cmd_group_dirs = filter(utils.opfilter, cmd_group_dirs) @@ -124,18 +128,33 @@ def get_parser(): cmd_group_usage = [cmd for cmd in cmd_group_dirs if cmd not in alias.values()] for dirpath in cmd_group_usage: - usage += '\n ' + module_descs[dirpath] + '\n' - usage += ' sourmash {gd:s} --help\n'.format(gd=dirpath) + usage += "\n " + module_descs[dirpath] + "\n" + usage += f" sourmash {dirpath:s} --help\n" if dirpath in alias: - usage += ' sourmash {gd:s} --help\n'.format(gd=alias[dirpath]) + usage += f" sourmash {alias[dirpath]:s} --help\n" - desc = 'Create, compare, and manipulate k-mer sketches of biological sequences.\n\nUsage instructions:\n' + usage - parser = SourmashParser(prog='sourmash', description=desc, formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - parser._optionals.title = 'Options' - parser.add_argument('-v', '--version', action='version', version='sourmash '+ sourmash.VERSION) - parser.add_argument('-q', '--quiet', action='store_true', help='don\'t print citation information') + desc = ( + "Create, compare, and manipulate k-mer sketches of biological sequences.\n\nUsage instructions:\n" + + usage + ) + parser = SourmashParser( + prog="sourmash", + description=desc, + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + ) + parser._optionals.title = "Options" + parser.add_argument( + "-v", "--version", action="version", version="sourmash " + sourmash.VERSION + ) + parser.add_argument( + "-q", "--quiet", action="store_true", help="don't print citation information" + ) sub = parser.add_subparsers( - title='Instructions', dest='cmd', metavar='cmd', help=SUPPRESS, + title="Instructions", + dest="cmd", + metavar="cmd", + help=SUPPRESS, ) for op in basic_ops + cmd_group_dirs: getattr(sys.modules[__name__], op).subparser(sub) diff --git a/src/sourmash/cli/categorize.py b/src/sourmash/cli/categorize.py index e3c41ec773..0c8002e224 100644 --- a/src/sourmash/cli/categorize.py +++ b/src/sourmash/cli/categorize.py @@ -4,32 +4,36 @@ def subparser(subparsers): - subparser = subparsers.add_parser('categorize') - subparser.add_argument('database', help='location of signature collection/database to load') + subparser = subparsers.add_parser("categorize") subparser.add_argument( - 'queries', nargs='+', - help='locations of signatures to categorize' + "database", help="location of signature collection/database to load" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "queries", nargs="+", help="locations of signatures to categorize" + ) + subparser.add_argument( + "-q", "--quiet", action="store_true", help="suppress non-error output" ) add_ksize_arg(subparser) subparser.add_argument( - '--threshold', default=0.08, type=float, - help='minimum threshold for reporting matches; default=0.08' + "--threshold", + default=0.08, + type=float, + help="minimum threshold for reporting matches; default=0.08", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present", ) add_moltype_args(subparser) # TODO: help messages in these - subparser.add_argument('--csv', help='output summary CSV to this file') - subparser.add_argument('--load-csv', default=None) + subparser.add_argument("--csv", help="output summary CSV to this file") + subparser.add_argument("--load-csv", default=None) def main(args): import sourmash + return 
sourmash.commands.categorize(args) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index 54864d6c93..74da5bd837 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -1,6 +1,6 @@ """create a similarity matrix comparing many samples""" -usage=""" +usage = """ The `compare` subcommand compares one or more signatures (created with `sketch`) using estimated Jaccard index [1] or (if signatures are @@ -27,69 +27,91 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_pattern_args, - add_scaled_arg) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_pattern_args, + add_scaled_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('compare', description=__doc__, usage=usage) + subparser = subparsers.add_parser("compare", description=__doc__, usage=usage) subparser.add_argument( - 'signatures', nargs='*', help='list of signatures to compare', - default=[] + "signatures", nargs="*", help="list of signatures to compare", default=[] ) subparser.add_argument( - '-q', '--quiet', action='store_true', help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='F', - help='file to which output will be written; default is terminal ' - '(standard output)' + "-o", + "--output", + metavar="F", + help="file to which output will be written; default is terminal " + "(standard output)", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances even if present' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances even if present", ) subparser.add_argument( - '--containment', action='store_true', - help='calculate containment instead of similarity' + "--containment", + action="store_true", + help="calculate containment instead of similarity", ) subparser.add_argument( - '--max-containment', action='store_true', - help='calculate max containment instead of similarity' + "--max-containment", + action="store_true", + help="calculate max containment instead of similarity", ) subparser.add_argument( - '--avg-containment', '--average-containment', action='store_true', - help='calculate average containment instead of similarity' + "--avg-containment", + "--average-containment", + action="store_true", + help="calculate average containment instead of similarity", ) subparser.add_argument( - '--estimate-ani', '--ANI', '--ani', action='store_true', - help='return ANI estimated from jaccard, containment, average containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870' + "--estimate-ani", + "--ANI", + "--ani", + action="store_true", + help="return ANI estimated from jaccard, containment, average containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='continue past errors in file loading' + "-f", + "--force", + action="store_true", + help="continue past errors in file loading", ) subparser.add_argument( - '--csv', metavar='F', - help='write matrix to specified file in CSV format (with column ' - 'headers)' + "--csv", + metavar="F", + help="write matrix to specified file in CSV 
format (with column " "headers)", ) subparser.add_argument( - '-p', '--processes', metavar='N', type=int, default=None, - help='Number of processes to use to calculate similarity') + "-p", + "--processes", + metavar="N", + type=int, + default=None, + help="Number of processes to use to calculate similarity", + ) subparser.add_argument( - '--distance-matrix', action='store_true', - help='output a distance matrix, instead of a similarity matrix' + "--distance-matrix", + action="store_true", + help="output a distance matrix, instead of a similarity matrix", ) subparser.add_argument( - '--similarity-matrix', action='store_false', - dest='distance_matrix', - help='output a similarity matrix; this is the default', + "--similarity-matrix", + action="store_false", + dest="distance_matrix", + help="output a similarity matrix; this is the default", ) add_ksize_arg(subparser) @@ -101,4 +123,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.compare(args) diff --git a/src/sourmash/cli/compute.py b/src/sourmash/cli/compute.py index 7b3b48d20d..cfdb48f42a 100644 --- a/src/sourmash/cli/compute.py +++ b/src/sourmash/cli/compute.py @@ -1,6 +1,6 @@ """compute sequence signatures for inputs""" -usage=""" +usage = """ ** WARNING: the sourmash compute command is DEPRECATED as of 4.0 and ** will be removed in 5.0. Please see the 'sourmash sketch' command instead. @@ -35,8 +35,8 @@ def ksize_parser(ksizes): # get list of k-mer sizes for which to compute sketches - if ',' in ksizes: - ksizes = ksizes.split(',') + if "," in ksizes: + ksizes = ksizes.split(",") ksizes = list(map(int, ksizes)) else: ksizes = [int(ksizes)] @@ -45,81 +45,98 @@ def ksize_parser(ksizes): def subparser(subparsers): - subparser = subparsers.add_parser('compute', description=__doc__, usage=usage) + subparser = subparsers.add_parser("compute", description=__doc__, usage=usage) - sketch_args = subparser.add_argument_group('Sketching options') + sketch_args = subparser.add_argument_group("Sketching options") sketch_args.add_argument( - '-k', '--ksizes', default='21,31,51', + "-k", + "--ksizes", + default="21,31,51", type=ksize_parser, - help='comma-separated list of k-mer sizes; default=%(default)s' + help="comma-separated list of k-mer sizes; default=%(default)s", ) sketch_args.add_argument( - '--track-abundance', action='store_true', - help='track k-mer abundances in the generated signature' + "--track-abundance", + action="store_true", + help="track k-mer abundances in the generated signature", ) sketch_args.add_argument( - '--scaled', type=float, default=0, - help='choose number of hashes as 1 in FRACTION of input k-mers' + "--scaled", + type=float, + default=0, + help="choose number of hashes as 1 in FRACTION of input k-mers", ) add_construct_moltype_args(sketch_args) sketch_args.add_argument( - '--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed.' 
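The `--containment`, `--max-containment`, and `--avg-containment` flags above switch `compare` away from Jaccard similarity. A rough sketch of the corresponding comparisons at the Python level, assuming the sourmash 4.x `MinHash` API (`jaccard`, `contained_by`, `max_containment`); the sequences are toy placeholders:

```python
from sourmash import MinHash

# Two FracMinHash sketches; scaled=1 keeps every k-mer hash so this toy
# example is deterministic. Real sketches typically use scaled=1000 or more.
mh1 = MinHash(n=0, ksize=31, scaled=1)
mh2 = MinHash(n=0, ksize=31, scaled=1)

mh1.add_sequence("ATGGCAGTGACGATGCCGATTGTACGTACGTTCGGAACCGTG")
# mh2 holds a superset of mh1's k-mers, so containment differs by direction.
mh2.add_sequence("ATGGCAGTGACGATGCCGATTGTACGTACGTTCGGAACCGTGAAATTTCCGGT")

print("jaccard:         ", mh1.jaccard(mh2))          # symmetric, size-sensitive
print("containment:     ", mh1.contained_by(mh2))     # directional: mh1 in mh2
print("max containment: ", mh1.max_containment(mh2))  # max over both directions
```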
+ "--input-is-protein", + action="store_true", + help="Consume protein sequences - no translation needed.", ) sketch_args.add_argument( - '--seed', type=int, default=get_minhash_default_seed(), - help='seed used by MurmurHash; default=%(default)i' + "--seed", + type=int, + default=get_minhash_default_seed(), + help="seed used by MurmurHash; default=%(default)i", ) - file_args = subparser.add_argument_group('File handling options') + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) subparser.add_argument( - '-q', '--quiet', action='store_true', help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid' + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid", ) subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) - subparser.add_argument( - 'filenames', nargs='+', help='file(s) of sequences' - ) - subparser._positionals.title = 'Required arguments' - subparser._optionals.title = 'Miscellaneous options' + subparser.add_argument("filenames", nargs="+", help="file(s) of sequences") + subparser._positionals.title = "Required arguments" + subparser._optionals.title = "Miscellaneous options" add_num_arg(sketch_args, 500) @@ -127,8 +144,10 @@ def main(args): from sourmash.command_compute import compute from sourmash.logging import notify - notify("""\ + notify( + """\ ** WARNING: the sourmash compute command is DEPRECATED as of 4.0 and ** will be removed in 5.0. Please see the 'sourmash sketch' command instead. 
-""") +""" + ) return compute(args) diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 0b0115efd2..88860a50cd 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -1,6 +1,6 @@ """search a metagenome signature against dbs""" -usage=""" +usage = """ The `gather` subcommand selects the best reference genomes to use for a metagenome analysis, by finding the smallest set of non-overlapping @@ -62,103 +62,133 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('gather', description=__doc__, usage=usage) - subparser.add_argument('query', help='query signature') + subparser = subparsers.add_parser("gather", description=__doc__, usage=usage) + subparser.add_argument("query", help="query signature") subparser.add_argument( - 'databases', nargs='+', - help='signatures/SBTs to search', + "databases", + nargs="+", + help="signatures/SBTs to search", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-d', '--debug', action='store_true' + "-n", + "--num-results", + default=None, + type=int, + metavar="N", + help="number of results to report (default: terminate at --threshold-bp)", ) subparser.add_argument( - '-n', '--num-results', default=None, type=int, metavar='N', - help='number of results to report (default: terminate at --threshold-bp)' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "--save-matches", + metavar="FILE", + help="save gather matched signatures from the database to the " + "specified file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='save gather matched signatures from the database to the ' - 'specified file' + "--save-prefetch", + metavar="FILE", + help="save all prefetch-matched signatures from the databases to the " + "specified file or directory", ) subparser.add_argument( - '--save-prefetch', metavar='FILE', - help='save all prefetch-matched signatures from the databases to the ' - 'specified file or directory' + "--save-prefetch-csv", + metavar="FILE", + help="save a csv with information from all prefetch-matched signatures " + "to the specified file", ) subparser.add_argument( - '--save-prefetch-csv', metavar='FILE', - help='save a csv with information from all prefetch-matched signatures ' - 'to the specified file' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)' + "--output-unassigned", + metavar="FILE", + help="output unassigned portions of the query as a signature to the " + "specified file", ) subparser.add_argument( - '--output-unassigned', metavar='FILE', - help='output unassigned portions of the query as a signature to the ' - 'specified file' + "--ignore-abundance", + 
action="store_true", + help="do NOT use k-mer abundances if present", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' - ) - subparser.add_argument( - '--cache-size', default=0, type=int, metavar='N', - help='number of internal SBT nodes to cache in memory (default: 0, cache all nodes)' + "--cache-size", + default=0, + type=int, + metavar="N", + help="number of internal SBT nodes to cache in memory (default: 0, cache all nodes)", ) # advanced parameters subparser.add_argument( - '--linear', dest="linear", action='store_true', + "--linear", + dest="linear", + action="store_true", help="force a low-memory but maybe slower database search", ) subparser.add_argument( - '--no-linear', dest="linear", action='store_false', + "--no-linear", + dest="linear", + action="store_false", ) subparser.add_argument( - '--no-prefetch', dest="prefetch", action='store_false', + "--no-prefetch", + dest="prefetch", + action="store_false", help="do not use prefetch before gather; see documentation", ) subparser.add_argument( - '--prefetch', dest="prefetch", action='store_true', + "--prefetch", + dest="prefetch", + action="store_true", help="use prefetch before gather; see documentation", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) subparser.add_argument( - '--create-empty-results', action='store_true', - help='create an empty results file even if no matches.' 
+ "--create-empty-results", + action="store_true", + help="create an empty results file even if no matches.", ) add_ksize_arg(subparser) @@ -170,4 +200,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.gather(args) diff --git a/src/sourmash/cli/import_csv.py b/src/sourmash/cli/import_csv.py index 77fcbd14f8..6e0964678a 100644 --- a/src/sourmash/cli/import_csv.py +++ b/src/sourmash/cli/import_csv.py @@ -4,17 +4,19 @@ def subparser(subparsers): - subparser = subparsers.add_parser('import_csv') - subparser.add_argument('mash_csvfile', help='CSV file with mash sketches') + subparser = subparsers.add_parser("import_csv") + subparser.add_argument("mash_csvfile", help="CSV file with mash sketches") subparser.add_argument( - '-o', '--output', - help='save signature generated from data to this file (default stdout)' + "-o", + "--output", + help="save signature generated from data to this file (default stdout)", ) def main(args): import sourmash + notify("** WARNING: 'import_csv' is deprecated as of sourmash 4.0, and will") notify("** be removed in sourmash 5.0; use 'sourmash sig import --csv' instead.") - notify('') + notify("") return sourmash.commands.import_csv(args) diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py index dcd8572ca0..4fb0fc7ab8 100644 --- a/src/sourmash/cli/index.py +++ b/src/sourmash/cli/index.py @@ -1,6 +1,6 @@ """index signatures for rapid search""" -usage=""" +usage = """ sourmash index -k 31 dbname *.sig @@ -25,46 +25,63 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('index', description=__doc__, - usage=usage) - subparser.add_argument('sbt_name', help='name to save index into; .sbt.zip or .sbt.json file') + subparser = subparsers.add_parser("index", description=__doc__, usage=usage) subparser.add_argument( - 'signatures', nargs='*', - help='signatures to load into SBT' + "sbt_name", help="name to save index into; .sbt.zip or .sbt.json file" ) + subparser.add_argument("signatures", nargs="*", help="signatures to load into SBT") subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--n_children', metavar='D', type=int, default=2, - help='number of children for internal nodes; default=2' + "-d", + "--n_children", + metavar="D", + type=int, + default=2, + help="number of children for internal nodes; default=2", ) subparser.add_argument( - '--append', action='store_true', default=False, - help='add signatures to an existing SBT' + "--append", + action="store_true", + default=False, + help="add signatures to an existing SBT", ) subparser.add_argument( - '-x', '--bf-size', metavar='S', type=float, default=1e5, - help='Bloom filter size used for internal nodes' + "-x", + "--bf-size", + metavar="S", + type=float, + default=1e5, + help="Bloom filter size used for internal nodes", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try loading *all* files in provided subdirectories, not just .sig files"' + "-f", + 
"--force", + action="store_true", + help='try loading *all* files in provided subdirectories, not just .sig files"', ) subparser.add_argument( - '-s', '--sparseness', metavar='FLOAT', type=float, default=.0, - help='What percentage of internal nodes will not be saved; ranges ' - 'from 0.0 (save all nodes) to 1.0 (no nodes saved)' + "-s", + "--sparseness", + metavar="FLOAT", + type=float, + default=0.0, + help="What percentage of internal nodes will not be saved; ranges " + "from 0.0 (save all nodes) to 1.0 (no nodes saved)", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -74,4 +91,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.index(args) diff --git a/src/sourmash/cli/info.py b/src/sourmash/cli/info.py index b607112b7c..5d79790389 100644 --- a/src/sourmash/cli/info.py +++ b/src/sourmash/cli/info.py @@ -6,26 +6,29 @@ from sourmash.logging import notify from sourmash.plugins import list_all_plugins + def subparser(subparsers): - subparser = subparsers.add_parser('info') + subparser = subparsers.add_parser("info") subparser.add_argument( - '-v', '--verbose', action='store_true', - help='report versions of khmer and screed' + "-v", + "--verbose", + action="store_true", + help="report versions of khmer and screed", ) def info(verbose=False): "Report sourmash version + version of installed dependencies." - notify(f'sourmash version {sourmash.VERSION}') - notify(f'- loaded from path: {os.path.dirname(__file__)}') - notify('') + notify(f"sourmash version {sourmash.VERSION}") + notify(f"- loaded from path: {os.path.dirname(__file__)}") + notify("") if verbose: - notify('khmer version: None (internal Nodegraph)') - notify('') + notify("khmer version: None (internal Nodegraph)") + notify("") - notify(f'screed version {screed.__version__}') - notify(f'- loaded from path: {os.path.dirname(screed.__file__)}') + notify(f"screed version {screed.__version__}") + notify(f"- loaded from path: {os.path.dirname(screed.__file__)}") list_all_plugins() diff --git a/src/sourmash/cli/lca/__init__.py b/src/sourmash/cli/lca/__init__.py index a403876d02..6fbb73619c 100644 --- a/src/sourmash/cli/lca/__init__.py +++ b/src/sourmash/cli/lca/__init__.py @@ -16,19 +16,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('lca', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "lca", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash lca {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash lca {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Taxonomic utilities', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Taxonomic utilities", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/lca/classify.py b/src/sourmash/cli/lca/classify.py index 7efe112bd8..55c6134f07 100644 --- a/src/sourmash/cli/lca/classify.py +++ b/src/sourmash/cli/lca/classify.py @@ -2,34 +2,49 @@ def subparser(subparsers): - 
subparser = subparsers.add_parser('classify') - subparser.add_argument('--db', nargs='+', action='append', - help='databases to use to classify') - subparser.add_argument('--query', nargs='*', default=[], action='append', - help='query signatures to classify') - subparser.add_argument('--query-from-file', - help='file containing list of signature files to query') - subparser.add_argument('--threshold', metavar='T', type=int, default=5, - help="minimum number of hashes needed for a taxonomic classification (default: 5)") + subparser = subparsers.add_parser("classify") subparser.add_argument( - '--majority', action='store_true', - help='use majority vote classification instead of lca' + "--db", nargs="+", action="append", help="databases to use to classify" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--query", + nargs="*", + default=[], + action="append", + help="query signatures to classify", ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "--query-from-file", help="file containing list of signature files to query" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output CSV to the specified file; by default output to stdout' + "--threshold", + metavar="T", + type=int, + default=5, + help="minimum number of hashes needed for a taxonomic classification (default: 5)", ) - subparser.add_argument('--scaled', type=float) + subparser.add_argument( + "--majority", + action="store_true", + help="use majority vote classification instead of lca", + ) + subparser.add_argument( + "-q", "--quiet", action="store_true", help="suppress non-error output" + ) + subparser.add_argument( + "-d", "--debug", action="store_true", help="output debugging output" + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + default="-", + help="output CSV to the specified file; by default output to stdout", + ) + subparser.add_argument("--scaled", type=float) def main(args): import sourmash + return sourmash.lca.command_classify.classify(args) diff --git a/src/sourmash/cli/lca/compare_csv.py b/src/sourmash/cli/lca/compare_csv.py index 1f62fe4aa0..6732940325 100644 --- a/src/sourmash/cli/lca/compare_csv.py +++ b/src/sourmash/cli/lca/compare_csv.py @@ -1,35 +1,41 @@ """compare spreadsheets""" + def subparser(subparsers): # Dirty hack to simultaneously support new and previous interface # If desired, this function can be removed with a major version bump. 
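To illustrate the `lca classify` options just defined: `--db` and `--query` accept multiple values and may be repeated, and `--threshold` defaults to 5 hashes. A hedged sketch with placeholder database and signature names:

```python
import subprocess

# Classify two query signatures against one LCA database; results go to CSV
# (default is stdout via -o '-').
subprocess.run(
    [
        "sourmash", "lca", "classify",
        "--db", "refs.lca.json",
        "--query", "genome1.sig", "genome2.sig",
        "-o", "classifications.csv",
    ],
    check=True,
)
```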
- for cmd in ('compare', 'compare_csv'): + for cmd in ("compare", "compare_csv"): subparser = subparsers.add_parser(cmd) - subparser.add_argument('csv1', help='taxonomy spreadsheet output by classify') - subparser.add_argument('csv2', help='custom taxonomy spreadsheet') + subparser.add_argument("csv1", help="taxonomy spreadsheet output by classify") + subparser.add_argument("csv2", help="custom taxonomy spreadsheet") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) subparser.add_argument( - '-C', '--start-column', metavar='C', default=2, type=int, - help='column at which taxonomic assignments start; default=2' + "-C", + "--start-column", + metavar="C", + default=2, + type=int, + help="column at which taxonomic assignments start; default=2", ) subparser.add_argument( - '--tabs', action='store_true', - help='input spreadsheet is tab-delimited; default is commas' + "--tabs", + action="store_true", + help="input spreadsheet is tab-delimited; default is commas", ) subparser.add_argument( - '--no-headers', action='store_true', - help='no headers present in taxonomy spreadsheet' + "--no-headers", + action="store_true", + help="no headers present in taxonomy spreadsheet", ) - subparser.add_argument('-f', '--force', action='store_true') + subparser.add_argument("-f", "--force", action="store_true") def main(args): import sourmash + return sourmash.lca.command_compare_csv.compare_csv(args) diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index 3e1e456273..afc0702e9f 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -1,69 +1,74 @@ """create LCA database""" -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) +from sourmash.cli.utils import add_ksize_arg, add_moltype_args, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('index') - subparser.add_argument('csv', help='taxonomy spreadsheet') - subparser.add_argument('lca_db_out', help='output database name') + subparser = subparsers.add_parser("index") + subparser.add_argument("csv", help="taxonomy spreadsheet") + subparser.add_argument("lca_db_out", help="output database name") subparser.add_argument( - 'signatures', nargs='*', - help='signatures or directory of signatures to index (optional if provided via --from-file)' + "signatures", + nargs="*", + help="signatures or directory of signatures to index (optional if provided via --from-file)", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) + subparser.add_argument("--scaled", metavar="S", default=10000, type=float) subparser.add_argument( - '--scaled', metavar='S', default=10000, type=float + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-d", "--debug", action="store_true", help="output debugging output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-C", + "--start-column", + metavar="C", + default=2, + type=int, + help="column at which taxonomic 
assignments start; default=2", ) subparser.add_argument( - '-C', '--start-column', metavar='C', default=2, type=int, - help='column at which taxonomic assignments start; default=2' + "--tabs", + action="store_true", + help="input spreadsheet is tab-delimited; default is commas", ) subparser.add_argument( - '--tabs', action='store_true', - help='input spreadsheet is tab-delimited; default is commas' + "--no-headers", + action="store_true", + help="no headers present in taxonomy spreadsheet", ) subparser.add_argument( - '--no-headers', action='store_true', - help='no headers present in taxonomy spreadsheet' + "--split-identifiers", + action="store_true", + help="split names in signatures on whitespace", ) subparser.add_argument( - '--split-identifiers', action='store_true', - help='split names in signatures on whitespace' + "--keep-identifier-versions", + action="store_true", + help="do not remove accession versions", ) + subparser.add_argument("-f", "--force", action="store_true") + subparser.add_argument("--report", help="output a report on anomalies, if any") subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='do not remove accession versions' + "--require-taxonomy", + action="store_true", + help="ignore signatures with no taxonomy entry", ) - subparser.add_argument('-f', '--force', action='store_true') subparser.add_argument( - '--report', help='output a report on anomalies, if any' + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '--require-taxonomy', action='store_true', - help='ignore signatures with no taxonomy entry' - ) - subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', - ) - subparser.add_argument( - '-F', '--database-format', + "-F", + "--database-format", help="format of output database; default is 'json')", - default='json', - choices=['json', 'sql'], + default="json", + choices=["json", "sql"], ) add_ksize_arg(subparser, default=31) @@ -73,4 +78,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.lca.command_index.index(args) diff --git a/src/sourmash/cli/lca/rankinfo.py b/src/sourmash/cli/lca/rankinfo.py index 6108dcdf4f..5d89612942 100644 --- a/src/sourmash/cli/lca/rankinfo.py +++ b/src/sourmash/cli/lca/rankinfo.py @@ -1,23 +1,25 @@ """database rank info""" + def subparser(subparsers): - subparser = subparsers.add_parser('rankinfo') - subparser.add_argument('db', nargs='+') + subparser = subparsers.add_parser("rankinfo") + subparser.add_argument("db", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) - subparser.add_argument('--scaled', metavar='FLOAT', type=float) + subparser.add_argument("--scaled", metavar="FLOAT", type=float) subparser.add_argument( - '--minimum-num', type=int, default=0, - help='Minimum number of different lineages a k-mer must be in to be counted' + "--minimum-num", + type=int, + default=0, + help="Minimum number of different lineages a k-mer must be in to be counted", ) def main(args): import sourmash + return sourmash.lca.command_rankinfo.rankinfo_main(args) diff --git 
a/src/sourmash/cli/lca/summarize.py b/src/sourmash/cli/lca/summarize.py index a3a8809e73..d9411a7f5b 100644 --- a/src/sourmash/cli/lca/summarize.py +++ b/src/sourmash/cli/lca/summarize.py @@ -2,35 +2,52 @@ def subparser(subparsers): - subparser = subparsers.add_parser('summarize') - subparser.add_argument('--db', nargs='+', action='append', - help='one or more LCA databases to use') - subparser.add_argument('--query', nargs='*', default=[], action='append', - help='one or more signature files to use as queries') - subparser.add_argument('--query-from-file', - help='file containing list of signature files to query') - subparser.add_argument('--threshold', metavar='T', type=int, default=5, - help='minimum number of hashes to require for a match') - subparser.add_argument( - '-o', '--output', metavar='FILE', - help='file to which CSV output will be written' - ) - subparser.add_argument('--scaled', metavar='FLOAT', type=float, - help='scaled value to downsample to') + subparser = subparsers.add_parser("summarize") + subparser.add_argument( + "--db", nargs="+", action="append", help="one or more LCA databases to use" + ) + subparser.add_argument( + "--query", + nargs="*", + default=[], + action="append", + help="one or more signature files to use as queries", + ) + subparser.add_argument( + "--query-from-file", help="file containing list of signature files to query" + ) + subparser.add_argument( + "--threshold", + metavar="T", + type=int, + default=5, + help="minimum number of hashes to require for a match", + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + help="file to which CSV output will be written", + ) + subparser.add_argument( + "--scaled", metavar="FLOAT", type=float, help="scaled value to downsample to" + ) - subparser.add_argument('--ignore-abundance', action='store_true', - help='ignore hash abundances in query signatures do not weight results') + subparser.add_argument( + "--ignore-abundance", + action="store_true", + help="ignore hash abundances in query signatures do not weight results", + ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) def main(args): import sourmash + return sourmash.lca.command_summarize.summarize_main(args) diff --git a/src/sourmash/cli/migrate.py b/src/sourmash/cli/migrate.py index fc5ebe1560..b4140c5afe 100644 --- a/src/sourmash/cli/migrate.py +++ b/src/sourmash/cli/migrate.py @@ -1,10 +1,12 @@ "'sourmash migrate' - migrate an SBT database to the latest version." + def subparser(subparsers): - subparser = subparsers.add_parser('migrate') - subparser.add_argument('sbt_name', help='name to save SBT into') + subparser = subparsers.add_parser("migrate") + subparser.add_argument("sbt_name", help="name to save SBT into") def main(args): import sourmash + return sourmash.commands.migrate(args) diff --git a/src/sourmash/cli/multigather.py b/src/sourmash/cli/multigather.py index cf20a32cd2..15f7f1fc71 100644 --- a/src/sourmash/cli/multigather.py +++ b/src/sourmash/cli/multigather.py @@ -1,6 +1,6 @@ "'sourmash multigather' - gather many signatures against multiple databases." -usage=""" +usage = """ The `multigather` subcommand runs 'gather' for multiple query sequences against the same collection of sequences. 
The main use for multigather @@ -40,52 +40,57 @@ def subparser(subparsers): - subparser = subparsers.add_parser('multigather') + subparser = subparsers.add_parser("multigather") subparser.add_argument( - '--query', nargs='*', default=[], action='append', - help='query signature' + "--query", nargs="*", default=[], action="append", help="query signature" ) subparser.add_argument( - '--query-from-file', - help='file containing list of signature files to query' + "--query-from-file", help="file containing list of signature files to query" ) subparser.add_argument( - '--db', nargs='+', action='append', - help='signatures/SBTs to search', + "--db", + nargs="+", + action="append", + help="signatures/SBTs to search", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-d', '--debug', action='store_true' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="threshold (in bp) for reporting results (default=50,000)", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='threshold (in bp) for reporting results (default=50,000)' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' - ) - subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) subparser.add_argument( - '--output-dir', '--outdir', - help='output CSV results to this directory', + "--output-dir", + "--outdir", + help="output CSV results to this directory", ) add_ksize_arg(subparser) @@ -95,4 +100,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.multigather(args) diff --git a/src/sourmash/cli/plot.py b/src/sourmash/cli/plot.py index a548683c39..718a5c8528 100644 --- a/src/sourmash/cli/plot.py +++ b/src/sourmash/cli/plot.py @@ -1,64 +1,80 @@ """plot distance matrix made by 'compare'""" + def subparser(subparsers): - subparser = subparsers.add_parser('plot') - subparser.add_argument( - 'distances', help='output from "sourmash compare"' - ) + subparser = subparsers.add_parser("plot") + subparser.add_argument("distances", help='output from "sourmash compare"') subparser.add_argument( - '--pdf', action='store_true', - help='output PDF; default is PNG' + "--pdf", action="store_true", help="output PDF; default is PNG" ) subparser.add_argument( - '--labels', action='store_true', default=None, - help='show sample labels on dendrogram/matrix' + 
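The `multigather` options above amount to one `gather` run per query signature, with per-query CSVs written under `--output-dir`. A minimal sketch with placeholder filenames, using only flags from this diff:

```python
import subprocess

# Run gather for each query against the same database collection.
subprocess.run(
    [
        "sourmash", "multigather",
        "--query", "sample1.sig", "sample2.sig",
        "--db", "refs.zip",
        "--output-dir", "multigather-out",
    ],
    check=True,
)
```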
"--labels", + action="store_true", + default=None, + help="show sample labels on dendrogram/matrix", ) subparser.add_argument( - '--no-labels', action='store_false', dest='labels', - help='do not show sample labels' + "--no-labels", + action="store_false", + dest="labels", + help="do not show sample labels", ) subparser.add_argument( - '--labeltext', - help='filename containing list of labels (overrides signature names); implies --labels' + "--labeltext", + help="filename containing list of labels (overrides signature names); implies --labels", ) subparser.add_argument( - '--indices', action='store_true', default=None, - help='show sample indices but not labels; overridden by --labels' + "--indices", + action="store_true", + default=None, + help="show sample indices but not labels; overridden by --labels", ) subparser.add_argument( - '--no-indices', action='store_false', dest='indices', - help='do not show sample indices' + "--no-indices", + action="store_false", + dest="indices", + help="do not show sample indices", ) subparser.add_argument( - '--vmin', default=0.0, type=float, - help='lower limit of heatmap scale; default=%(default)f' + "--vmin", + default=0.0, + type=float, + help="lower limit of heatmap scale; default=%(default)f", ) subparser.add_argument( - '--vmax', default=1.0, type=float, - help='upper limit of heatmap scale; default=%(default)f' + "--vmax", + default=1.0, + type=float, + help="upper limit of heatmap scale; default=%(default)f", ) subparser.add_argument( - '--subsample', type=int, metavar='N', - help='randomly downsample to this many samples, max' + "--subsample", + type=int, + metavar="N", + help="randomly downsample to this many samples, max", ) subparser.add_argument( - '--subsample-seed', type=int, default=1, metavar='S', - help='random seed for --subsample; default=1' + "--subsample-seed", + type=int, + default=1, + metavar="S", + help="random seed for --subsample; default=1", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='forcibly plot non-distance matrices' + "-f", "--force", action="store_true", help="forcibly plot non-distance matrices" ) subparser.add_argument( - '--output-dir', metavar='DIR', help='directory for output plots' + "--output-dir", metavar="DIR", help="directory for output plots" ) subparser.add_argument( - '--csv', metavar='F', - help='write clustered matrix and labels out in CSV format (with column' - ' headers) to this file' + "--csv", + metavar="F", + help="write clustered matrix and labels out in CSV format (with column" + " headers) to this file", ) def main(args): import sourmash + return sourmash.commands.plot(args) diff --git a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py index 3727960292..55ee063d0b 100644 --- a/src/sourmash/cli/prefetch.py +++ b/src/sourmash/cli/prefetch.py @@ -1,66 +1,77 @@ """search a signature against dbs, find all overlaps""" -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('prefetch') - subparser.add_argument('query', help='query signature') - subparser.add_argument("databases", + subparser = subparsers.add_parser("prefetch") + subparser.add_argument("query", help="query signature") + subparser.add_argument( + "databases", nargs="*", help="one or more databases to search", ) subparser.add_argument( 
"--db-from-file", default=None, - help="list of paths containing signatures to search" - ) - subparser.add_argument( - "--linear", action='store_true', - help="force linear traversal of indexes to minimize loading time and memory use" + help="list of paths containing signatures to search", ) subparser.add_argument( - '--no-linear', dest="linear", action='store_false', + "--linear", + action="store_true", + help="force linear traversal of indexes to minimize loading time and memory use", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--no-linear", + dest="linear", + action="store_false", ) subparser.add_argument( - '-d', '--debug', action='store_true' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='save all matching signatures from the databases to the ' - 'specified file or directory' + "--save-matches", + metavar="FILE", + help="save all matching signatures from the databases to the " + "specified file or directory", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='reporting threshold (in bp) for estimated overlap with remaining query hashes (default=50kb)' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="reporting threshold (in bp) for estimated overlap with remaining query hashes (default=50kb)", ) subparser.add_argument( - '--save-unmatched-hashes', metavar='FILE', - help='output unmatched query hashes as a signature to the ' - 'specified file' + "--save-unmatched-hashes", + metavar="FILE", + help="output unmatched query hashes as a signature to the " "specified file", ) subparser.add_argument( - '--save-matching-hashes', metavar='FILE', - help='output matching query hashes as a signature to the ' - 'specified file' + "--save-matching-hashes", + metavar="FILE", + help="output matching query hashes as a signature to the " "specified file", ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -71,4 +82,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.prefetch(args) diff --git a/src/sourmash/cli/sbt_combine.py b/src/sourmash/cli/sbt_combine.py index 1b5ce0febf..20c09fe57a 100644 --- a/src/sourmash/cli/sbt_combine.py +++ b/src/sourmash/cli/sbt_combine.py @@ -1,17 +1,14 @@ """combine multiple Sequence Bloom Trees""" + def subparser(subparsers): - subparser = subparsers.add_parser('sbt_combine') - subparser.add_argument('sbt_name', help='name to save SBT into') - subparser.add_argument( - 'sbts', nargs='+', - help='SBTs to combine to form a new SBT' - ) - subparser.add_argument( - '-x', '--bf-size', metavar='S', type=float, default=1e5 - ) + subparser = subparsers.add_parser("sbt_combine") + subparser.add_argument("sbt_name", help="name to save SBT 
into") + subparser.add_argument("sbts", nargs="+", help="SBTs to combine to form a new SBT") + subparser.add_argument("-x", "--bf-size", metavar="S", type=float, default=1e5) def main(args): import sourmash + return sourmash.commands.sbt_combine(args) diff --git a/src/sourmash/cli/scripts/__init__.py b/src/sourmash/cli/scripts/__init__.py index 7062ff6c71..9655f05c98 100644 --- a/src/sourmash/cli/scripts/__init__.py +++ b/src/sourmash/cli/scripts/__init__.py @@ -21,16 +21,20 @@ # by sourmash.plugins.add_cli_scripts. _extension_dict = {} + def __getattr__(name): if name in _extension_dict: return _extension_dict[name] raise AttributeError(name) + def subparser(subparsers): - subparser = subparsers.add_parser('scripts', - usage=argparse.SUPPRESS, - formatter_class=argparse.RawDescriptionHelpFormatter, - aliases=['ext']) + subparser = subparsers.add_parser( + "scripts", + usage=argparse.SUPPRESS, + formatter_class=argparse.RawDescriptionHelpFormatter, + aliases=["ext"], + ) # get individual help strings: descrs = list(sourmash.plugins.get_cli_scripts_descriptions()) @@ -39,10 +43,12 @@ def subparser(subparsers): else: description = "(No script plugins detected!)" - s = subparser.add_subparsers(title="available plugin/extension commands", - dest='subcmd', - metavar='subcmd', - help=argparse.SUPPRESS, - description=description) + s = subparser.add_subparsers( + title="available plugin/extension commands", + dest="subcmd", + metavar="subcmd", + help=argparse.SUPPRESS, + description=description, + ) _extension_dict.update(sourmash.plugins.add_cli_scripts(s)) diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 2c11873963..46bf46723b 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -1,6 +1,6 @@ """search a signature against other signatures""" -usage=""" +usage = """ The `search` subcommand searches a collection of signatures or SBTs for matches to the query signature. 
It can search for matches with @@ -41,77 +41,95 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('search', description=__doc__, usage=usage) + subparser = subparsers.add_parser("search", description=__doc__, usage=usage) + subparser.add_argument("query", help="query signature") subparser.add_argument( - 'query', help='query signature' + "databases", + nargs="+", + help="signatures/SBTs to search", ) subparser.add_argument( - 'databases', nargs='+', - help='signatures/SBTs to search', + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-t", + "--threshold", + metavar="T", + default=0.08, + type=float, + help="minimum threshold for reporting matches; default=0.08", ) subparser.add_argument( - '-t', '--threshold', metavar='T', default=0.08, type=float, - help='minimum threshold for reporting matches; default=0.08' + "--save-matches", + metavar="FILE", + help="output matching signatures to the specified file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='output matching signatures to the specified file' + "--best-only", + action="store_true", + help="report only the best match (with greater speed)", ) subparser.add_argument( - '--best-only', action='store_true', - help='report only the best match (with greater speed)' + "-n", + "--num-results", + default=3, + type=int, + metavar="N", + help="number of results to display to user; 0 to report all", ) subparser.add_argument( - '-n', '--num-results', default=3, type=int, metavar='N', - help='number of results to display to user; 0 to report all' + "--containment", + action="store_true", + help="score based on containment rather than similarity", ) subparser.add_argument( - '--containment', action='store_true', - help='score based on containment rather than similarity' + "--max-containment", + action="store_true", + help="score based on max containment rather than similarity", ) subparser.add_argument( - '--max-containment', action='store_true', - help='score based on max containment rather than similarity' + "--estimate-ani-ci", + action="store_true", + help="for containment searches, also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='for containment searches, also output confidence intervals for ANI estimates' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present; note: has no effect if " + "--containment or --max-containment is specified", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present; note: has no effect if ' - '--containment or --max-containment is specified' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - 
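Tying the `search` options together: `--containment` scores matches by containment instead of Jaccard similarity, `-t` sets the reporting threshold, and `-n 0` reports all matches rather than the default top 3. A sketch with placeholder filenames:

```python
import subprocess

# Containment search with an explicit threshold; write matches to CSV.
subprocess.run(
    [
        "sourmash", "search", "query.sig", "refs.zip",
        "--containment",
        "-t", "0.1",
        "-n", "0",
        "-o", "search-results.csv",
    ],
    check=True,
)
```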
'--md5', default=None, - help='select the signature with this md5 as query' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' - ) - subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) @@ -124,4 +142,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.search(args) diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py index f256a7473d..2ea27abf1d 100644 --- a/src/sourmash/cli/sig/__init__.py +++ b/src/sourmash/cli/sig/__init__.py @@ -33,19 +33,27 @@ def subparser(subparsers): - subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['signature']) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "sig", + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + aliases=["signature"], + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash sig {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash sig {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Manipulate signature files', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Manipulate signature files", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/sig/cat.py b/src/sourmash/cli/sig/cat.py index ed85932f5f..b84905f254 100644 --- a/src/sourmash/cli/sig/cat.py +++ b/src/sourmash/cli/sig/cat.py @@ -1,6 +1,6 @@ """concatenate signature files""" -usage=""" +usage = """ ### `sourmash signature cat` - concatenate multiple signatures together @@ -15,37 +15,43 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): # working on this - subparser = subparsers.add_parser('cat', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("cat", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging 
output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-u', '--unique', action='store_true', - help='keep only distinct signatures, removing duplicates (based on md5sum)' + "-u", + "--unique", + action="store_true", + help="keep only distinct signatures, removing duplicates (based on md5sum)", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -55,4 +61,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.cat(args) diff --git a/src/sourmash/cli/sig/check.py b/src/sourmash/cli/sig/check.py index b9dd353501..a4c940eecb 100644 --- a/src/sourmash/cli/sig/check.py +++ b/src/sourmash/cli/sig/check.py @@ -1,6 +1,6 @@ """check signature collections against a picklist""" -usage=""" +usage = """ sourmash sig check --picklist ... -o miss.csv -m manifest.csv @@ -15,51 +15,57 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('check', usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("check", usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '-o', '--output-missing', metavar='FILE', - help='output picklist with remaining unmatched entries to this file', + "-o", + "--output-missing", + metavar="FILE", + help="output picklist with remaining unmatched entries to this file", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-m', '--save-manifest-matching', - help='save a manifest of the matching entries to this file.' + "-m", + "--save-manifest-matching", + help="save a manifest of the matching entries to this file.", ) subparser.add_argument( - '--fail-if-missing', action='store_true', - help='exit with an error code (-1) if there are any missing picklist values.' 
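For `sig cat`, the `-u/--unique` flag above drops md5-duplicate sketches while concatenating. A minimal sketch with placeholder input names; whether the `-o` target can be a zip collection depends on the sourmash version, so a plain `.sig` output is assumed here:

```python
import subprocess

# Concatenate signature files, keeping only distinct sketches (by md5sum).
subprocess.run(
    ["sourmash", "sig", "cat", "a.sig", "b.sig", "-u", "-o", "all.sig"],
    check=True,
)
```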
+ "--fail-if-missing", + action="store_true", + help="exit with an error code (-1) if there are any missing picklist values.", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='csv', - choices=['csv', 'sql'], + default="csv", + choices=["csv", "sql"], ) add_ksize_arg(subparser) @@ -70,4 +76,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.check(args) diff --git a/src/sourmash/cli/sig/collect.py b/src/sourmash/cli/sig/collect.py index 397b0bf34e..1e5d8ded2f 100644 --- a/src/sourmash/cli/sig/collect.py +++ b/src/sourmash/cli/sig/collect.py @@ -1,6 +1,6 @@ """collect manifest information across many files""" -usage=""" +usage = """ sourmash sig collect <locations> -o all.sqlmf @@ -13,45 +13,49 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('collect', usage=usage) - subparser.add_argument('locations', nargs='*', - help='locations of input signatures') - subparser.add_argument('-o', '--output', help='manifest output file', - required=True) + subparser = subparsers.add_parser("collect", usage=usage) + subparser.add_argument("locations", nargs="*", help="locations of input signatures") + subparser.add_argument("-o", "--output", help="manifest output file", required=True) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='sql', - choices=['csv', 'sql'], + default="sql", + choices=["csv", "sql"], ) - subparser.add_argument('--merge-previous', action='store_true', - help='merge new manifests into existing') - subparser.add_argument('--abspath', - help="convert all locations to absolute paths", - action='store_true') + subparser.add_argument( + "--merge-previous", + action="store_true", + help="merge new manifests into existing", + ) + subparser.add_argument( + "--abspath", help="convert all locations to absolute paths", action="store_true" + ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -59,4 +63,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.collect(args) diff --git a/src/sourmash/cli/sig/describe.py 
b/src/sourmash/cli/sig/describe.py index c59ea1fede..a7984e89d3 100644 --- a/src/sourmash/cli/sig/describe.py +++ b/src/sourmash/cli/sig/describe.py @@ -1,6 +1,6 @@ """show details of signature""" -usage=""" +usage = """ ### `sourmash signature describe` - display detailed information about signatures @@ -22,32 +22,32 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('describe', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("describe", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '--csv', metavar='FILE', - help='output information to a CSV file' + "--csv", metavar="FILE", help="output information to a CSV file" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -57,4 +57,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.describe(args) diff --git a/src/sourmash/cli/sig/downsample.py b/src/sourmash/cli/sig/downsample.py index a06b7d2eb5..7a39221d29 100644 --- a/src/sourmash/cli/sig/downsample.py +++ b/src/sourmash/cli/sig/downsample.py @@ -1,6 +1,6 @@ """downsample one or more signatures""" -usage=""" +usage = """ ### `sourmash signature downsample` - decrease the size of a signature @@ -26,33 +26,36 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_num_arg) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_num_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('downsample', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs="*") + subparser = subparsers.add_parser("downsample", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '--scaled', type=int, default=0, - help='scaled value to downsample to' + "--scaled", type=int, default=0, help="scaled value to downsample to" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to 
this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -62,4 +65,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.downsample(args) diff --git a/src/sourmash/cli/sig/export.py b/src/sourmash/cli/sig/export.py index 0299dba5d1..b6a4142d39 100644 --- a/src/sourmash/cli/sig/export.py +++ b/src/sourmash/cli/sig/export.py @@ -1,6 +1,6 @@ """export a signature, e.g. to mash""" -usage=""" +usage = """ ### `sourmash signature export` - export signatures to mash. @@ -17,19 +17,19 @@ def subparser(subparsers): - subparser = subparsers.add_parser('export', description=__doc__, usage=usage) - subparser.add_argument('filename') + subparser = subparsers.add_parser("export", description=__doc__, usage=usage) + subparser.add_argument("filename") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' + "--md5", default=None, help="select the signature with this md5 as query" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -37,4 +37,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.export(args) diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index a482526290..d3c483bb5e 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -1,6 +1,6 @@ """extract one or more signatures""" -usage=""" +usage = """ ### `sourmash signature extract` - extract signatures from a collection @@ -37,37 +37,43 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('extract', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("extract", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', default=None, - help='select signatures whose md5 contains this substring' + "--md5", + default=None, + help="select signatures whose md5 contains this substring", ) subparser.add_argument( - '--name', default=None, - help='select signatures whose name contains this substring' + "--name", + default=None, + help="select signatures whose name contains this substring", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load 
all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -77,4 +83,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.extract(args) diff --git a/src/sourmash/cli/sig/fileinfo.py b/src/sourmash/cli/sig/fileinfo.py index 0b5e71df71..52a894fafb 100644 --- a/src/sourmash/cli/sig/fileinfo.py +++ b/src/sourmash/cli/sig/fileinfo.py @@ -1,6 +1,6 @@ """provide summary information on the given file""" -usage=""" +usage = """ sourmash sig fileinfo <path> @@ -14,33 +14,27 @@ """ - def subparser(subparsers): - subparser = subparsers.add_parser('fileinfo', aliases=['summarize'], - usage=usage) - subparser.add_argument('path') + subparser = subparsers.add_parser("fileinfo", aliases=["summarize"], usage=usage) + subparser.add_argument("path") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--rebuild-manifest', help='forcibly rebuild the manifest', - action='store_true' + "--rebuild-manifest", help="forcibly rebuild the manifest", action="store_true" ) subparser.add_argument( - '--json-out', help='output information in JSON format only', - action='store_true' + "--json-out", help="output information in JSON format only", action="store_true" ) def main(args): import sourmash + return sourmash.sig.__main__.fileinfo(args) diff --git a/src/sourmash/cli/sig/filter.py b/src/sourmash/cli/sig/filter.py index 4f5f020d83..3cfaa2c7a2 100644 --- a/src/sourmash/cli/sig/filter.py +++ b/src/sourmash/cli/sig/filter.py @@ -1,6 +1,6 @@ """filter k-mers on abundance""" -usage=""" +usage = """ ### `sourmash signature filter` - remove hashes based on abundance @@ -25,32 +25,43 @@ def subparser(subparsers): - subparser = subparsers.add_parser('filter', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='+') + subparser = subparsers.add_parser("filter", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-' + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', type=str, default=None, - help='select signatures whose md5 contains this substring' + "--md5", + type=str, + default=None, + help="select 
signatures whose name contains this substring", ) subparser.add_argument( - '-m', '--min-abundance', type=int, default=1, - help='keep hashes >= this minimum abundance' + "-m", + "--min-abundance", + type=int, + default=1, + help="keep hashes >= this minimum abundance", ) subparser.add_argument( - '-M', '--max-abundance', type=int, default=None, - help='keep hashes <= this maximum abundance' + "-M", + "--max-abundance", + type=int, + default=None, + help="keep hashes <= this maximum abundance", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -58,4 +69,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.filter(args) diff --git a/src/sourmash/cli/sig/flatten.py b/src/sourmash/cli/sig/flatten.py index ca87b171c1..fa75f3434c 100644 --- a/src/sourmash/cli/sig/flatten.py +++ b/src/sourmash/cli/sig/flatten.py @@ -1,6 +1,6 @@ """remove abundances""" -usage=""" +usage = """ ### `sourmash signature flatten` - remove abundance information from signatures @@ -18,37 +18,38 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('flatten', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("flatten", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', default=None, - help='select signatures whose md5 contains this substring' + "--md5", + default=None, + help="select signatures whose md5 contains this substring", ) subparser.add_argument( - '--name', default=None, - help='select signatures whose name contains this substring' + "--name", + default=None, + help="select signatures whose name contains this substring", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -57,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.flatten(args) diff --git a/src/sourmash/cli/sig/grep.py b/src/sourmash/cli/sig/grep.py index 03d93299da..bf1c5ccf4a 100644 --- a/src/sourmash/cli/sig/grep.py +++ b/src/sourmash/cli/sig/grep.py @@ -1,6 +1,6 @@ """extract one or more signatures by substr/regex match""" -usage=""" +usage = """ sourmash sig grep [... 
] This will search for the provided pattern in the files or databases, @@ -26,63 +26,67 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('grep', usage=usage) - subparser.add_argument('pattern', help='search pattern (string/regex)') - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("grep", usage=usage) + subparser.add_argument("pattern", help="search pattern (string/regex)") + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output matching signatures to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output matching signatures to this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures, independent of filename' + "-f", + "--force", + action="store_true", + help="try to load all files as signatures, independent of filename", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-v', '--invert-match', + "-v", + "--invert-match", help="select non-matching signatures", - action="store_true" + action="store_true", ) subparser.add_argument( - '-i', '--ignore-case', + "-i", + "--ignore-case", help="ignore case distinctions (search lower and upper case both)", - action="store_true" + action="store_true", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '--csv', - help='save CSV file containing signature data in manifest format' + "--csv", help="save CSV file containing signature data in manifest format" ) subparser.add_argument( - '--silent', '--no-signatures-output', + "--silent", + "--no-signatures-output", help="do not output signatures", - action='store_true', + action="store_true", ) subparser.add_argument( - '-c', '--count', + "-c", + "--count", help="only output a count of discovered signatures; implies --silent", - action='store_true' + action="store_true", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -91,4 +95,5 @@ def subparser(subparsers): def main(args): import sourmash.sig.grep + return sourmash.sig.grep.main(args) diff --git a/src/sourmash/cli/sig/inflate.py b/src/sourmash/cli/sig/inflate.py index c5a247727a..50b86e6dcf 100644 --- a/src/sourmash/cli/sig/inflate.py +++ b/src/sourmash/cli/sig/inflate.py @@ -1,24 +1,24 @@ """borrow abundances from one signature => one or more other signatures""" -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - 
subparser = subparsers.add_parser('inflate') - subparser.add_argument('signature_from') - subparser.add_argument('other_sigs', nargs='+') + subparser = subparsers.add_parser("inflate") + subparser.add_argument("signature_from") + subparser.add_argument("other_sigs", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -27,4 +27,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.inflate(args) diff --git a/src/sourmash/cli/sig/ingest.py b/src/sourmash/cli/sig/ingest.py index 9c7d9e0547..99e84f7a63 100644 --- a/src/sourmash/cli/sig/ingest.py +++ b/src/sourmash/cli/sig/ingest.py @@ -1,6 +1,6 @@ """ingest/import a mash or other signature""" -usage=""" +usage = """ sourmash sig ingest --csv [ ] -o @@ -16,21 +16,25 @@ def subparser(subparsers): # Dirty hack to simultaneously support new and previous interface # If desired, this function can be removed with a major version bump. - for cmd in ('ingest', 'import'): + for cmd in ("ingest", "import"): subparser = subparsers.add_parser(cmd, usage=usage) - subparser.add_argument('--csv', action='store_true', - help='import in Mash CSV format') - subparser.add_argument('filenames', nargs='+') subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--csv", action="store_true", help="import in Mash CSV format" ) + subparser.add_argument("filenames", nargs="+") subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-q", "--quiet", action="store_true", help="suppress non-error output" + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) def main(args): import sourmash + return sourmash.sig.__main__.ingest(args) diff --git a/src/sourmash/cli/sig/intersect.py b/src/sourmash/cli/sig/intersect.py index 4a5ea4db23..521e83f10f 100644 --- a/src/sourmash/cli/sig/intersect.py +++ b/src/sourmash/cli/sig/intersect.py @@ -1,6 +1,6 @@ """intersect two or more signatures""" -usage=""" +usage = """ ### `sourmash signature intersect` - intersect two (or more) signatures @@ -22,32 +22,34 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('intersect', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("intersect", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output 
signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-A', '--abundances-from', metavar='FILE', - help='intersect with & take abundances from this signature' + "-A", + "--abundances-from", + metavar="FILE", + help="intersect with & take abundances from this signature", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -56,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.intersect(args) diff --git a/src/sourmash/cli/sig/kmers.py b/src/sourmash/cli/sig/kmers.py index 08863f33c9..98d7ee9d8d 100644 --- a/src/sourmash/cli/sig/kmers.py +++ b/src/sourmash/cli/sig/kmers.py @@ -1,6 +1,6 @@ """show k-mers/sequences matching the signature hashes""" -usage=""" +usage = """ ### `sourmash signature kmers` - extract k-mers and/or sequences that match to signatures @@ -48,44 +48,52 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('kmers', description=__doc__, usage=usage) - subparser.add_argument('--signatures', nargs='*', default=[]) + subparser = subparsers.add_parser("kmers", description=__doc__, usage=usage) + subparser.add_argument("--signatures", nargs="*", default=[]) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) add_picklist_args(subparser) - subparser.add_argument('--sequences', nargs='+', required=True, - help="FASTA/FASTQ/bz2/gz files with sequences") + subparser.add_argument( + "--sequences", + nargs="+", + required=True, + help="FASTA/FASTQ/bz2/gz files with sequences", + ) - subparser.add_argument('--save-kmers', - help="save k-mers and hash values to a CSV file") - subparser.add_argument('--save-sequences', - help="save sequences with matching hashes to a FASTA file") - subparser.add_argument('--translate', action="store_true", - help="translate DNA k-mers into amino acids (for protein, dayhoff, and hp sketches)") subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid (NOTE: only checks DNA)' + "--save-kmers", help="save k-mers and hash values to a CSV file" + ) + subparser.add_argument( + "--save-sequences", help="save sequences with matching hashes to a FASTA file" + ) + subparser.add_argument( + "--translate", + action="store_true", + help="translate DNA k-mers into amino acids (for protein, 
dayhoff, and hp sketches)", + ) + subparser.add_argument( + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid (NOTE: only checks DNA)", ) def main(args): import sourmash + return sourmash.sig.__main__.kmers(args) diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index e066dbda67..72f00500c4 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -1,6 +1,6 @@ """create a manifest for a collection of signatures""" -usage=""" +usage = """ sourmash sig manifest -o manifest.csv @@ -17,36 +17,40 @@ def subparser(subparsers): - subparser = subparsers.add_parser('manifest', usage=usage) - subparser.add_argument('location') + subparser = subparsers.add_parser("manifest", usage=usage) + subparser.add_argument("location") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-o', '--output', '--csv', metavar='FILE', - help='output information to a CSV file', + "-o", + "--output", + "--csv", + metavar="FILE", + help="output information to a CSV file", required=True, ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--no-rebuild-manifest', help='use existing manifest if available', - action='store_true' + "--no-rebuild-manifest", + help="use existing manifest if available", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='csv', - choices=['csv', 'sql'], + default="csv", + choices=["csv", "sql"], ) + def main(args): import sourmash + return sourmash.sig.__main__.manifest(args) diff --git a/src/sourmash/cli/sig/merge.py b/src/sourmash/cli/sig/merge.py index 6de8b77d16..026749a5f0 100644 --- a/src/sourmash/cli/sig/merge.py +++ b/src/sourmash/cli/sig/merge.py @@ -1,6 +1,6 @@ """merge one or more signatures""" -usage=""" +usage = """ ### `sourmash signature merge` - merge two or more signatures into one @@ -24,36 +24,32 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('merge', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("merge", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '--flatten', action='store_true', - help='remove abundances from all signatures' + "--flatten", action="store_true", help="remove abundances from all signatures" ) + 
subparser.add_argument("--name", help="rename merged signature") subparser.add_argument( - '--name', - help='rename merged signature' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' - ) - subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -62,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.merge(args) diff --git a/src/sourmash/cli/sig/overlap.py b/src/sourmash/cli/sig/overlap.py index 373336253c..c268e62a85 100644 --- a/src/sourmash/cli/sig/overlap.py +++ b/src/sourmash/cli/sig/overlap.py @@ -1,6 +1,6 @@ """see detailed comparison of signatures""" -usage=""" +usage = """ ### `sourmash signature overlap` - detailed comparison of two signatures' overlap @@ -28,12 +28,11 @@ def subparser(subparsers): - subparser = subparsers.add_parser('overlap', description=__doc__, usage=usage) - subparser.add_argument('signature1') - subparser.add_argument('signature2') + subparser = subparsers.add_parser("overlap", description=__doc__, usage=usage) + subparser.add_argument("signature1") + subparser.add_argument("signature2") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -41,4 +40,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.overlap(args) diff --git a/src/sourmash/cli/sig/rename.py b/src/sourmash/cli/sig/rename.py index 2b360fa8d3..4ed25612fc 100644 --- a/src/sourmash/cli/sig/rename.py +++ b/src/sourmash/cli/sig/rename.py @@ -1,6 +1,6 @@ """rename signature""" -usage=""" +usage = """ ### `sourmash signature rename` - rename a signature @@ -17,34 +17,37 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('rename', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') - subparser.add_argument('name') + subparser = subparsers.add_parser("rename", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") + subparser.add_argument("name") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='print debugging output' + "-d", "--debug", action="store_true", help="print debugging output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output renamed signature to this file (default stdout)', - default='-' + "-o", + "--output", + metavar="FILE", + help="output renamed signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a 
list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -54,4 +57,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.rename(args) diff --git a/src/sourmash/cli/sig/split.py b/src/sourmash/cli/sig/split.py index e4587b3e0f..bf98fc71fe 100644 --- a/src/sourmash/cli/sig/split.py +++ b/src/sourmash/cli/sig/split.py @@ -1,6 +1,6 @@ """split signature files""" -usage=""" +usage = """ ### `sourmash signature split` - split signatures into individual files @@ -36,32 +36,33 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('split', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("split", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '--output-dir', '--outdir', - help='output signatures to this directory', + "--output-dir", + "--outdir", + help="output signatures to this directory", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-E', '--extension', type=str, default='.sig', - help="write files with this extension ('.sig' by default)" + "-E", + "--extension", + type=str, + default=".sig", + help="write files with this extension ('.sig' by default)", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -70,4 +71,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.split(args) diff --git a/src/sourmash/cli/sig/subtract.py b/src/sourmash/cli/sig/subtract.py index 118d91fe41..69a349ace3 100644 --- a/src/sourmash/cli/sig/subtract.py +++ b/src/sourmash/cli/sig/subtract.py @@ -1,6 +1,6 @@ """subtract one or more signatures""" -usage=""" +usage = """ ### `sourmash signature subtract` - subtract other signatures from a signature @@ -22,28 +22,33 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg def subparser(subparsers): - subparser = subparsers.add_parser('subtract', description=__doc__, usage=usage) - subparser.add_argument('signature_from') - subparser.add_argument('subtraction_sigs', nargs='+') + subparser = subparsers.add_parser("subtract", description=__doc__, usage=usage) + subparser.add_argument("signature_from") + subparser.add_argument("subtraction_sigs", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output 
signature to this file (default stdout)", ) subparser.add_argument( - '--flatten', action='store_true', - help='remove abundance from signatures before subtracting' + "--flatten", + action="store_true", + help="remove abundance from signatures before subtracting", ) subparser.add_argument( - '-A', '--abundances-from', metavar='FILE', - help='intersect with & take abundances from this signature' + "-A", + "--abundances-from", + metavar="FILE", + help="intersect with & take abundances from this signature", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -51,4 +56,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.subtract(args) diff --git a/src/sourmash/cli/sketch/__init__.py b/src/sourmash/cli/sketch/__init__.py index 22abf26ed1..999ce1d3b9 100644 --- a/src/sourmash/cli/sketch/__init__.py +++ b/src/sourmash/cli/sketch/__init__.py @@ -18,19 +18,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('sketch', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "sketch", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash sketch {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash sketch {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Create signatures', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Create signatures", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/sketch/dna.py b/src/sourmash/cli/sketch/dna.py index 1d82f9df65..19f6de7509 100644 --- a/src/sourmash/cli/sketch/dna.py +++ b/src/sourmash/cli/sketch/dna.py @@ -1,6 +1,6 @@ """create DNA signatures""" -usage=""" +usage = """ sourmash sketch dna data/*.fna.gz @@ -25,66 +25,79 @@ from sourmash.logging import notify, print_results, error from sourmash import command_sketch -assert command_sketch.DEFAULTS['dna'] == 'k=31,scaled=1000,noabund' + +assert command_sketch.DEFAULTS["dna"] == "k=31,scaled=1000,noabund" def subparser(subparsers): - subparser = subparsers.add_parser('dna', - aliases=['rna', 'nucleotide', 'nt'], - usage=usage) - subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + subparser = subparsers.add_parser( + "dna", aliases=["rna", "nucleotide", "nt"], usage=usage ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid DNA' + "--license", + default="CC0", + type=str, + help="signature license. 
Currently only CC0 is supported.", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid DNA", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - file_args = subparser.add_argument_group('File handling options') + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.dna(args) diff --git a/src/sourmash/cli/sketch/fromfile.py b/src/sourmash/cli/sketch/fromfile.py index 08a3e44661..6bd57d26ad 100644 --- a/src/sourmash/cli/sketch/fromfile.py +++ b/src/sourmash/cli/sketch/fromfile.py @@ -1,6 +1,6 @@ """create signatures from a CSV file""" -usage=""" +usage = """ sourmash sketch fromfile --output-signatures -p <...> @@ -28,55 +28,66 @@ def subparser(subparsers): - subparser = subparsers.add_parser('fromfile', - usage=usage) + subparser = subparsers.add_parser("fromfile", usage=usage) subparser.add_argument( - 'csvs', nargs='+', - help="input CSVs providing 'name', 'genome_filename', and 'protein_filename'" + "csvs", + nargs="+", + help="input CSVs providing 'name', 'genome_filename', and 'protein_filename'", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) 
subparser.add_argument( - '--already-done', nargs='+', default=[], - help='one or more collections of existing signatures to avoid recalculating' + "--already-done", + nargs="+", + default=[], + help="one or more collections of existing signatures to avoid recalculating", ) subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid (NOTE: only checks DNA)' + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid (NOTE: only checks DNA)", ) - file_args = subparser.add_argument_group('File handling options') + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-o', '--output-signatures', - help='output computed signatures to this file', + "-o", + "--output-signatures", + help="output computed signatures to this file", ) file_args.add_argument( - '--force-output-already-exists', action='store_true', - help='overwrite/append to --output-signatures location' + "--force-output-already-exists", + action="store_true", + help="overwrite/append to --output-signatures location", ) file_args.add_argument( - '--ignore-missing', action='store_true', - help='proceed with building possible signatures, even if some input files are missing' + "--ignore-missing", + action="store_true", + help="proceed with building possible signatures, even if some input files are missing", ) file_args.add_argument( - '--output-csv-info', - help='output information about what signatures need to be generated' + "--output-csv-info", + help="output information about what signatures need to be generated", ) file_args.add_argument( - '--output-manifest-matching', - help='output a manifest file of already-existing signatures' + "--output-manifest-matching", + help="output a manifest file of already-existing signatures", ) file_args.add_argument( - '--report-duplicated', action='store_true', - help='report duplicated names' + "--report-duplicated", action="store_true", help="report duplicated names" ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.fromfile(args) diff --git a/src/sourmash/cli/sketch/protein.py b/src/sourmash/cli/sketch/protein.py index 24324ea905..3092d35367 100644 --- a/src/sourmash/cli/sketch/protein.py +++ b/src/sourmash/cli/sketch/protein.py @@ -1,6 +1,6 @@ """create protein signatures""" -usage=""" +usage = """ sourmash sketch protein data/*.fna.gz @@ -26,69 +26,82 @@ from sourmash.logging import notify, print_results, error from sourmash import command_sketch -assert command_sketch.DEFAULTS['protein'] == 'k=10,scaled=200,noabund' + +assert command_sketch.DEFAULTS["protein"] == "k=10,scaled=200,noabund" def subparser(subparsers): - subparser = subparsers.add_parser('protein', aliases=['aa', 'prot'], - usage=usage) - subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' - ) + subparser = subparsers.add_parser("protein", aliases=["aa", "prot"], usage=usage) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--license", + default="CC0", + type=str, + help="signature license. 
Currently only CC0 is supported.", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - file_args = subparser.add_argument_group('File handling options') + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) file_args.add_argument( - '--dayhoff', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--dayhoff", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) file_args.add_argument( - '--hp', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--hp", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.protein(args) diff --git a/src/sourmash/cli/sketch/translate.py b/src/sourmash/cli/sketch/translate.py index df48d4818a..f5bccab46f 100644 --- a/src/sourmash/cli/sketch/translate.py +++ b/src/sourmash/cli/sketch/translate.py @@ -1,6 +1,6 @@ """create protein signature from DNA/RNA sequence""" -usage=""" +usage = """ sourmash sketch translate data/*.fna.gz @@ -24,75 +24,90 @@ """ from sourmash import command_sketch -assert command_sketch.DEFAULTS['protein'] == 'k=10,scaled=200,noabund' + +assert command_sketch.DEFAULTS["protein"] == "k=10,scaled=200,noabund" import sourmash from sourmash.logging import notify, print_results, error def subparser(subparsers): - subparser = subparsers.add_parser('translate', usage=usage) - subparser.add_argument( - 
'--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' - ) + subparser = subparsers.add_parser("translate", usage=usage) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid DNA' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid DNA", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - file_args = subparser.add_argument_group('File handling options') + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) file_args.add_argument( - '--dayhoff', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--dayhoff", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) file_args.add_argument( - '--hp', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--hp", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.translate(args) diff --git a/src/sourmash/cli/storage/__init__.py b/src/sourmash/cli/storage/__init__.py index 8ad0b2ada1..42f1a292b2 100644 --- a/src/sourmash/cli/storage/__init__.py +++ 
b/src/sourmash/cli/storage/__init__.py @@ -12,19 +12,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('storage', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "storage", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash storage {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash storage {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Storage utilities', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Storage utilities", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/storage/convert.py b/src/sourmash/cli/storage/convert.py index 0aa5c23fa5..7efbc2e7ff 100644 --- a/src/sourmash/cli/storage/convert.py +++ b/src/sourmash/cli/storage/convert.py @@ -1,16 +1,13 @@ "'sourmash storage convert' - convert an SBT to use a different back end." + def subparser(subparsers): - subparser = subparsers.add_parser('convert') - subparser.add_argument( - 'sbt', help='name to save SBT into' - ) - subparser.add_argument( - '-b', '--backend', type=str, - help='Backend to convert to' - ) + subparser = subparsers.add_parser("convert") + subparser.add_argument("sbt", help="name to save SBT into") + subparser.add_argument("-b", "--backend", type=str, help="Backend to convert to") def main(args): import sourmash + return sourmash.sbt.convert_cmd(args.sbt, args.backend) diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index b8bf95f8d8..0b58299f56 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -18,19 +18,27 @@ def subparser(subparsers): - subparser = subparsers.add_parser('tax', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['taxonomy']) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "tax", + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + aliases=["taxonomy"], + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash tax {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash tax {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title="Integrate taxonomy information based on 'gather' results", dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Integrate taxonomy information based on 'gather' results", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 501a02fd58..7541440fc2 100644 --- a/src/sourmash/cli/tax/annotate.py +++ 
b/src/sourmash/cli/tax/annotate.py @@ -1,6 +1,6 @@ """annotate gather results with taxonomy information""" -usage=""" +usage = """ sourmash tax annotate --gather-csv [ ... ] --taxonomy-csv [ ... ] @@ -19,53 +19,70 @@ def subparser(subparsers): - subparser = subparsers.add_parser('annotate', - aliases=['annotate'], - usage=usage) + subparser = subparsers.add_parser("annotate", aliases=["annotate"], usage=usage) subparser.add_argument( - '-g', '--gather-csv', nargs='*', default = [], action='extend', - help='CSV output files from sourmash gather' + "-g", + "--gather-csv", + nargs="*", + default=[], + action="extend", + help="CSV output files from sourmash gather", ) subparser.add_argument( - '--from-file', metavar='FILE', default=None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='*', required=True, action="extend", - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="*", + required=True, + action="extend", + help="database lineages CSV", ) subparser.add_argument( - '-o', '--output-dir', default= "", - help='directory for output files' + "-o", "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.", ) + def main(args): - import sourmash if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." 
+ ) return sourmash.tax.__main__.annotate(args) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 3f3ee41578..b9712658a4 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -1,6 +1,6 @@ """classify genomes from gather results""" -usage=""" +usage = """ sourmash tax genome --gather-csv [ ... ] --taxonomy-csv [ ... ] @@ -34,81 +34,114 @@ import argparse import sourmash from sourmash.logging import notify, print_results, error -from sourmash.cli.utils import add_tax_threshold_arg, check_rank, check_tax_outputs, add_rank_arg +from sourmash.cli.utils import ( + add_tax_threshold_arg, + check_rank, + check_tax_outputs, + add_rank_arg, +) + def subparser(subparsers): - subparser = subparsers.add_parser('genome', - aliases=['classify'], - usage=usage) + subparser = subparsers.add_parser("genome", aliases=["classify"], usage=usage) subparser.add_argument( - '-g', '--gather-csv', action='extend', nargs='*', default = [], - help='CSVs output by sourmash gather for this sample' + "-g", + "--gather-csv", + action="extend", + nargs="*", + default=[], + help="CSVs output by sourmash gather for this sample", ) subparser.add_argument( - '--from-file', metavar='FILE', default=None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='*', required=True, action='extend', - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="*", + required=True, + action="extend", + help="database lineages CSV", ) subparser.add_argument( - '-o', '--output-base', default='-', - help='base filepath for output file(s) (default stdout)' + "-o", + "--output-base", + default="-", + help="base filepath for output file(s) (default stdout)", ) subparser.add_argument( - '--output-dir', default= "", - help='directory for output files' + "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-F', '--output-format', default=[], nargs='*', action='extend', + "-F", + "--output-format", + default=[], + nargs="*", + action="extend", choices=["csv_summary", "krona", "human", "lineage_csv"], - help='choose output format(s)', + help="choose output format(s)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past survivable errors in loading taxonomy database or gather results', + "-f", + 
"--force", + action="store_true", + help="continue past survivable errors in loading taxonomy database or gather results", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information.", ) subparser.add_argument( - '--lingroup', '--lingroups', metavar='FILE', default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups." + "--lingroup", + "--lingroups", + metavar="FILE", + default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups.", ) add_tax_threshold_arg(subparser, 0.1) add_rank_arg(subparser) def main(args): - import sourmash try: if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." + ) if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona']) + args.output_format = check_tax_outputs(args, rank_required=["krona"]) except ValueError as exc: error(f"ERROR: {str(exc)}") - import sys; sys.exit(-1) + import sys + + sys.exit(-1) return sourmash.tax.__main__.genome(args) diff --git a/src/sourmash/cli/tax/grep.py b/src/sourmash/cli/tax/grep.py index 9aa5db3b89..13c25783fa 100644 --- a/src/sourmash/cli/tax/grep.py +++ b/src/sourmash/cli/tax/grep.py @@ -1,6 +1,6 @@ """search taxonomies and output picklists.""" -usage=""" +usage = """ sourmash tax grep --taxonomy-csv [ ... 
] @@ -21,55 +21,69 @@ def subparser(subparsers): - subparser = subparsers.add_parser('grep', usage=usage) - subparser.add_argument('pattern') - subparser.add_argument('-r', '--rank', - help="search only this rank", - choices=['superkingdom', - 'phylum', - 'class', - 'order', - 'family', - 'genus', - 'species']) + subparser = subparsers.add_parser("grep", usage=usage) + subparser.add_argument("pattern") subparser.add_argument( - '-v', '--invert-match', - help="select non-matching lineages", - action="store_true" + "-r", + "--rank", + help="search only this rank", + choices=[ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], ) subparser.add_argument( - '-i', '--ignore-case', + "-v", "--invert-match", help="select non-matching lineages", action="store_true" + ) + subparser.add_argument( + "-i", + "--ignore-case", help="ignore case distinctions (search lower and upper case both)", - action="store_true" + action="store_true", ) subparser.add_argument( - '--silent', '--no-picklist-output', + "--silent", + "--no-picklist-output", help="do not output picklist", - action='store_true', + action="store_true", ) subparser.add_argument( - '-c', '--count', + "-c", + "--count", help="only output a count of discovered lineages; implies --silent", - action='store_true' + action="store_true", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, action="extend", - help='database lineages' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="+", + required=True, + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output', default='-', - help='output file (defaults to stdout)', + "-o", + "--output", + default="-", + help="output file (defaults to stdout)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) + def main(args): - import sourmash return sourmash.tax.__main__.grep(args) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 1e3cd50313..563c6c3d81 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -1,6 +1,6 @@ """summarize metagenome gather results""" -usage=""" +usage = """ sourmash tax metagenome --gather-csv [ ... ] --taxonomy-csv [ ... 
] @@ -26,77 +26,118 @@ from sourmash.cli.utils import add_rank_arg, check_rank, check_tax_outputs - def subparser(subparsers): - subparser = subparsers.add_parser('metagenome', - usage=usage) + subparser = subparsers.add_parser("metagenome", usage=usage) subparser.add_argument( - '-g', '--gather-csv', action="extend", nargs='*', default = [], - help='CSVs from sourmash gather' + "-g", + "--gather-csv", + action="extend", + nargs="*", + default=[], + help="CSVs from sourmash gather", ) subparser.add_argument( - '--from-file', metavar='FILE', default = None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output-base', default='-', - help='base filepath for output file(s) (default stdout)' + "-o", + "--output-base", + default="-", + help="base filepath for output file(s) (default stdout)", ) subparser.add_argument( - '--output-dir', default= "", - help='directory for output files' + "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - action="extend", nargs='+', required=True, - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + action="extend", + nargs="+", + required=True, + help="database lineages CSV", ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup", "bioboxes"], - help='choose output format(s)', + "-F", + "--output-format", + default=[], + nargs="*", + action="extend", + choices=[ + "human", + "csv_summary", + "krona", + "lineage_summary", + "kreport", + "lingroup", + "bioboxes", + ], + help="choose output format(s)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in taxonomy database loading', + "-f", + "--force", + action="store_true", + help="continue past errors in taxonomy database loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. 
Note that the taxonomy CSV must contain 'lin' lineage information.", ) subparser.add_argument( - '--lingroup', '--lingroups', metavar='FILE', default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group." + "--lingroup", + "--lingroups", + metavar="FILE", + default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.", ) add_rank_arg(subparser) + def main(args): - import sourmash try: if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." + ) if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'], use_lingroup_format=True) + args.output_format = check_tax_outputs( + args, + rank_required=["krona", "lineage_summary"], + incompatible_with_lins=["bioboxes", "kreport"], + use_lingroup_format=True, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") - import sys; sys.exit(-1) + import sys + + sys.exit(-1) return sourmash.tax.__main__.metagenome(args) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index de2e58521b..88e4a9f504 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -1,6 +1,6 @@ """combine multiple taxonomy databases into one.""" -usage=""" +usage = """ sourmash tax prepare --taxonomy-csv [ ... ] -o @@ -17,44 +17,55 @@ def subparser(subparsers): - subparser = subparsers.add_parser('prepare', - usage=usage) + subparser = subparsers.add_parser("prepare", usage=usage) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, action="extend", - help='database lineages' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="+", + required=True, + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output', required=True, - help='output file', + "-o", + "--output", + required=True, + help="output file", ) subparser.add_argument( - '-F', '--database-format', + "-F", + "--database-format", help="format of output file; default is 'sql')", - default='sql', - choices=['csv', 'sql'], + default="sql", + choices=["csv", "sql"], ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-f', '--force', action = 
'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) + def main(args): - import sourmash return sourmash.tax.__main__.prepare(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 06a109e95c..d430677b8f 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -1,6 +1,6 @@ """summarize taxonomy/lineage information""" -usage=""" +usage = """ sourmash tax summarize [ ... ] @@ -18,39 +18,46 @@ def subparser(subparsers): - subparser = subparsers.add_parser('summarize', - usage=usage) + subparser = subparsers.add_parser("summarize", usage=usage) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - 'taxonomy_files', - metavar='FILE', - nargs="+", action="extend", - help='database lineages' + "taxonomy_files", + metavar="FILE", + nargs="+", + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output-lineage-information', - help='output a CSV file containing individual lineage counts', + "-o", + "--output-lineage-information", + help="output a CSV file containing individual lineage counts", ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks.' 
+ "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks.", ) + def main(args): - import sourmash return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index e0d8975b09..26da5ead5f 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -7,38 +7,66 @@ def add_moltype_args(parser): parser.add_argument( - '--protein', dest='protein', action='store_true', - help='choose a protein signature; by default, a nucleotide signature is used' + "--protein", + dest="protein", + action="store_true", + help="choose a protein signature; by default, a nucleotide signature is used", ) parser.add_argument( - '--no-protein', dest='protein', action='store_false', - help='do not choose a protein signature') + "--no-protein", + dest="protein", + action="store_false", + help="do not choose a protein signature", + ) parser.set_defaults(protein=False) parser.add_argument( - '--dayhoff', dest='dayhoff', action='store_true', - help='choose Dayhoff-encoded amino acid signatures' + "--dayhoff", + dest="dayhoff", + action="store_true", + help="choose Dayhoff-encoded amino acid signatures", ) parser.add_argument( - '--no-dayhoff', dest='dayhoff', action='store_false', - help='do not choose Dayhoff-encoded amino acid signatures') + "--no-dayhoff", + dest="dayhoff", + action="store_false", + help="do not choose Dayhoff-encoded amino acid signatures", + ) parser.set_defaults(dayhoff=False) parser.add_argument( - '--hp', '--hydrophobic-polar', dest='hp', action='store_true', - help='choose hydrophobic-polar-encoded amino acid signatures' + "--hp", + "--hydrophobic-polar", + dest="hp", + action="store_true", + help="choose hydrophobic-polar-encoded amino acid signatures", ) parser.add_argument( - '--no-hp', '--no-hydrophobic-polar', dest='hp', action='store_false', - help='do not choose hydrophobic-polar-encoded amino acid signatures') + "--no-hp", + "--no-hydrophobic-polar", + dest="hp", + action="store_false", + help="do not choose hydrophobic-polar-encoded amino acid signatures", + ) parser.set_defaults(hp=False) parser.add_argument( - '--dna', '--rna', '--nucleotide', dest='dna', default=None, action='store_true', - help='choose a nucleotide signature (default: True)') + "--dna", + "--rna", + "--nucleotide", + dest="dna", + default=None, + action="store_true", + help="choose a nucleotide signature (default: True)", + ) parser.add_argument( - '--no-dna', '--no-rna', '--no-nucleotide', dest='dna', action='store_false', - help='do not choose a nucleotide signature') + "--no-dna", + "--no-rna", + "--no-nucleotide", + dest="dna", + action="store_false", + help="do not choose a nucleotide signature", + ) parser.set_defaults(dna=None) @@ -52,16 +80,21 @@ def add_ksize_arg(parser, *, default=None): if default: message = f"k-mer size to select; default={default}" else: - message = f"k-mer size to select; no default." + message = "k-mer size to select; no default." 
parser.add_argument( - '-k', '--ksize', metavar='K', default=default, type=int, + "-k", + "--ksize", + metavar="K", + default=default, + type=int, help=message, ) -#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 + +# https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 def range_limited_float_type(arg): - """ Type function for argparse - a float within some predefined bounds """ + """Type function for argparse - a float within some predefined bounds""" min_val = 0 max_val = 1 try: @@ -69,119 +102,168 @@ def range_limited_float_type(arg): except ValueError: raise argparse.ArgumentTypeError("\n\tERROR: Must be a floating point number.") if f < min_val or f > max_val: - raise argparse.ArgumentTypeError(f"\n\tERROR: Argument must be >{str(min_val)} and <{str(max_val)}.") + raise argparse.ArgumentTypeError( + f"\n\tERROR: Argument must be >{str(min_val)} and <{str(max_val)}." + ) return f def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None): parser.add_argument( - '--containment-threshold', default=containment_default, type=range_limited_float_type, - help=f'minimum containment threshold for classification; default={containment_default}', + "--containment-threshold", + default=containment_default, + type=range_limited_float_type, + help=f"minimum containment threshold for classification; default={containment_default}", ) parser.add_argument( - '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type, - help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}', + "--ani-threshold", + "--aai-threshold", + default=ani_default, + type=range_limited_float_type, + help=f"minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}", ) def add_picklist_args(parser): parser.add_argument( - '--picklist', default=None, - help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" + "--picklist", + default=None, + help="select signatures based on a picklist, i.e. 
'file.csv:colname:coltype'", ) parser.add_argument( - '--picklist-require-all', default=False, action='store_true', - help="require that all picklist values be found or else fail" + "--picklist-require-all", + default=False, + action="store_true", + help="require that all picklist values be found or else fail", ) def add_pattern_args(parser): parser.add_argument( - '--include-db-pattern', + "--include-db-pattern", default=None, - help='search only signatures that match this pattern in name, filename, or md5' + help="search only signatures that match this pattern in name, filename, or md5", ) parser.add_argument( - '--exclude-db-pattern', + "--exclude-db-pattern", default=None, - help='search only signatures that do not match this pattern in name, filename, or md5' + help="search only signatures that do not match this pattern in name, filename, or md5", ) def opfilter(path): - return not path.startswith('__') and path not in ['utils'] + return not path.startswith("__") and path not in ["utils"] def command_list(dirpath): - paths = glob(os.path.join(dirpath, '*.py')) + paths = glob(os.path.join(dirpath, "*.py")) filenames = [os.path.basename(path) for path in paths] - basenames = [os.path.splitext(path)[0] for path in filenames if not path.startswith('__')] + basenames = [ + os.path.splitext(path)[0] for path in filenames if not path.startswith("__") + ] basenames = filter(opfilter, basenames) return sorted(basenames) def add_scaled_arg(parser, default=None): parser.add_argument( - '--scaled', metavar='FLOAT', type=check_scaled_bounds, - help='downsample to this scaled; value should be between 100 and 1e6' + "--scaled", + metavar="FLOAT", + type=check_scaled_bounds, + help="downsample to this scaled; value should be between 100 and 1e6", ) def add_num_arg(parser, default=0): parser.add_argument( - '-n', '--num-hashes', '--num', metavar='N', type=check_num_bounds, default=default, - help='num value should be between 50 and 50000' + "-n", + "--num-hashes", + "--num", + metavar="N", + type=check_num_bounds, + default=default, + help="num value should be between 50 and 50000", ) def check_rank(args): - """ Check '--rank'/'--position'/'--lin-position' argument matches selected taxonomy.""" - standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + """Check '--rank'/'--position'/'--lin-position' argument matches selected taxonomy.""" + standard_ranks = [ + "strain", + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] if args.lins: - if args.rank.isdigit(): + if args.rank.isdigit(): return str(args.rank) - raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position.") + raise argparse.ArgumentTypeError( + f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." + ) elif args.rank in standard_ranks: return args.rank else: - raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'") + raise argparse.ArgumentTypeError( + f"Invalid '--rank'/'--position' input: '{args.rank}'. 
Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'"
+        )


 def add_rank_arg(parser):
     parser.add_argument(
-        '-r', '--rank',
-        '--position', '--lin-position',
+        "-r",
+        "--rank",
+        "--position",
+        "--lin-position",
         help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \
     Note that the taxonomy CSV must contain lineage information at this rank, and that LIN positions start at 0. \
-        Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position"
+        Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position",
     )


-def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None, use_lingroup_format=False):
+def check_tax_outputs(
+    args,
+    rank_required=["krona"],
+    incompatible_with_lins=None,
+    use_lingroup_format=False,
+):
     "Handle ouput format combinations"
     # check that rank is passed for formats requiring rank.
     if not args.rank:
         if any(x in rank_required for x in args.output_format):
-            raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.")
+            raise ValueError(
+                f"Rank (--rank) is required for {', '.join(rank_required)} output formats."
+            )

     if args.lins:
         # check for outputs incompatible with lins
         if incompatible_with_lins:
             if any(x in args.output_format for x in incompatible_with_lins):
-                raise ValueError(f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}")
+                raise ValueError(
+                    f"The following outputs are incompatible with '--lins': {', '.join(incompatible_with_lins)}"
+                )
         # check that lingroup file exists if needed
         if args.lingroup:
             if use_lingroup_format and "lingroup" not in args.output_format:
                 args.output_format.append("lingroup")
         elif "lingroup" in args.output_format:
-            raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.")
+            raise ValueError(
+                "Must provide lingroup csv via '--lingroup' in order to output a lingroup report."
+            )
     elif args.lingroup or "lingroup" in args.output_format:
-        raise ValueError(f"Must enable LIN taxonomy via '--lins' in order to use lingroups.")
+        raise ValueError(
+            "Must enable LIN taxonomy via '--lins' in order to use lingroups."
+ ) # check that only one output format is specified if writing to stdout if len(args.output_format) > 1: if args.output_base == "-": - raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + raise ValueError( + f"Writing to stdout is incompatible with multiple output formats {args.output_format}" + ) elif not args.output_format: # change to "human" for 5.0 args.output_format = ["csv_summary"] diff --git a/src/sourmash/cli/watch.py b/src/sourmash/cli/watch.py index 7828d376e2..a82c06d1a3 100644 --- a/src/sourmash/cli/watch.py +++ b/src/sourmash/cli/watch.py @@ -4,33 +4,36 @@ def subparser(subparsers): - subparser = subparsers.add_parser('watch') - subparser.add_argument('sbt_name', help='name of SBT to search') - subparser.add_argument('inp_file', nargs='?', default='/dev/stdin') + subparser = subparsers.add_parser("watch") + subparser.add_argument("sbt_name", help="name of SBT to search") + subparser.add_argument("inp_file", nargs="?", default="/dev/stdin") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', - help='save signature generated from data here' + "-o", "--output", help="save signature generated from data here" ) subparser.add_argument( - '--threshold', metavar='T', default=0.05, type=float, - help='minimum threshold for matches (default=0.05)' + "--threshold", + metavar="T", + default=0.05, + type=float, + help="minimum threshold for matches (default=0.05)", ) subparser.add_argument( - '--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed' + "--input-is-protein", + action="store_true", + help="Consume protein sequences - no translation needed", ) add_moltype_args(subparser) subparser.add_argument( - '--name', type=str, default='stdin', - help='name to use for generated signature' + "--name", type=str, default="stdin", help="name to use for generated signature" ) add_ksize_arg(subparser) add_num_arg(subparser, 500) + def main(args): import sourmash + return sourmash.commands.watch(args) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index 2dca0ae936..46c4f455f6 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -13,7 +13,7 @@ from .utils import RustObject from ._lowlevel import ffi, lib -DEFAULT_COMPUTE_K = '21,31,51' +DEFAULT_COMPUTE_K = "21,31,51" DEFAULT_MMHASH_SEED = 42 DEFAULT_LINE_COUNT = 1500 @@ -33,82 +33,82 @@ def compute(args): """ set_quiet(args.quiet) - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing') + notify("WARNING: input is protein, turning off nucleotide hashing") args.dna = False args.protein = True if args.scaled: if args.scaled < 1: - error('ERROR: --scaled value must be >= 1') + error("ERROR: --scaled value must be >= 1") sys.exit(-1) if args.scaled != round(args.scaled, 0): - error('ERROR: --scaled value must be integer value') + error("ERROR: --scaled value must be integer value") sys.exit(-1) if args.scaled >= 1e9: - notify('WARNING: scaled value is nonsensical!? Continuing anyway.') + notify("WARNING: scaled value is nonsensical!? 
Continuing anyway.") if args.num_hashes != 0: - notify('setting num_hashes to 0 because --scaled is set') + notify("setting num_hashes to 0 because --scaled is set") args.num_hashes = 0 - notify('computing signatures for files: {}', ", ".join(args.filenames)) + notify("computing signatures for files: {}", ", ".join(args.filenames)) if args.randomize: - notify('randomizing file list because of --randomize') + notify("randomizing file list because of --randomize") random.shuffle(args.filenames) # get list of k-mer sizes for which to compute sketches ksizes = args.ksizes - notify('Computing signature for ksizes: {}', str(ksizes)) + notify("Computing signature for ksizes: {}", str(ksizes)) num_sigs = 0 if args.dna and args.protein: - notify('Computing both nucleotide and protein signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and protein signatures.") + num_sigs = 2 * len(ksizes) elif args.dna and args.dayhoff: - notify('Computing both nucleotide and Dayhoff-encoded protein ' - 'signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and Dayhoff-encoded protein " "signatures.") + num_sigs = 2 * len(ksizes) elif args.dna and args.hp: - notify('Computing both nucleotide and hp-encoded protein ' - 'signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and hp-encoded protein " "signatures.") + num_sigs = 2 * len(ksizes) elif args.dna: - notify('Computing only nucleotide (and not protein) signatures.') + notify("Computing only nucleotide (and not protein) signatures.") num_sigs = len(ksizes) elif args.protein: - notify('Computing only protein (and not nucleotide) signatures.') + notify("Computing only protein (and not nucleotide) signatures.") num_sigs = len(ksizes) elif args.dayhoff: - notify('Computing only Dayhoff-encoded protein (and not nucleotide) ' - 'signatures.') + notify( + "Computing only Dayhoff-encoded protein (and not nucleotide) " "signatures." + ) num_sigs = len(ksizes) elif args.hp: - notify('Computing only hp-encoded protein (and not nucleotide) ' - 'signatures.') + notify("Computing only hp-encoded protein (and not nucleotide) " "signatures.") num_sigs = len(ksizes) - if (args.protein or args.dayhoff or args.hp): + if args.protein or args.dayhoff or args.hp: notify("") - notify("WARNING: you are using 'compute' to make a protein/dayhoff/hp signature,") + notify( + "WARNING: you are using 'compute' to make a protein/dayhoff/hp signature," + ) notify("WARNING: but the meaning of ksize has changed in 4.0. Please see the") notify("WARNING: migration guide to sourmash v4.0 at http://sourmash.rtfd.io/") notify("") - bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] + bad_ksizes = [str(k) for k in ksizes if k % 3 != 0] if bad_ksizes: - error('protein ksizes must be divisible by 3, sorry!') - error('bad ksizes: {}', ", ".join(bad_ksizes)) + error("protein ksizes must be divisible by 3, sorry!") + error("bad ksizes: {}", ", ".join(bad_ksizes)) sys.exit(-1) - notify('Computing a total of {} signature(s) for each input.', num_sigs) + notify("Computing a total of {} signature(s) for each input.", num_sigs) if num_sigs == 0: - error('...nothing to calculate!? Exiting!') + error("...nothing to calculate!? 
Exiting!") sys.exit(-1) if args.merge and not args.output: @@ -120,32 +120,35 @@ def compute(args): sys.exit(-1) if args.track_abundance: - notify('Tracking abundance of input k-mers.') + notify("Tracking abundance of input k-mers.") signatures_factory = _signatures_for_compute_factory(args) - if args.merge: # single name specified - combine all + if args.merge: # single name specified - combine all _compute_merged(args, signatures_factory) - else: # compute individual signatures + else: # compute individual signatures _compute_individual(args, signatures_factory) class _signatures_for_compute_factory: "Build signatures on demand, based on args input to 'compute'." + def __init__(self, args): self.args = args def __call__(self): args = self.args - params = ComputeParameters(ksizes=args.ksizes, - seed=args.seed, - protein=args.protein, - dayhoff=args.dayhoff, - hp=args.hp, - dna=args.dna, - num_hashes=args.num_hashes, - track_abundance=args.track_abundance, - scaled=args.scaled) + params = ComputeParameters( + ksizes=args.ksizes, + seed=args.seed, + protein=args.protein, + dayhoff=args.dayhoff, + hp=args.hp, + dna=args.dna, + num_hashes=args.num_hashes, + track_abundance=args.track_abundance, + scaled=args.scaled, + ) sig = SourmashSignature.from_params(params) return [sig] @@ -167,14 +170,14 @@ def _compute_individual(args, signatures_factory): for filename in args.filenames: if open_output_each_time: # for each input file, construct output filename - sigfile = os.path.basename(filename) + '.sig' + sigfile = os.path.basename(filename) + ".sig" if args.output_dir: sigfile = os.path.join(args.output_dir, sigfile) # does it already exist? skip if so. if os.path.exists(sigfile) and not args.force: - notify('skipping {} - already done', filename) - continue # go on to next file. + notify("skipping {} - already done", filename) + continue # go on to next file. # nope? ok, let's save to it. assert not save_sigs @@ -204,8 +207,12 @@ def _compute_individual(args, signatures_factory): for n, record in enumerate(screed_iter): sigs = signatures_factory() try: - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) @@ -214,50 +221,63 @@ def _compute_individual(args, signatures_factory): set_sig_name(sigs, filename, name=record.name) save_sigs_to_location(sigs, save_sigs) - notify('calculated {} signatures for {} sequences in {}', - len(save_sigs), n + 1, filename) + notify( + "calculated {} signatures for {} sequences in {}", + len(save_sigs), + n + 1, + filename, + ) # nope; make a single sig for the whole file else: sigs = signatures_factory() # consume & calculate signatures - notify(f'... reading sequences from {filename}') + notify(f"... 
reading sequences from {filename}") name = None for n, record in enumerate(screed_iter): if n % 10000 == 0: if n: - notify('\r...{} {}', filename, n, end='') + notify("\r...{} {}", filename, n, end="") elif args.name_from_first: name = record.name try: - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) sys.exit(-1) - notify('...{} {} sequences', filename, n, end='') + notify("...{} {} sequences", filename, n, end="") set_sig_name(sigs, filename, name) save_sigs_to_location(sigs, save_sigs) - notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}') + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) # if not args.output, close output for every input filename. if open_output_each_time: save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) save_sigs = None - # if --output-dir specified, all collected signatures => args.output, # and we need to close here. if args.output and save_sigs is not None: save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) def _compute_merged(args, signatures_factory): @@ -267,26 +287,30 @@ def _compute_merged(args, signatures_factory): total_seq = 0 for filename in args.filenames: # consume & calculate signatures - notify('... reading sequences from {}', filename) + notify("... reading sequences from {}", filename) n = None with screed.open(filename) as f: for n, record in enumerate(f): if n % 10000 == 0 and n: - notify('\r... {} {}', filename, n, end='') + notify("\r... {} {}", filename, n, end="") - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, record.sequence, args.input_is_protein, args.check_sequence + ) if n is not None: - notify('... {} {} sequences', filename, n + 1) + notify("... {} {} sequences", filename, n + 1) total_seq += n + 1 else: notify(f"no sequences found in '{filename}'?!") if total_seq: set_sig_name(sigs, filename, name=args.merge) - notify('calculated 1 signature for {} sequences taken from {} files', - total_seq, len(args.filenames)) + notify( + "calculated 1 signature for {} sequences taken from {} files", + total_seq, + len(args.filenames), + ) # at end, save! save_siglist(sigs, args.output) @@ -301,8 +325,8 @@ def add_seq(sigs, seq, input_is_protein, check_sequence): def set_sig_name(sigs, filename, name=None): - if filename == '-': # if stdin, set filename to empty. - filename = '' + if filename == "-": # if stdin, set filename to empty. 
+ filename = "" for sig in sigs: if name is not None: sig._name = name @@ -332,17 +356,19 @@ def save_sigs_to_location(siglist, save_sig): class ComputeParameters(RustObject): __dealloc_func__ = lib.computeparams_free - def __init__(self, - *, - ksizes=(21, 31, 51), - seed=42, - protein=False, - dayhoff=False, - hp=False, - dna=True, - num_hashes=500, - track_abundance=False, - scaled=0): + def __init__( + self, + *, + ksizes=(21, 31, 51), + seed=42, + protein=False, + dayhoff=False, + hp=False, + dna=True, + num_hashes=500, + track_abundance=False, + scaled=0, + ): self._objptr = lib.computeparams_new() self.seed = seed @@ -359,31 +385,33 @@ def __init__(self, def from_manifest_row(cls, row): "convert a CollectionManifest row into a ComputeParameters object" is_dna = is_protein = is_dayhoff = is_hp = False - if row['moltype'] == 'DNA': + if row["moltype"] == "DNA": is_dna = True - elif row['moltype'] == 'protein': + elif row["moltype"] == "protein": is_protein = True - elif row['moltype'] == 'hp': + elif row["moltype"] == "hp": is_hp = True - elif row['moltype'] == 'dayhoff': + elif row["moltype"] == "dayhoff": is_dayhoff = True else: assert 0 if is_dna: - ksize = row['ksize'] + ksize = row["ksize"] else: - ksize = row['ksize'] * 3 - - p = cls(ksizes=[ksize], - seed=DEFAULT_MMHASH_SEED, - protein=is_protein, - dayhoff=is_dayhoff, - hp=is_hp, - dna=is_dna, - num_hashes=row['num'], - track_abundance=row['with_abundance'], - scaled=row['scaled']) + ksize = row["ksize"] * 3 + + p = cls( + ksizes=[ksize], + seed=DEFAULT_MMHASH_SEED, + protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + dna=is_dna, + num_hashes=row["num"], + track_abundance=row["with_abundance"], + scaled=row["scaled"], + ) return p @@ -400,7 +428,7 @@ def to_param_str(self): elif self.dayhoff: pi.append("dayhoff") else: - assert 0 # must be one of the previous + assert 0 # must be one of the previous if self.dna: kstr = [f"k={k}" for k in self.ksizes] @@ -431,15 +459,17 @@ def __repr__(self): return f"ComputeParameters(ksizes={self.ksizes}, seed={self.seed}, protein={self.protein}, dayhoff={self.dayhoff}, hp={self.hp}, dna={self.dna}, num_hashes={self.num_hashes}, track_abundance={self.track_abundance}, scaled={self.scaled})" def __eq__(self, other): - return (self.ksizes == other.ksizes and - self.seed == other.seed and - self.protein == other.protein and - self.dayhoff == other.dayhoff and - self.hp == other.hp and - self.dna == other.dna and - self.num_hashes == other.num_hashes and - self.track_abundance == other.track_abundance and - self.scaled == other.scaled) + return ( + self.ksizes == other.ksizes + and self.seed == other.seed + and self.protein == other.protein + and self.dayhoff == other.dayhoff + and self.hp == other.hp + and self.dna == other.dna + and self.num_hashes == other.num_hashes + and self.track_abundance == other.track_abundance + and self.scaled == other.scaled + ) @staticmethod def from_args(args): @@ -509,11 +539,16 @@ def dna(self, v): @property def moltype(self): - if self.dna: moltype = 'DNA' - elif self.protein: moltype = 'protein' - elif self.hp: moltype = 'hp' - elif self.dayhoff: moltype = 'dayhoff' - else: assert 0 + if self.dna: + moltype = "DNA" + elif self.protein: + moltype = "protein" + elif self.hp: + moltype = "hp" + elif self.dayhoff: + moltype = "dayhoff" + else: + assert 0 return moltype diff --git a/src/sourmash/command_sketch.py b/src/sourmash/command_sketch.py index f79e3a5fc8..508cac7c01 100644 --- a/src/sourmash/command_sketch.py +++ b/src/sourmash/command_sketch.py @@ 
-12,19 +12,24 @@ import sourmash from .signature import SourmashSignature from .logging import notify, error, set_quiet, print_results -from .command_compute import (_compute_individual, _compute_merged, - ComputeParameters, add_seq, set_sig_name, - DEFAULT_MMHASH_SEED) +from .command_compute import ( + _compute_individual, + _compute_merged, + ComputeParameters, + add_seq, + set_sig_name, + DEFAULT_MMHASH_SEED, +) from sourmash import sourmash_args from sourmash.sourmash_args import check_scaled_bounds, check_num_bounds from sourmash.sig.__main__ import _summarize_manifest, _SketchInfo from sourmash.manifest import CollectionManifest DEFAULTS = dict( - dna='k=31,scaled=1000,noabund', - protein='k=10,scaled=200,noabund', - dayhoff='k=16,scaled=200,noabund', - hp='k=42,scaled=200,noabund' + dna="k=31,scaled=1000,noabund", + protein="k=10,scaled=200,noabund", + dayhoff="k=16,scaled=200,noabund", + hp="k=42,scaled=200,noabund", ) @@ -32,21 +37,21 @@ def _parse_params_str(params_str): "Parse a parameter string of the form 'k=ks,num=num,scaled=scaled,abund'." moltype = None params = {} - params['ksize'] = [] - items = params_str.split(',') + params["ksize"] = [] + items = params_str.split(",") for item in items: - if item == 'abund': - params['track_abundance'] = True - elif item == 'noabund': - params['track_abundance'] = False - elif item.startswith('k'): - if len(item) < 3 or item[1] != '=': + if item == "abund": + params["track_abundance"] = True + elif item == "noabund": + params["track_abundance"] = False + elif item.startswith("k"): + if len(item) < 3 or item[1] != "=": raise ValueError("k takes a parameter, e.g. 'k=31'") - params['ksize'].append(int(item[2:])) - elif item.startswith('num'): - if len(item) < 5 or item[3] != '=': + params["ksize"].append(int(item[2:])) + elif item.startswith("num"): + if len(item) < 5 or item[3] != "=": raise ValueError("num takes a parameter, e.g. 'num=500'") - if params.get('scaled'): + if params.get("scaled"): raise ValueError("cannot set both num and scaled in a single minhash") try: num = item[4:] @@ -56,12 +61,12 @@ def _parse_params_str(params_str): num = check_num_bounds(num) - params['num'] = int(item[4:]) - params['scaled'] = 0 - elif item.startswith('scaled'): - if len(item) < 8 or item[6] != '=': + params["num"] = int(item[4:]) + params["scaled"] = 0 + elif item.startswith("scaled"): + if len(item) < 8 or item[6] != "=": raise ValueError("scaled takes a parameter, e.g. 'scaled=1000'") - if params.get('num'): + if params.get("num"): raise ValueError("cannot set both num and scaled in a single minhash") try: scaled = item[7:] @@ -71,13 +76,13 @@ def _parse_params_str(params_str): scaled = check_scaled_bounds(scaled) - params['scaled'] = scaled - params['num'] = 0 - elif item.startswith('seed'): - if len(item) < 6 or item[4] != '=': + params["scaled"] = scaled + params["num"] = 0 + elif item.startswith("seed"): + if len(item) < 6 or item[4] != "=": raise ValueError("seed takes a parameter, e.g. 'seed=42'") - params['seed'] = int(item[5:]) - elif item in ('protein', 'dayhoff', 'hp', 'dna'): + params["seed"] = int(item[5:]) + elif item in ("protein", "dayhoff", "hp", "dna"): moltype = item else: raise ValueError(f"unknown component '{item}' in params string") @@ -87,12 +92,13 @@ def _parse_params_str(params_str): class _signatures_for_sketch_factory: "Build sigs on demand, based on args input to 'sketch'." 
+ def __init__(self, params_str_list, default_moltype): # first, set up defaults per-moltype defaults = {} for moltype, pstr in DEFAULTS.items(): mt, d = _parse_params_str(pstr) - assert mt is None # defaults cannot have moltype set! + assert mt is None # defaults cannot have moltype set! defaults[moltype] = d self.defaults = defaults @@ -105,19 +111,27 @@ def __init__(self, params_str_list, default_moltype): # provided. for params_str in params_str_list: moltype, params = _parse_params_str(params_str) - if moltype and moltype != 'dna' and default_moltype == 'dna': - raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?") - elif moltype == 'dna' and default_moltype and default_moltype != 'dna': - raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'") + if moltype and moltype != "dna" and default_moltype == "dna": + raise ValueError( + f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?" + ) + elif moltype == "dna" and default_moltype and default_moltype != "dna": + raise ValueError( + f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'" + ) elif moltype is None: if default_moltype is None: - raise ValueError(f"No default moltype and none specified in param string") + raise ValueError( + "No default moltype and none specified in param string" + ) moltype = default_moltype self.params_list.append((moltype, params)) else: if default_moltype is None: - raise ValueError(f"No default moltype and none specified in param string") + raise ValueError( + "No default moltype and none specified in param string" + ) # no params str? default to a single sig, using default_moltype. self.params_list.append((default_moltype, {})) @@ -125,38 +139,37 @@ def get_compute_params(self, *, split_ksizes=False): for moltype, params_d in self.params_list: # get defaults for this moltype from self.defaults: default_params = self.defaults[moltype] - def_seed = default_params.get('seed', DEFAULT_MMHASH_SEED) - def_num = default_params.get('num', 0) - def_abund = default_params['track_abundance'] - def_scaled = default_params.get('scaled', 0) - def_dna = default_params.get('is_dna', moltype == 'dna') - def_protein = default_params.get('is_protein', - moltype == 'protein') - def_dayhoff = default_params.get('is_dayhoff', - moltype == 'dayhoff') - def_hp = default_params.get('is_hp', moltype == 'hp') + def_seed = default_params.get("seed", DEFAULT_MMHASH_SEED) + def_num = default_params.get("num", 0) + def_abund = default_params["track_abundance"] + def_scaled = default_params.get("scaled", 0) + def_dna = default_params.get("is_dna", moltype == "dna") + def_protein = default_params.get("is_protein", moltype == "protein") + def_dayhoff = default_params.get("is_dayhoff", moltype == "dayhoff") + def_hp = default_params.get("is_hp", moltype == "hp") # handle ksize specially, for now - multiply by three? - def_ksizes = default_params['ksize'] - ksizes = params_d.get('ksize') + def_ksizes = default_params["ksize"] + ksizes = params_d.get("ksize") if not ksizes: ksizes = def_ksizes # 'command sketch' adjusts k-mer sizes by 3 if non-DNA sketch. 
if self.mult_ksize_by_3 and not def_dna: - ksizes = [ k*3 for k in ksizes ] - - make_param = lambda ksizes: ComputeParameters( - ksizes=ksizes, - seed=params_d.get('seed', def_seed), - protein=def_protein, - dayhoff=def_dayhoff, - hp=def_hp, - dna=def_dna, - num_hashes=params_d.get('num', def_num), - track_abundance=params_d.get('track_abundance', - def_abund), - scaled=params_d.get('scaled', def_scaled)) + ksizes = [k * 3 for k in ksizes] + + def make_param(ksizes): + return ComputeParameters( + ksizes=ksizes, + seed=params_d.get("seed", def_seed), + protein=def_protein, + dayhoff=def_dayhoff, + hp=def_hp, + dna=def_dna, + num_hashes=params_d.get("num", def_num), + track_abundance=params_d.get("track_abundance", def_abund), + scaled=params_d.get("scaled", def_scaled), + ) if split_ksizes: for ksize in ksizes: @@ -179,6 +192,7 @@ def __call__(self, *, split_ksizes=False): def _add_from_file_to_filenames(args): "Add filenames from --from-file to args.filenames" from .sourmash_args import load_pathlist_from_file + if args.from_file: file_list = load_pathlist_from_file(args.from_file) args.filenames.extend(file_list) @@ -189,11 +203,11 @@ def _execute_sketch(args, signatures_factory): set_quiet(args.quiet) if not args.filenames: - error('error: no input filenames provided! nothing to do - exiting.') + error("error: no input filenames provided! nothing to do - exiting.") sys.exit(-1) - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) notify(f'computing signatures for files: {", ".join(args.filenames)}') @@ -208,15 +222,15 @@ def _execute_sketch(args, signatures_factory): # get number of output sigs: num_sigs = len(signatures_factory.params_list) - notify(f'Computing a total of {num_sigs} signature(s) for each input.') + notify(f"Computing a total of {num_sigs} signature(s) for each input.") if num_sigs == 0: - error('...nothing to calculate!? Exiting!') + error("...nothing to calculate!? 
Exiting!") sys.exit(-1) - if args.merge: # single name specified - combine all + if args.merge: # single name specified - combine all _compute_merged(args, signatures_factory) - else: # compute individual signatures + else: # compute individual signatures _compute_individual(args, signatures_factory) @@ -229,8 +243,7 @@ def dna(args): args.input_is_protein = False try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - 'dna') + signatures_factory = _signatures_for_sketch_factory(args.param_string, "dna") except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -252,15 +265,14 @@ def protein(args): if args.dayhoff and args.hp: raise ValueError("cannot set both --dayhoff and --hp") if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" elif args.hp: - moltype = 'hp' + moltype = "hp" else: - moltype = 'protein' + moltype = "protein" try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - moltype) + signatures_factory = _signatures_for_sketch_factory(args.param_string, moltype) except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -281,15 +293,14 @@ def translate(args): if args.dayhoff and args.hp: raise ValueError("cannot set both --dayhoff and --hp") if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" elif args.hp: - moltype = 'hp' + moltype = "hp" else: - moltype = 'protein' + moltype = "protein" try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - moltype) + signatures_factory = _signatures_for_sketch_factory(args.param_string, moltype) except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -317,47 +328,51 @@ def _compute_sigs(to_build, output, *, check_sequence=False): is_dna = param_objs[0].dna for p in param_objs: - if p.dna: assert is_dna + if p.dna: + assert is_dna sig = SourmashSignature.from_params(p) sigs.append(sig) input_is_protein = not is_dna # read sequence records & sketch - notify(f'... reading sequences from {filename}') + notify(f"... reading sequences from {filename}") for n, record in enumerate(screed_iter): if n % 10000 == 0: if n: - notify('\r...{} {}', filename, n, end='') + notify("\r...{} {}", filename, n, end="") try: - add_seq(sigs, record.sequence, input_is_protein, - check_sequence) + add_seq(sigs, record.sequence, input_is_protein, check_sequence) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) sys.exit(-1) - notify('...{} {} sequences', filename, n, end='') + notify("...{} {} sequences", filename, n, end="") set_sig_name(sigs, filename, name) for sig in sigs: save_sigs.add(sig) - notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}') - + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." 
+ ) def _output_csv_info(filename, sigs_to_build): "output information about what signatures to build, in CSV format" output_n = 0 with sourmash_args.FileOutputCSV(filename) as csv_fp: - w = csv.DictWriter(csv_fp, fieldnames=['filename', 'sketchtype', - 'output_index', 'name', - 'param_strs']) + w = csv.DictWriter( + csv_fp, + fieldnames=["filename", "sketchtype", "output_index", "name", "param_strs"], + ) w.writeheader() output_n = 0 @@ -366,18 +381,22 @@ def _output_csv_info(filename, sigs_to_build): # should all be the same! if param_objs[0].dna: - assert all( ( p.dna for p in param_objs ) ) + assert all(p.dna for p in param_objs) sketchtype = "dna" else: - assert not any( ( p.dna for p in param_objs ) ) + assert not any(p.dna for p in param_objs) sketchtype = "protein" for p in param_objs: param_strs.append(p.to_param_str()) - row = dict(filename=filename, sketchtype=sketchtype, - param_strs="-p " + " -p ".join(param_strs), - name=name, output_index=output_n) + row = dict( + filename=filename, + sketchtype=sketchtype, + param_strs="-p " + " -p ".join(param_strs), + name=name, + output_index=output_n, + ) w.writerow(row) @@ -385,15 +404,19 @@ def _output_csv_info(filename, sigs_to_build): def fromfile(args): - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) if args.output_signatures and os.path.exists(args.output_signatures): if not args.force_output_already_exists: - error(f"** ERROR: output location '{args.output_signatures}' already exists!") - error(f"** Not overwriting/appending.") - error(f"** Use --force-output-already-exists if you want to overwrite/append.") + error( + f"** ERROR: output location '{args.output_signatures}' already exists!" + ) + error("** Not overwriting/appending.") + error( + "** Use --force-output-already-exists if you want to overwrite/append." + ) sys.exit(-1) # now, create the set of desired sketch specs. @@ -429,13 +452,13 @@ def fromfile(args): for csvfile in args.csvs: with sourmash_args.FileInputCSV(csvfile) as r: for row in r: - name = row['name'] + name = row["name"] if not name: n_missing_name += 1 continue - genome = row['genome_filename'] - proteome = row['protein_filename'] + genome = row["genome_filename"] + proteome = row["protein_filename"] total_rows += 1 if name in all_names: @@ -447,8 +470,10 @@ def fromfile(args): fail_exit = False if n_duplicate_name: if args.report_duplicated: - notify("duplicated:\n" + '\n'.join(sorted(duplicate_names))) - error(f"** ERROR: {n_duplicate_name} entries have duplicate 'name' records. Exiting!") + notify("duplicated:\n" + "\n".join(sorted(duplicate_names))) + error( + f"** ERROR: {n_duplicate_name} entries have duplicate 'name' records. Exiting!" 
+ ) fail_exit = True if n_missing_name: @@ -470,7 +495,7 @@ def fromfile(args): # for each manifest row, for row in manifest.rows: - name = row['name'] + name = row["name"] if name: # build a ComputeParameters object for later comparison p = ComputeParameters.from_manifest_row(row) @@ -505,7 +530,7 @@ def fromfile(args): if p not in plist: # nope - figure out genome/proteome needed filename = genome if p.dna else proteome - filetype = 'genome' if p.dna else 'proteome' + filetype = "genome" if p.dna else "proteome" if filename: # add to build list @@ -524,77 +549,91 @@ def fromfile(args): if already_done_manifest: info_d = _summarize_manifest(already_done_manifest) - print_results('---') + print_results("---") print_results("summary of already-done sketches:") - for ski in info_d['sketch_info']: - mh_type = f"num={ski['num']}" if ski['num'] else f"scaled={ski['scaled']}" - mh_abund = ", abund" if ski['abund'] else "" + for ski in info_d["sketch_info"]: + mh_type = f"num={ski['num']}" if ski["num"] else f"scaled={ski['scaled']}" + mh_abund = ", abund" if ski["abund"] else "" sketch_str = f"{ski['count']} sketches with {ski['moltype']}, k={ski['ksize']}, {mh_type}{mh_abund}" print_results(f" {sketch_str: <50} {ski['n_hashes']} total hashes") - print_results('---') + print_results("---") if args.output_manifest_matching: already_done_manifest.write_to_filename(args.output_manifest_matching) - notify(f"output {len(already_done_manifest)} already-done signatures to '{args.output_manifest_matching}' in manifest format.") + notify( + f"output {len(already_done_manifest)} already-done signatures to '{args.output_manifest_matching}' in manifest format." + ) if missing: error("** ERROR: we cannot build some of the requested signatures.") - error(f"** {missing_count} total signatures (for {len(missing)} names) cannot be built.") + error( + f"** {missing_count} total signatures (for {len(missing)} names) cannot be built." + ) if args.ignore_missing: error("** (continuing past this error because --ignore-missing was set)") else: sys.exit(-1) - notify(f"** {total_sigs - skipped_sigs} new signatures to build from {len(to_build)} files;") + notify( + f"** {total_sigs - skipped_sigs} new signatures to build from {len(to_build)} files;" + ) if not to_build: - notify(f"** Nothing to build. Exiting!") + notify("** Nothing to build. 
Exiting!") sys.exit(0) if skipped_sigs: notify(f"** {skipped_sigs} already exist, so skipping those.") else: - notify(f"** we found no pre-existing signatures that match.") + notify("** we found no pre-existing signatures that match.") ## first, print out a summary of to_build: - print_results('---') + print_results("---") print_results("summary of sketches to build:") counter = Counter() - build_info_d = {} for filename, param_objs in to_build.items(): for p in param_objs: - moltype = p.moltype assert len(p.ksizes) == 1 ksize = p.ksizes[0] - if not p.dna: ksize //= 3 - - ski = _SketchInfo(ksize=ksize, moltype=p.moltype, - scaled=p.scaled, num=p.num_hashes, - abund=p.track_abundance) + if not p.dna: + ksize //= 3 + + ski = _SketchInfo( + ksize=ksize, + moltype=p.moltype, + scaled=p.scaled, + num=p.num_hashes, + abund=p.track_abundance, + ) counter[ski] += 1 for ski, count in counter.items(): mh_type = f"num={ski.num}" if ski.num else f"scaled={ski.scaled}" mh_abund = ", abund" if ski.abund else "" - sketch_str = f"{count} sketches with {ski.moltype}, k={ski.ksize}, {mh_type}{mh_abund}" + sketch_str = ( + f"{count} sketches with {ski.moltype}, k={ski.ksize}, {mh_type}{mh_abund}" + ) print_results(f" {sketch_str: <50}") - print_results('---') + print_results("---") ## now, onward ho - do we build anything, or output stuff, or just exit? - if args.output_signatures: # actually compute - _compute_sigs(to_build, args.output_signatures, - check_sequence=args.check_sequence) + if args.output_signatures: # actually compute + _compute_sigs( + to_build, args.output_signatures, check_sequence=args.check_sequence + ) - if args.output_csv_info: # output info necessary to construct + if args.output_csv_info: # output info necessary to construct _output_csv_info(args.output_csv_info, to_build) - notify(f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}") + notify( + f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}" + ) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 7de69c5621..e2d1a09a50 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -9,20 +9,24 @@ import io import screed -from .compare import (compare_all_pairs, compare_serial_containment, - compare_serial_max_containment, compare_serial_avg_containment) +from .compare import ( + compare_all_pairs, + compare_serial_containment, + compare_serial_max_containment, + compare_serial_avg_containment, +) from . import MinHash from .sbtmh import load_sbt_index, create_sbt_index from . import signature as sig from . 
import sourmash_args from .logging import notify, error, print_results, set_quiet -from .sourmash_args import (FileOutput, FileOutputCSV, - SaveSignaturesToLocation) +from .sourmash_args import FileOutput, FileOutputCSV, SaveSignaturesToLocation from .search import prefetch_database, PrefetchResult from .index import LazyLinearIndex WATERMARK_SIZE = 10000 + def _get_screen_width(): # default fallback is 80x24 (col, rows) = shutil.get_terminal_size() @@ -52,17 +56,21 @@ def compare(args): moltypes = set() size_may_be_inaccurate = False for filename in inp_files: - notify(f"loading '{filename}'", end='\r') - loaded = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - picklist=picklist, - yield_all_files=args.force, - progress=progress, - pattern=pattern_search) + notify(f"loading '{filename}'", end="\r") + loaded = sourmash_args.load_file_as_signatures( + filename, + ksize=args.ksize, + select_moltype=moltype, + picklist=picklist, + yield_all_files=args.force, + progress=progress, + pattern=pattern_search, + ) loaded = list(loaded) if not loaded: - notify(f'\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}') + notify( + f"\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}" + ) siglist.extend(loaded) # track ksizes/moltypes @@ -75,22 +83,22 @@ def compare(args): break if not siglist: - error('no signatures found! exiting.') + error("no signatures found! exiting.") sys.exit(-1) # check ksizes and type if len(ksizes) > 1: - error('multiple k-mer sizes loaded; please specify one with -k.') + error("multiple k-mer sizes loaded; please specify one with -k.") ksizes = sorted(ksizes) - error('(saw k-mer sizes {})'.format(', '.join(map(str, ksizes)))) + error("(saw k-mer sizes {})".format(", ".join(map(str, ksizes)))) sys.exit(-1) if len(moltypes) > 1: - error('multiple molecule types loaded; please specify --dna, --protein') + error("multiple molecule types loaded; please specify --dna, --protein") sys.exit(-1) - notify(' '*79, end='\r') - notify(f'loaded {format(len(siglist))} signatures total.') + notify(" " * 79, end="\r") + notify(f"loaded {format(len(siglist))} signatures total.") if picklist: sourmash_args.report_picklist(args, picklist) @@ -103,21 +111,27 @@ def compare(args): # complain if it's not all one or the other if is_scaled != is_scaled_2: - error('ERROR: cannot mix scaled signatures with num signatures') + error("ERROR: cannot mix scaled signatures with num signatures") sys.exit(-1) is_containment = False if args.containment or args.max_containment or args.avg_containment: is_containment = True - containment_args = [args.containment, args.max_containment, args.avg_containment] + containment_args = [ + args.containment, + args.max_containment, + args.avg_containment, + ] if sum(containment_args) > 1: notify("ERROR: cannot specify more than one containment argument!") sys.exit(-1) # complain if --containment and not is_scaled if is_containment and not is_scaled: - error('must use scaled signatures with --containment, --max-containment, and --avg-containment') + error( + "must use scaled signatures with --containment, --max-containment, and --avg-containment" + ) sys.exit(-1) # complain if --ani and not is_scaled @@ -126,14 +140,16 @@ def compare(args): return_ani = True if return_ani and not is_scaled: - error('must use scaled signatures with --estimate-ani') + error("must use scaled signatures with --estimate-ani") sys.exit(-1) # notify about implicit 
--ignore-abundance: if is_containment or return_ani: - track_abundances = any(( s.minhash.track_abundance for s in siglist )) + track_abundances = any(s.minhash.track_abundance for s in siglist) if track_abundances: - notify('NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.') + notify( + "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances." + ) # if using scaled sketches or --scaled, downsample to common max scaled. printed_scaled_msg = False @@ -144,7 +160,9 @@ def compare(args): max_scaled = max(max_scaled, args.scaled) if max_scaled > args.scaled: - notify(f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}") + notify( + f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}" + ) notify(f"WARNING: continuing with scaled value of {max_scaled}.") new_siglist = [] @@ -153,7 +171,9 @@ def compare(args): size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: if not printed_scaled_msg: - notify(f'NOTE: downsampling to scaled value of {format(max_scaled)}') + notify( + f"NOTE: downsampling to scaled value of {format(max_scaled)}" + ) printed_scaled_msg = True with s.update() as s: s.minhash = s.minhash.downsample(scaled=max_scaled) @@ -166,10 +186,10 @@ def compare(args): sys.exit(-1) if len(siglist) == 0: - error('no signatures!') + error("no signatures!") sys.exit(-1) - notify('') + notify("") # build the distance matrix numpy.set_printoptions(precision=3, suppress=True) @@ -184,8 +204,9 @@ def compare(args): elif args.avg_containment: similarity = compare_serial_avg_containment(siglist, return_ani=return_ani) else: - similarity = compare_all_pairs(siglist, args.ignore_abundance, - n_jobs=args.processes, return_ani=return_ani) + similarity = compare_all_pairs( + siglist, args.ignore_abundance, n_jobs=args.processes, return_ani=return_ani + ) # if distance matrix desired, switch to 1-similarity if args.distance_matrix: @@ -196,25 +217,33 @@ def compare(args): if len(siglist) < 30: for i, ss in enumerate(siglist): # for small matrices, pretty-print some output - name_num = '{}-{}'.format(i, str(ss)) + name_num = f"{i}-{str(ss)}" if len(name_num) > 20: - name_num = name_num[:17] + '...' - print_results('{:20s}\t{}'.format(name_num, matrix[i, :, ],)) + name_num = name_num[:17] + "..." + print_results( + "{:20s}\t{}".format( + name_num, + matrix[ + i, + :, + ], + ) + ) if args.distance_matrix: - print_results('max distance in matrix: {:.3f}', numpy.max(matrix)) + print_results("max distance in matrix: {:.3f}", numpy.max(matrix)) else: - print_results('min similarity in matrix: {:.3f}', numpy.min(matrix)) + print_results("min similarity in matrix: {:.3f}", numpy.min(matrix)) # shall we output a matrix to stdout? if args.output: - labeloutname = args.output + '.labels.txt' - notify(f'saving labels to: {labeloutname}') - with open(labeloutname, 'w') as fp: + labeloutname = args.output + ".labels.txt" + notify(f"saving labels to: {labeloutname}") + with open(labeloutname, "w") as fp: fp.write("\n".join(labeltext)) - notify(f'saving comparison matrix to: {args.output}') - with open(args.output, 'wb') as fp: + notify(f"saving comparison matrix to: {args.output}") + with open(args.output, "wb") as fp: numpy.save(fp, matrix) # output CSV? 
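# A minimal sketch of reading these two outputs back; this is not part of
# the patch, and "cmp" is a hypothetical --output value. numpy.load()
# reverses the numpy.save() call above, and the ".labels.txt" companion
# file holds one signature name per line:
import numpy

with open("cmp", "rb") as fp:
    matrix = numpy.load(fp)
with open("cmp" + ".labels.txt") as fp:
    labeltext = [x.strip() for x in fp]
assert matrix.shape[0] == len(labeltext)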
@@ -231,15 +260,20 @@ def compare(args): if size_may_be_inaccurate: if args.distance_matrix: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons." + ) else: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons." + ) def plot(args): "Produce a clustering matrix and plot." import matplotlib as mpl - mpl.use('Agg') + + mpl.use("Agg") import numpy import pylab import scipy.cluster.hierarchy as sch @@ -248,16 +282,16 @@ def plot(args): # load files D_filename = args.distances - notify(f'loading comparison matrix from {D_filename}...') - with open(D_filename, 'rb') as f: + notify(f"loading comparison matrix from {D_filename}...") + with open(D_filename, "rb") as f: D = numpy.load(f) # not sure how to change this to use f-strings - notify('...got {} x {} matrix.', *D.shape) + notify("...got {} x {} matrix.", *D.shape) # see sourmash#2790 for details :) if args.labeltext or args.labels: display_labels = True - args.labels = True # override => labels always true + args.labels = True # override => labels always true elif args.labels is None and not args.indices: # default to labels args.labels = True @@ -273,14 +307,14 @@ def plot(args): if args.labeltext: labelfilename = args.labeltext else: - labelfilename = D_filename + '.labels.txt' + labelfilename = D_filename + ".labels.txt" - notify(f'loading labels from {labelfilename}') + notify(f"loading labels from {labelfilename}") with open(labelfilename) as f: - labeltext = [ x.strip() for x in f ] - + labeltext = [x.strip() for x in f] + if len(labeltext) != D.shape[0]: - error('{} labels != matrix size, exiting', len(labeltext)) + error("{} labels != matrix size, exiting", len(labeltext)) sys.exit(-1) elif args.indices: # construct integer labels @@ -290,14 +324,14 @@ def plot(args): labeltext = [""] * D.shape[0] if args.pdf: - ext = '.pdf' + ext = ".pdf" else: - ext = '.png' + ext = ".png" # build filenames, decide on PDF/PNG output - dendrogram_out = os.path.basename(D_filename) + '.dendro' + ext - matrix_out = os.path.basename(D_filename) + '.matrix' + ext - hist_out = os.path.basename(D_filename) + '.hist' + ext + dendrogram_out = os.path.basename(D_filename) + ".dendro" + ext + matrix_out = os.path.basename(D_filename) + ".matrix" + ext + hist_out = os.path.basename(D_filename) + ".hist" + ext # output to a different directory? 
if args.output_dir: @@ -308,13 +342,13 @@ def plot(args): hist_out = os.path.join(args.output_dir, hist_out) # make the histogram - notify(f'saving histogram of matrix values => {hist_out}') - fig = pylab.figure(figsize=(8,5)) + notify(f"saving histogram of matrix values => {hist_out}") + fig = pylab.figure(figsize=(8, 5)) pylab.hist(numpy.array(D.flat), bins=100) fig.savefig(hist_out) ### make the dendrogram: - fig = pylab.figure(figsize=(8,5)) + fig = pylab.figure(figsize=(8, 5)) ax1 = fig.add_axes([0.1, 0.1, 0.7, 0.8]) ax1.set_xticks([]) ax1.set_yticks([]) @@ -325,32 +359,36 @@ def plot(args): sample_idx = list(range(len(labeltext))) numpy.random.shuffle(sample_idx) - sample_idx = sample_idx[:args.subsample] + sample_idx = sample_idx[: args.subsample] np_idx = numpy.array(sample_idx) D = D[numpy.ix_(np_idx, np_idx)] - labeltext = [ labeltext[idx] for idx in sample_idx ] + labeltext = [labeltext[idx] for idx in sample_idx] ### do clustering - Y = sch.linkage(D, method='single') - sch.dendrogram(Y, orientation='right', labels=labeltext, - no_labels=not display_labels) + Y = sch.linkage(D, method="single") + sch.dendrogram( + Y, orientation="right", labels=labeltext, no_labels=not display_labels + ) fig.savefig(dendrogram_out) - notify(f'wrote dendrogram to: {dendrogram_out}') + notify(f"wrote dendrogram to: {dendrogram_out}") ### make the dendrogram+matrix: - (fig, rlabels, rmat) = sourmash_fig.plot_composite_matrix(D, labeltext, - show_labels=display_labels, - vmin=args.vmin, - vmax=args.vmax, - force=args.force) + (fig, rlabels, rmat) = sourmash_fig.plot_composite_matrix( + D, + labeltext, + show_labels=display_labels, + vmin=args.vmin, + vmax=args.vmax, + force=args.force, + ) fig.savefig(matrix_out) - notify(f'wrote numpy distance matrix to: {matrix_out}') + notify(f"wrote numpy distance matrix to: {matrix_out}") if len(labeltext) < 30: # for small matrices, print out sample numbering for FYI. for i, name in enumerate(labeltext): - print_results('{}\t{}', i, name) + print_results("{}\t{}", i, name) # write out re-ordered matrix and labels if args.csv: @@ -361,15 +399,15 @@ def plot(args): for i in range(len(rlabels)): y = [] for j in range(len(rlabels)): - y.append('{}'.format(rmat[i][j])) + y.append(f"{rmat[i][j]}") w.writerow(y) - notify(f'Wrote clustered matrix and labels out to {args.csv}') + notify(f"Wrote clustered matrix and labels out to {args.csv}") def import_csv(args): "Import a CSV file full of signatures/hashes." 
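    # A sketch of the row layout the loop below expects, with made-up hash
    # values (only hash function "murmur64" with seed 42 is accepted):
    #
    #     murmur64,42,31,example-genome,10926 38271 57284
    #
    # i.e. hash function, hash seed, ksize, name, space-separated hashes.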
- with open(args.mash_csvfile, newline='') as fp: + with open(args.mash_csvfile, newline="") as fp: reader = csv.reader(fp) siglist = [] for row in reader: @@ -377,29 +415,29 @@ def import_csv(args): hashseed = int(row[1]) # only support a limited import type, for now ;) - assert hashfn == 'murmur64' + assert hashfn == "murmur64" assert hashseed == 42 _, _, ksize, name, hashes = row ksize = int(ksize) hashes = hashes.strip() - hashes = list(map(int, hashes.split(' ' ))) + hashes = list(map(int, hashes.split(" "))) e = MinHash(len(hashes), ksize) e.add_many(hashes) s = sig.SourmashSignature(e, filename=name) siglist.append(s) - notify(f'loaded signature: {name} {s.md5sum()[:8]}') + notify(f"loaded signature: {name} {s.md5sum()[:8]}") - notify(f'saving {len(siglist)} signatures to JSON') + notify(f"saving {len(siglist)} signatures to JSON") with SaveSignaturesToLocation(args.output) as save_sig: save_sig.add_many(siglist) def sbt_combine(args): inp_files = list(args.sbts) - notify(f'combining {len(inp_files)} SBTs') + notify(f"combining {len(inp_files)} SBTs") tree = load_sbt_index(inp_files.pop(0)) @@ -426,11 +464,11 @@ def index(args): tree = create_sbt_index(args.bf_size, n_children=args.n_children) if args.sparseness < 0 or args.sparseness > 1.0: - error('sparseness must be in range [0.0, 1.0].') + error("sparseness must be in range [0.0, 1.0].") if args.scaled: args.scaled = int(args.scaled) - notify(f'downsampling signatures to scaled={args.scaled}') + notify(f"downsampling signatures to scaled={args.scaled}") inp_files = list(args.signatures) if args.from_file: @@ -441,7 +479,7 @@ def index(args): error("ERROR: no files to index!? Supply on command line or use --from-file") sys.exit(-1) - notify(f'loading {len(inp_files)} files into SBT') + notify(f"loading {len(inp_files)} files into SBT") progress = sourmash_args.SignatureLoadingProgress() @@ -451,12 +489,14 @@ def index(args): nums = set() scaleds = set() for f in inp_files: - siglist = sourmash_args.load_file_as_signatures(f, - ksize=args.ksize, - select_moltype=moltype, - yield_all_files=args.force, - picklist=picklist, - progress=progress) + siglist = sourmash_args.load_file_as_signatures( + f, + ksize=args.ksize, + select_moltype=moltype, + yield_all_files=args.force, + picklist=picklist, + progress=progress, + ) # load all matching signatures in this file ss = None @@ -481,26 +521,29 @@ def index(args): # check to make sure we aren't loading incompatible signatures if len(ksizes) > 1 or len(moltypes) > 1: - error('multiple k-mer sizes or molecule types present; fail.') - error('specify --dna/--protein and --ksize as necessary') - error('ksizes: {}; moltypes: {}', - ", ".join(map(str, ksizes)), ", ".join(moltypes)) + error("multiple k-mer sizes or molecule types present; fail.") + error("specify --dna/--protein and --ksize as necessary") + error( + "ksizes: {}; moltypes: {}", + ", ".join(map(str, ksizes)), + ", ".join(moltypes), + ) sys.exit(-1) - if nums == { 0 } and len(scaleds) == 1: - pass # good - elif scaleds == { 0 } and len(nums) == 1: - pass # also good + if nums == {0} and len(scaleds) == 1: + pass # good + elif scaleds == {0} and len(nums) == 1: + pass # also good else: - error('trying to build an SBT with incompatible signatures.') - error('nums = {}; scaleds = {}', repr(nums), repr(scaleds)) + error("trying to build an SBT with incompatible signatures.") + error("nums = {}; scaleds = {}", repr(nums), repr(scaleds)) sys.exit(-1) - notify('') + notify("") # did we load any!? 
if n == 0: - error('no signatures found to load into tree!? failing.') + error("no signatures found to load into tree!? failing.") sys.exit(-1) if picklist: @@ -513,8 +556,10 @@ def index(args): def search(args): - from .search import (search_databases_with_flat_query, - search_databases_with_abund_query) + from .search import ( + search_databases_with_flat_query, + search_databases_with_abund_query, + ) set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) @@ -522,18 +567,21 @@ def search(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # set up the query. - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) if args.scaled: if not query.minhash.scaled: - error('cannot downsample a signature not created with --scaled') + error("cannot downsample a signature not created with --scaled") sys.exit(-1) if args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) @@ -544,11 +592,14 @@ def search(args): notify("ERROR: cannot specify both --containment and --max-containment!") sys.exit(-1) - databases = sourmash_args.load_dbs_and_sigs(args.databases, query, - not is_containment, - picklist=picklist, - pattern=pattern_search, - fail_on_empty_database=args.fail_on_empty_database) + databases = sourmash_args.load_dbs_and_sigs( + args.databases, + query, + not is_containment, + picklist=picklist, + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database, + ) # handle signatures with abundance if query.minhash.track_abundance: @@ -559,7 +610,9 @@ def search(args): query.minhash = query.minhash.flatten() elif args.containment or args.max_containment: # abund sketch + keep abundance => no containment searches - notify("ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?") + notify( + "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" 
+ ) sys.exit(-1) else: # forcibly ignore abundances if query has no abundances @@ -568,32 +621,40 @@ def search(args): # do the actual search if query.minhash.track_abundance: try: - results = search_databases_with_abund_query(query, databases, - threshold=args.threshold, - do_containment=args.containment, - do_max_containment=args.max_containment, - best_only=args.best_only, - unload_data=True) + results = search_databases_with_abund_query( + query, + databases, + threshold=args.threshold, + do_containment=args.containment, + do_max_containment=args.max_containment, + best_only=args.best_only, + unload_data=True, + ) except TypeError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) else: - results = search_databases_with_flat_query(query, databases, - threshold=args.threshold, - do_containment=args.containment, - do_max_containment=args.max_containment, - best_only=args.best_only, - unload_data=True, - estimate_ani_ci=args.estimate_ani_ci) + results = search_databases_with_flat_query( + query, + databases, + threshold=args.threshold, + do_containment=args.containment, + do_max_containment=args.max_containment, + best_only=args.best_only, + unload_data=True, + estimate_ani_ci=args.estimate_ani_ci, + ) n_matches = len(results) if args.best_only: args.num_results = 1 if not args.num_results or n_matches <= args.num_results: - print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}:') + print_results(f"{len(results)} matches above threshold {args.threshold:0.3f}:") else: - print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:') + print_results( + f"{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:" + ) n_matches = args.num_results @@ -604,9 +665,9 @@ def search(args): print_results("similarity match") print_results("---------- -----") for sr in results[:n_matches]: - pct = '{:.1f}%'.format(sr.similarity*100) + pct = f"{sr.similarity * 100:.1f}%" name = sr.match._display_name(60) - print_results('{:>6} {}', pct, name) + print_results("{:>6} {}", pct, name) if sr.cmp_scaled is not None: if not size_may_be_inaccurate and sr.size_may_be_inaccurate: size_may_be_inaccurate = True @@ -637,9 +698,13 @@ def search(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." + ) if jaccard_ani_untrustworthy: - notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + notify( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ ) def categorize(args): @@ -653,7 +718,7 @@ def categorize(args): # eliminate names we've already categorized already_names = set() if args.load_csv: - with open(args.load_csv, newline='') as fp: + with open(args.load_csv, newline="") as fp: r = csv.reader(fp) for row in r: already_names.add(row[0]) @@ -668,13 +733,12 @@ def _yield_all_sigs(queries, ksize, moltype): for filename in queries: mi = MultiIndex.load_from_path(filename, False) mi = mi.select(ksize=ksize, moltype=moltype) - for ss, loc in mi.signatures_with_location(): - yield ss, loc + yield from mi.signatures_with_location() csv_w = None csv_fp = None if args.csv: - csv_fp = open(args.csv, 'w', newline='') + csv_fp = open(args.csv, "w", newline="") csv_w = csv.writer(csv_fp) search_obj = make_jaccard_search_query(threshold=args.threshold) @@ -683,7 +747,9 @@ def _yield_all_sigs(queries, ksize, moltype): if loc in already_names: continue - notify(f'loaded query: {str(orig_query)[:30]}... (k={orig_query.minhash.ksize}, {orig_query.minhash.moltype})') + notify( + f"loaded query: {str(orig_query)[:30]}... (k={orig_query.minhash.ksize}, {orig_query.minhash.moltype})" + ) if args.ignore_abundance and orig_query.minhash.track_abundance: query = orig_query.copy() @@ -691,7 +757,9 @@ def _yield_all_sigs(queries, ksize, moltype): query.minhash = query.minhash.flatten() else: if orig_query.minhash.track_abundance: - notify("ERROR: this search cannot be done on signatures calculated with abundance.") + notify( + "ERROR: this search cannot be done on signatures calculated with abundance." + ) notify("ERROR: please specify --ignore-abundance.") sys.exit(-1) @@ -700,19 +768,18 @@ def _yield_all_sigs(queries, ksize, moltype): results = [] for sr in db.find(search_obj, query): match = sr.signature - if match.md5sum() != query.md5sum(): # ignore self. + if match.md5sum() != query.md5sum(): # ignore self. results.append((orig_query.similarity(match), match)) if results: - results.sort(key=lambda x: -x[0]) # reverse sort on similarity + results.sort(key=lambda x: -x[0]) # reverse sort on similarity best_hit_sim, best_hit_query = results[0] - notify(f'for {query}, found: {best_hit_sim:.2f} {best_hit_query}') + notify(f"for {query}, found: {best_hit_sim:.2f} {best_hit_query}") best_hit_query_name = best_hit_query.name if csv_w: - csv_w.writerow([loc, query, best_hit_query_name, - best_hit_sim]) + csv_w.writerow([loc, query, best_hit_query_name, best_hit_sim]) else: - notify(f'for {query}, no match found') + notify(f"for {query}, no match found") if csv_fp: csv_fp.close() @@ -727,43 +794,49 @@ def gather(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # load the query signature & figure out all the things - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed right. 
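# (Aside: gather only works on FracMinHash sketches, so the check below
#  exits unless the query was built with --scaled; num-based MinHash
#  sketches cannot answer the containment-style queries gather performs.)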
if not query.minhash.scaled: - error('query signature needs to be created with --scaled') + error("query signature needs to be created with --scaled") sys.exit(-1) if args.scaled and args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) # empty? if not len(query.minhash): - error('no query hashes!? exiting.') + error("no query hashes!? exiting.") sys.exit(-1) # set up the search databases cache_size = args.cache_size if args.cache_size == 0: cache_size = None - databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False, - cache_size=cache_size, - picklist=picklist, - pattern=pattern_search, - fail_on_empty_database=args.fail_on_empty_database) - - - if args.linear: # force linear traversal? - databases = [ LazyLinearIndex(db) for db in databases ] + databases = sourmash_args.load_dbs_and_sigs( + args.databases, + query, + False, + cache_size=cache_size, + picklist=picklist, + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database, + ) + + if args.linear: # force linear traversal? + databases = [LazyLinearIndex(db) for db in databases] size_may_be_inaccurate = False - if args.prefetch: # note: on by default! + if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() if prefetch_query.minhash.track_abundance: @@ -800,14 +873,21 @@ def gather(args): ident_mh.add_many(union_found) noident_mh.remove_many(union_found) - # optionally calculate and output prefetch info to csv + # optionally calculate and output prefetch info to csv if prefetch_csvout_fp: for found_sig in counter.signatures(): # calculate intersection stats and info - prefetch_result = PrefetchResult(prefetch_query, found_sig, cmp_scaled=scaled, - threshold_bp=args.threshold_bp, estimate_ani_ci=args.estimate_ani_ci) + prefetch_result = PrefetchResult( + prefetch_query, + found_sig, + cmp_scaled=scaled, + threshold_bp=args.threshold_bp, + estimate_ani_ci=args.estimate_ani_ci, + ) if prefetch_csvout_w is None: - prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp) + prefetch_csvout_w = prefetch_result.init_dictwriter( + prefetch_csvout_fp + ) prefetch_result.write(prefetch_csvout_w) counters.append(counter) @@ -817,7 +897,9 @@ def gather(args): prefetch_csvout_fp.flush() display_bp = format_bp(args.threshold_bp) - notify(f"Prefetch found {len(save_prefetch)} signatures with overlap >= {display_bp}.") + notify( + f"Prefetch found {len(save_prefetch)} signatures with overlap >= {display_bp}." 
+ ) save_prefetch.close() if prefetch_csvout_fp: prefetch_csvout_fp.close() @@ -831,20 +913,22 @@ def gather(args): notify("Doing gather to generate minimum metagenome cover.") found = 0 - weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash if not orig_query_mh.size_is_accurate(): size_may_be_inaccurate = True - gather_iter = GatherDatabases(query, counters, - threshold_bp=args.threshold_bp, - ignore_abundance=args.ignore_abundance, - noident_mh=noident_mh, - ident_mh=ident_mh, - estimate_ani_ci=args.estimate_ani_ci) + gather_iter = GatherDatabases( + query, + counters, + threshold_bp=args.threshold_bp, + ignore_abundance=args.ignore_abundance, + noident_mh=noident_mh, + ident_mh=ident_mh, + estimate_ani_ci=args.estimate_ani_ci, + ) screen_width = _get_screen_width() - sum_f_uniq_found = 0. + sum_f_uniq_found = 0.0 result = None ### open output handles as needed for (1) saving CSV (2) saving matches @@ -867,7 +951,7 @@ def gather(args): found += 1 sum_f_uniq_found += result.f_unique_to_query - if found == 1: # first result? print header. + if found == 1: # first result? print header. if is_abundance: print_results("") print_results("overlap p_query p_match avg_abund") @@ -877,22 +961,30 @@ def gather(args): print_results("overlap p_query p_match") print_results("--------- ------- -------") - # print interim result & save in `found` list for later use - pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) - pct_genome = '{:.1f}%'.format(result.f_match*100) + pct_query = f"{result.f_unique_weighted * 100:.1f}%" + pct_genome = f"{result.f_match * 100:.1f}%" if is_abundance: name = result.match._display_name(screen_width - 41) - average_abund ='{:.1f}'.format(result.average_abund) - print_results('{:9} {:>7} {:>7} {:>9} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - average_abund, name) + average_abund = f"{result.average_abund:.1f}" + print_results( + "{:9} {:>7} {:>7} {:>9} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + average_abund, + name, + ) else: name = result.match._display_name(screen_width - 31) - print_results('{:9} {:>7} {:>7} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - name) + print_results( + "{:9} {:>7} {:>7} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + name, + ) # write out CSV if args.output: @@ -915,13 +1007,15 @@ def gather(args): # report on thresholding - if gather_iter.query: # if still a query, then we failed the threshold. - notify(f'found less than {format_bp(args.threshold_bp)} in common. => exiting') + notify(f"found less than {format_bp(args.threshold_bp)} in common. => exiting") # basic reporting: if found: - print_results(f'\nfound {found} matches total;') + print_results(f"\nfound {found} matches total;") if found == args.num_results: - print_results(f'(truncated gather because --num-results={args.num_results})') + print_results( + f"(truncated gather because --num-results={args.num_results})" + ) else: display_bp = format_bp(args.threshold_bp) notify(f"\nNo matches found for --threshold-bp at {display_bp}.") @@ -930,13 +1024,19 @@ def gather(args): if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 - print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + print_results( + f"the recovered matches hit {p_covered:.1f}% of the abundance-weighted query." 
+ ) - print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') + print_results( + f"the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted)." + ) - print_results('') + print_results("") if gather_iter.scaled != query.minhash.scaled: - print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}') + print_results( + f"WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}" + ) # save CSV? if (found and args.output) or args.create_empty_results: @@ -947,7 +1047,7 @@ def gather(args): if args.output_unassigned: remaining_query = gather_iter.query if not (remaining_query.minhash or noident_mh): - notify('no unassigned hashes to save with --output-unassigned!') + notify("no unassigned hashes to save with --output-unassigned!") else: notify(f"saving unassigned hashes to '{args.output_unassigned}'") @@ -967,7 +1067,9 @@ def gather(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." + ) # DONE w/gather function. @@ -979,11 +1081,11 @@ def multigather(args): moltype = sourmash_args.calculate_moltype(args) if not args.db: - error('Error! must specify at least one database with --db') + error("Error! must specify at least one database with --db") sys.exit(-1) if not args.query and not args.query_from_file: - error('Error! must specify at least one query signature with --query') + error("Error! must specify at least one query signature with --query") sys.exit(-1) # flatten --db and --query @@ -994,36 +1096,49 @@ def multigather(args): inp_files.extend(more_files) # need a query to get ksize, moltype for db loading - query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype))) - - notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') - - databases = sourmash_args.load_dbs_and_sigs(args.db, query, False, - fail_on_empty_database=args.fail_on_empty_database) + query = next( + iter( + sourmash_args.load_file_as_signatures( + inp_files[0], ksize=args.ksize, select_moltype=moltype + ) + ) + ) + + notify( + f"loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) + + databases = sourmash_args.load_dbs_and_sigs( + args.db, query, False, fail_on_empty_database=args.fail_on_empty_database + ) # run gather on all the queries. - n=0 + n = 0 size_may_be_inaccurate = False for queryfile in inp_files: # load the query signature(s) & figure out all the things - for query in sourmash_args.load_file_as_signatures(queryfile, - ksize=args.ksize, - select_moltype=moltype): - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + for query in sourmash_args.load_file_as_signatures( + queryfile, ksize=args.ksize, select_moltype=moltype + ): + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed right. 
if not query.minhash.scaled: - error('query signature needs to be created with --scaled; skipping') + error("query signature needs to be created with --scaled; skipping") continue if args.scaled and args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) # empty? if not len(query.minhash): - error('no query hashes!? skipping to next..') + error("no query hashes!? skipping to next..") continue counters = [] @@ -1050,17 +1165,19 @@ def multigather(args): ident_mh.add_many(union_found) found = 0 - weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash - gather_iter = GatherDatabases(query, counters, - threshold_bp=args.threshold_bp, - ignore_abundance=args.ignore_abundance, - noident_mh=noident_mh, - ident_mh=ident_mh) + gather_iter = GatherDatabases( + query, + counters, + threshold_bp=args.threshold_bp, + ignore_abundance=args.ignore_abundance, + noident_mh=noident_mh, + ident_mh=ident_mh, + ) screen_width = _get_screen_width() - sum_f_uniq_found = 0. + sum_f_uniq_found = 0.0 result = None query_filename = query.filename @@ -1071,9 +1188,9 @@ def multigather(args): output_base = os.path.basename(query_filename) if args.output_dir: output_base = os.path.join(args.output_dir, output_base) - output_csv = output_base + '.csv' + output_csv = output_base + ".csv" - output_matches = output_base + '.matches.sig' + output_matches = output_base + ".matches.sig" save_sig_obj = SaveSignaturesToLocation(output_matches) save_sig = save_sig_obj.__enter__() notify(f"saving all matching signatures to '{output_matches}'") @@ -1087,7 +1204,7 @@ def multigather(args): for result in gather_iter: found += 1 sum_f_uniq_found += result.f_unique_to_query - if found == 1: # first result? print header. + if found == 1: # first result? print header. if is_abundance: print_results("") print_results("overlap p_query p_match avg_abund") @@ -1097,22 +1214,30 @@ def multigather(args): print_results("overlap p_query p_match") print_results("--------- ------- -------") - # print interim result & save in a list for later use - pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) - pct_genome = '{:.1f}%'.format(result.f_match*100) + pct_query = f"{result.f_unique_weighted * 100:.1f}%" + pct_genome = f"{result.f_match * 100:.1f}%" if is_abundance: name = result.match._display_name(screen_width - 41) - average_abund ='{:.1f}'.format(result.average_abund) - print_results('{:9} {:>7} {:>7} {:>9} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - average_abund, name) + average_abund = f"{result.average_abund:.1f}" + print_results( + "{:9} {:>7} {:>7} {:>9} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + average_abund, + name, + ) else: name = result.match._display_name(screen_width - 31) - print_results('{:9} {:>7} {:>7} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - name) + print_results( + "{:9} {:>7} {:>7} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + name, + ) ## @CTB if csv_writer is None: @@ -1128,10 +1253,12 @@ def multigather(args): # report on thresholding - if gather_iter.query.minhash: # if still a query, then we failed the threshold. - notify(f'found less than {format_bp(args.threshold_bp)} in common. 
=> exiting') + notify( + f"found less than {format_bp(args.threshold_bp)} in common. => exiting" + ) # basic reporting - print_results('\nfound {} matches total;', found) + print_results("\nfound {} matches total;", found) # close saving etc. save_sig_obj.close() @@ -1143,17 +1270,21 @@ def multigather(args): if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 - print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + print_results( + f"the recovered matches hit {p_covered:.1f}% of the abundance-weighted query." + ) - print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') - print_results('') + print_results( + f"the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted)." + ) + print_results("") if found == 0: - notify('nothing found... skipping.') + notify("nothing found... skipping.") continue - output_unassigned = output_base + '.unassigned.sig' - with open(output_unassigned, 'wt') as fp: + output_unassigned = output_base + ".unassigned.sig" + with open(output_unassigned, "w"): remaining_query = gather_iter.query if noident_mh: remaining_mh = remaining_query.minhash.to_mutable() @@ -1165,9 +1296,9 @@ def multigather(args): remaining_query.minhash = abund_query_mh if found == 0: - notify('nothing found - entire query signature unassigned.') + notify("nothing found - entire query signature unassigned.") elif not remaining_query: - notify('no unassigned hashes! not saving.') + notify("no unassigned hashes! not saving.") else: notify(f'saving unassigned hashes to "{output_unassigned}"') @@ -1177,9 +1308,11 @@ def multigather(args): n += 1 # fini, next query! - notify(f'\nconducted gather searches on {n} signatures') + notify(f"\nconducted gather searches on {n} signatures") if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." 
+ ) def watch(args): @@ -1187,7 +1320,7 @@ def watch(args): set_quiet(args.quiet) if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing.') + notify("WARNING: input is protein, turning off nucleotide hashing.") args.dna = False args.protein = True @@ -1195,22 +1328,22 @@ def watch(args): notify('ERROR: cannot use "watch" with both nucleotide and protein.') if args.dna: - moltype = 'DNA' + moltype = "DNA" is_protein = False dayhoff = False hp = False elif args.protein: - moltype = 'protein' + moltype = "protein" is_protein = True dayhoff = False hp = False elif args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" is_protein = True dayhoff = True hp = False else: - moltype = 'hp' + moltype = "hp" is_protein = True dayhoff = False hp = True @@ -1224,23 +1357,27 @@ def watch(args): tree_mh = leaf.data.minhash ksize = tree_mh.ksize - E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp) + E = MinHash( + ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp + ) - notify(f'Computing signature for k={ksize}, {moltype} from stdin') + notify(f"Computing signature for k={ksize}, {moltype} from stdin") def do_search(): results = [] - streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) - for similarity, match, _ in tree.search(streamsig, - threshold=args.threshold, - best_only=True, - ignore_abundance=True, - do_containment=False): + streamsig = sig.SourmashSignature(E, filename="stdin", name=args.name) + for similarity, match, _ in tree.search( + streamsig, + threshold=args.threshold, + best_only=True, + ignore_abundance=True, + do_containment=False, + ): results.append((similarity, match)) return results - notify('reading sequences from stdin') + notify("reading sequences from stdin") watermark = WATERMARK_SIZE # iterate over input records @@ -1249,7 +1386,7 @@ def do_search(): for n, record in enumerate(screed_iter): # at each watermark, print status & check cardinality if n >= watermark: - notify(f'\r... read {n} sequences', end='') + notify(f"\r... read {n} sequences", end="") watermark += WATERMARK_SIZE if do_search(): @@ -1262,16 +1399,15 @@ def do_search(): results = do_search() if not results: - notify(f'... read {n} sequences, no matches found.') + notify(f"... read {n} sequences, no matches found.") else: - results.sort(key=lambda x: -x[0]) # take best + results.sort(key=lambda x: -x[0]) # take best similarity, found_sig = results[0] - print_results('FOUND: {}, at {:.3f}', found_sig, - similarity) + print_results("FOUND: {}, at {:.3f}", found_sig, similarity) if args.output: notify(f"saving signature to '{args.output}'") - streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) + streamsig = sig.SourmashSignature(E, filename="stdin", name=args.name) with SaveSignaturesToLocation(args.output) as save_sig: save_sig.add(streamsig) @@ -1296,9 +1432,15 @@ def prefetch(args): notify("ERROR: no databases or signatures to search!?") sys.exit(-1) - if not (args.save_unmatched_hashes or args.save_matching_hashes or - args.save_matches or args.output): - notify("WARNING: no output(s) specified! Nothing will be saved from this prefetch!") + if not ( + args.save_unmatched_hashes + or args.save_matching_hashes + or args.save_matches + or args.output + ): + notify( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" 
+ ) # figure out what k-mer size and molecule type we're looking for here ksize = args.ksize @@ -1307,15 +1449,16 @@ def prefetch(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # load the query signature & figure out all the things - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed with scaled. if not query.minhash.scaled: - error('query signature needs to be created with --scaled') + error("query signature needs to be created with --scaled") sys.exit(-1) # if with track_abund, flatten me @@ -1325,15 +1468,19 @@ def prefetch(args): query_mh = query_mh.flatten() if args.scaled and args.scaled != query_mh.scaled: - notify(f'downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}" + ) query_mh = query_mh.downsample(scaled=args.scaled) - notify(f"query sketch has scaled={query_mh.scaled}; will be dynamically downsampled as needed.") + notify( + f"query sketch has scaled={query_mh.scaled}; will be dynamically downsampled as needed." + ) common_scaled = query_mh.scaled # empty? if not len(query_mh): - error('no query hashes!? exiting.') + error("no query hashes!? exiting.") sys.exit(-1) with query.update() as query: @@ -1357,12 +1504,12 @@ def prefetch(args): ident_mh = query_mh.copy_and_clear() noident_mh = query_mh.to_mutable() - did_a_search = False # track whether we did _any_ search at all! + did_a_search = False # track whether we did _any_ search at all! 
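# (Aside on the bookkeeping above: ident_mh starts empty via
#  copy_and_clear() and noident_mh starts as a full mutable copy of the
#  query, so each matched hash moves from noident_mh into ident_mh during
#  the search loop below; that is the invariant
#  len(query_mh) == len(ident_mh) + len(noident_mh) asserted afterwards.)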
     size_may_be_inaccurate = False
     total_signatures_loaded = 0
     sum_signatures_after_select = 0
     for dbfilename in args.databases:
-        notify(f"loading signatures from '{dbfilename}'", end='\r')
+        notify(f"loading signatures from '{dbfilename}'", end="\r")
 
         db = sourmash_args.load_file_as_index(dbfilename)
         total_signatures_loaded += len(db)
@@ -1371,24 +1518,25 @@ def prefetch(args):
         if args.linear:
             db = LazyLinearIndex(db)
 
-        db = db.select(ksize=ksize, moltype=moltype,
-                       containment=True, scaled=True)
+        db = db.select(ksize=ksize, moltype=moltype, containment=True, scaled=True)
 
         sum_signatures_after_select += len(db)
 
-        db = sourmash_args.apply_picklist_and_pattern(db, picklist,
-                                                      pattern_search)
+        db = sourmash_args.apply_picklist_and_pattern(db, picklist, pattern_search)
 
         if not db:
             notify(f"...no compatible signatures in '{dbfilename}'; skipping")
             continue
 
-        for result in prefetch_database(query, db, args.threshold_bp, estimate_ani_ci= args.estimate_ani_ci):
+        for result in prefetch_database(
+            query, db, args.threshold_bp, estimate_ani_ci=args.estimate_ani_ci
+        ):
             match = result.match
 
             # ensure we're all on the same page wrt scaled resolution:
-            common_scaled = max(match.minhash.scaled, query.minhash.scaled,
-                                common_scaled)
+            common_scaled = max(
+                match.minhash.scaled, query.minhash.scaled, common_scaled
+            )
 
             query_mh = query.minhash.downsample(scaled=common_scaled)
             match_mh = match.minhash.downsample(scaled=common_scaled)
@@ -1412,8 +1560,10 @@ def prefetch(args):
                 matches_out.add(match)
 
                 if matches_out.count % 10 == 0:
-                    notify(f"total of {matches_out.count} matching signatures so far.",
-                           end="\r")
+                    notify(
+                        f"total of {matches_out.count} matching signatures so far.",
+                        end="\r",
+                    )
 
             # keep track of inaccurate size estimation
             if not size_may_be_inaccurate and result.size_may_be_inaccurate:
@@ -1429,11 +1579,17 @@ def prefetch(args):
         del db
 
     notify("--")
-    notify(f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations.")
-    notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.")
+    notify(
+        f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations."
+    )
+    notify(
+        f"after selecting signatures compatible with search, {sum_signatures_after_select} remain."
+    )
 
     if not did_a_search:
-        notify("ERROR in prefetch: after picklists and patterns, no signatures to search!?")
+        notify(
+            "ERROR in prefetch: after picklists and patterns, no signatures to search!?"
+        )
         sys.exit(-1)
 
     notify("--")
@@ -1445,7 +1601,9 @@ def prefetch(args):
         csvout_fp.close()
 
     assert len(query_mh) == len(ident_mh) + len(noident_mh)
-    notify(f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold.")
+    notify(
+        f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold."
+    )
     notify(f"a total of {len(noident_mh)} query hashes remain unmatched.")
     notify(f"final scaled value (max across query and all matches) is {common_scaled}")
@@ -1453,7 +1611,7 @@ def prefetch(args):
         filename = args.save_matching_hashes
         notify(f"saving {len(ident_mh)} matched hashes to '{filename}'")
 
-        sig_name = ''
+        sig_name = ""
         if query.name:
             sig_name = f"{query.name}-known"
@@ -1468,7 +1626,7 @@ def prefetch(args):
     if args.save_unmatched_hashes:
         filename = args.save_unmatched_hashes
 
-        sig_name = ''
+        sig_name = ""
         if query.name:
             sig_name = f"{query.name}-unknown"
@@ -1486,6 +1644,8 @@ def prefetch(args):
         sourmash_args.report_picklist(args, picklist)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
+        notify(
+            "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons."
+        )
 
     return 0
diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py
index 35b8639cb5..85928dc8a4 100644
--- a/src/sourmash/compare.py
+++ b/src/sourmash/compare.py
@@ -39,22 +39,28 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa
 
     for i, j in iterator:
         if return_ani:
-            ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample)
+            ani_result = siglist[i].jaccard_ani(siglist[j], downsample=downsample)
             if not potential_false_negatives and ani_result.p_exceeds_threshold:
                 potential_false_negatives = True
             if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold:
                 jaccard_ani_untrustworthy = True
             ani = ani_result.ani
-            if ani == None:
+            if ani is None:
                 ani = 0.0
             similarities[i][j] = similarities[j][i] = ani
         else:
-            similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample)
+            similarities[i][j] = similarities[j][i] = siglist[i].similarity(
+                siglist[j], ignore_abundance=ignore_abundance, downsample=downsample
+            )
 
     if jaccard_ani_untrustworthy:
-        notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
+        notify(
+            "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons."
+        )
     if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+        notify(
+            "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this."
+        )
     return similarities
 
@@ -78,19 +84,24 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
         if i == j:
             containments[i][j] = 1
         elif return_ani:
-            ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample)
+            ani_result = siglist[j].containment_ani(
+                siglist[i], downsample=downsample
+            )
             ani = ani_result.ani
             if not potential_false_negatives and ani_result.p_exceeds_threshold:
                 potential_false_negatives = True
-            if ani == None:
+            if ani is None:
                 ani = 0.0
             containments[i][j] = ani
         else:
-            containments[i][j] = siglist[j].contained_by(siglist[i],
-                                                         downsample=downsample)
+            containments[i][j] = siglist[j].contained_by(
+                siglist[i], downsample=downsample
+            )
 
     if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+        notify(
+            "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this."
+        )
 
     return containments
 
@@ -115,18 +126,23 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals
 
     for i, j in iterator:
         if return_ani:
-            ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample)
+            ani_result = siglist[j].max_containment_ani(
+                siglist[i], downsample=downsample
+            )
             ani = ani_result.ani
             if not potential_false_negatives and ani_result.p_exceeds_threshold:
                 potential_false_negatives = True
-            if ani == None:
+            if ani is None:
                 ani = 0.0
             containments[i][j] = containments[j][i] = ani
         else:
-            containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i],
-                                                                                 downsample=downsample)
+            containments[i][j] = containments[j][i] = siglist[j].max_containment(
+                siglist[i], downsample=downsample
+            )
 
     if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+        notify(
+            "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this."
+        )
 
     return containments
 
@@ -153,17 +169,20 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals
         if return_ani:
             cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash)
             ani = cmp.avg_containment_ani
-            if ani == None:
+            if ani is None:
                 ani = 0.0
             if not potential_false_negatives and cmp.potential_false_negative:
                 potential_false_negatives = True
             containments[i][j] = containments[j][i] = ani
         else:
-            containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i],
-                                                                                 downsample=downsample)
+            containments[i][j] = containments[j][i] = siglist[j].avg_containment(
+                siglist[i], downsample=downsample
+            )
 
     if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+        notify(
+            "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this."
+        )
 
     return containments
 
@@ -174,16 +193,18 @@ def similarity_args_unpack(args, ignore_abundance, *, downsample, return_ani=Fal
     sig1, sig2 = args
     if return_ani:
         ani = sig1.jaccard_ani(sig2, downsample=downsample).ani
-        if ani == None:
+        if ani is None:
             ani = 0.0
         return ani
     else:
-        return sig1.similarity(sig2,
-                               ignore_abundance=ignore_abundance,
-                               downsample=downsample)
+        return sig1.similarity(
+            sig2, ignore_abundance=ignore_abundance, downsample=downsample
+        )
 
 
-def get_similarities_at_index(index, ignore_abundance, downsample, siglist, *, return_ani=False):
+def get_similarities_at_index(
+    index, ignore_abundance, downsample, siglist, *, return_ani=False
+):
     """Returns similarities of all the combinations of signature at index in the
     siglist with the rest of the indices starting at index + 1. Doesn't
     redundantly calculate signatures with all the other indices prior to
@@ -202,18 +223,24 @@ def get_similarities_at_index(index, ignore_abundance, downsample, siglist, *, r
         with rest of the signatures from index+1
     """
     startt = time.time()
-    sig_iterator = itertools.product([siglist[index]], siglist[index + 1:])
-    func = partial(similarity_args_unpack,
-                   ignore_abundance=ignore_abundance,
-                   downsample=downsample,
-                   return_ani=return_ani)
+    sig_iterator = itertools.product([siglist[index]], siglist[index + 1 :])
+    func = partial(
+        similarity_args_unpack,
+        ignore_abundance=ignore_abundance,
+        downsample=downsample,
+        return_ani=return_ani,
+    )
     similarity_list = list(map(func, sig_iterator))
     notify(
-        f"comparison for index {index} done in {time.time() - startt:.5f} seconds", end='\r')
+        f"comparison for index {index} done in {time.time() - startt:.5f} seconds",
+        end="\r",
+    )
     return similarity_list
 
 
-def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_ani=False):
+def compare_parallel(
+    siglist, ignore_abundance, downsample, n_jobs, *, return_ani=False
+):
     """Compare all combinations of signatures and return a matrix of similarities.
Processes combinations parallely on number of processes given by n_jobs @@ -256,7 +283,8 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_an siglist=siglist, ignore_abundance=ignore_abundance, downsample=downsample, - return_ani=return_ani) + return_ani=return_ani, + ) notify("Created similarity func") # Initialize multiprocess.pool @@ -279,19 +307,27 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_an startt = time.time() col_idx = index + 1 for idx_condensed, item in enumerate(l): - memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[idx_condensed + col_idx, index] = item + memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[ + idx_condensed + col_idx, index + ] = item notify( - f"Setting similarities matrix for index {index} done in {time.time() - startt:.5f} seconds", end='\r') + f"Setting similarities matrix for index {index} done in {time.time() - startt:.5f} seconds", + end="\r", + ) notify("Setting similarities completed") pool.close() pool.join() - notify(f"Time taken to compare all pairs parallely is {time.time() - start_initial:.5f} seconds ") + notify( + f"Time taken to compare all pairs parallely is {time.time() - start_initial:.5f} seconds " + ) return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist)) -def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, return_ani=False): +def compare_all_pairs( + siglist, ignore_abundance, downsample=False, n_jobs=None, return_ani=False +): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations either serially or based on parallely on number of processes given by n_jobs @@ -309,7 +345,14 @@ def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, :return: np.array similarity matrix """ if n_jobs is None or n_jobs == 1: - similarities = compare_serial(siglist, ignore_abundance=ignore_abundance, downsample=downsample, return_ani=return_ani) + similarities = compare_serial( + siglist, + ignore_abundance=ignore_abundance, + downsample=downsample, + return_ani=return_ani, + ) else: - similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ani=return_ani) + similarities = compare_parallel( + siglist, ignore_abundance, downsample, n_jobs, return_ani=return_ani + ) return similarities diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 66feb6259c..9106bd8812 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -12,12 +12,14 @@ from .logging import notify + def check_distance(dist): if not 0 <= dist <= 1: raise ValueError(f"Error: distance value {dist :.4f} is not between 0 and 1!") else: return dist + def check_prob_threshold(val, threshold=1e-3): """ Check likelihood of no shared hashes based on chance alone (false neg). 
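For orientation, a minimal usage sketch of the comparison API reformatted above, assuming two small scaled DNA sketches built through sourmash's Python layer (the sequences and names below are illustrative, not part of this patch):

    from sourmash import MinHash, SourmashSignature
    from sourmash.compare import compare_all_pairs

    # illustrative toy data: two identical scaled sketches
    mh1 = MinHash(n=0, ksize=31, scaled=10)
    mh2 = MinHash(n=0, ksize=31, scaled=10)
    mh1.add_sequence("ATGGCATTGACGAATCGCTAGCTAGCTAGCTAGCAT" * 20)
    mh2.add_sequence("ATGGCATTGACGAATCGCTAGCTAGCTAGCTAGCAT" * 20)
    sigs = [SourmashSignature(mh1, name="a"), SourmashSignature(mh2, name="b")]

    # n_jobs=None (or 1) takes the compare_serial path; n_jobs > 1 takes the
    # memmap-backed compare_parallel path above.
    mat = compare_all_pairs(sigs, ignore_abundance=True, n_jobs=None)

With return_ani=True the matrix holds ANI estimates instead of Jaccard similarity, and 0.0 stands in wherever ANI could not be estimated.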
@@ -29,15 +31,18 @@ def check_prob_threshold(val, threshold=1e-3): exceeds_threshold = True return val, exceeds_threshold + def check_jaccard_error(val, threshold=1e-4): exceeds_threshold = False if threshold is not None and val > threshold: exceeds_threshold = True return val, exceeds_threshold + @dataclass class ANIResult: """Base class for distance/ANI from k-mer containment.""" + dist: float p_nothing_in_common: float p_threshold: float = 1e-3 @@ -47,7 +52,9 @@ class ANIResult: def check_dist_and_p_threshold(self): # check values self.dist = check_distance(self.dist) - self.p_nothing_in_common, self.p_exceeds_threshold = check_prob_threshold(self.p_nothing_in_common, self.p_threshold) + self.p_nothing_in_common, self.p_exceeds_threshold = check_prob_threshold( + self.p_nothing_in_common, self.p_threshold + ) def __post_init__(self): self.check_dist_and_p_threshold() @@ -62,6 +69,7 @@ def ani(self): @dataclass class jaccardANIResult(ANIResult): """Class for distance/ANI from jaccard (includes jaccard_error).""" + jaccard_error: float = None je_threshold: float = 1e-4 @@ -70,7 +78,9 @@ def __post_init__(self): self.check_dist_and_p_threshold() # check jaccard error if self.jaccard_error is not None: - self.jaccard_error, self.je_exceeds_threshold = check_jaccard_error(self.jaccard_error, self.je_threshold) + self.jaccard_error, self.je_exceeds_threshold = check_jaccard_error( + self.jaccard_error, self.je_threshold + ) else: raise ValueError("Error: jaccard_error cannot be None.") @@ -89,6 +99,7 @@ class ciANIResult(ANIResult): Set CI defaults to None, just in case CI can't be estimated for given sample. """ + dist_low: float = None dist_high: float = None @@ -128,7 +139,7 @@ def var_n_mutated(L, k, r1, *, q=None): if r1 == 0: return 0.0 r1 = float(r1) - if q == None: # we assume that if q is provided, it is correct for r1 + if q is None: # we assume that if q is provided, it is correct for r1 q = r1_to_q(k, r1) varN = ( L * (1 - q) * (q * (2 * k + (2 / r1) - 1) - 2 * k) @@ -158,7 +169,9 @@ def handle_seqlen_nkmers(ksize, *, sequence_len_bp=None, n_unique_kmers=None): return n_unique_kmers elif sequence_len_bp is None: # both are None, raise ValueError - raise ValueError("Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'") + raise ValueError( + "Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'" + ) else: n_unique_kmers = sequence_len_bp - (ksize - 1) return n_unique_kmers @@ -175,7 +188,7 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05): @param relative_error: the desired relative error (defaults to 5%) @return: float (the upper bound probability) """ - upper_bound = 1 - 2 * np.exp(- relative_error**2*set_size/(scaled * 3)) + upper_bound = 1 - 2 * np.exp(-(relative_error**2) * set_size / (scaled * 3)) return upper_bound @@ -190,14 +203,17 @@ def set_size_exact_prob(set_size, scaled, *, relative_error=0.05): @return: float (the upper bound probability) """ # Need to check if the edge case is an integer or not. 
If not, don't include it in the equation - pmf_arg = -set_size/scaled * (relative_error - 1) + pmf_arg = -set_size / scaled * (relative_error - 1) if pmf_arg == int(pmf_arg): - prob = binom.cdf(set_size/scaled * (relative_error + 1), set_size, 1/scaled) - \ - binom.cdf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + \ - binom.pmf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + prob = ( + binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) + - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + + binom.pmf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + ) else: - prob = binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) - \ - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + prob = binom.cdf( + set_size / scaled * (relative_error + 1), set_size, 1 / scaled + ) - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) return prob @@ -225,7 +241,9 @@ def get_exp_probability_nothing_common( Arguments: n_unique_kmers, ksize, mutation_rate, scaled Returns: float - expected likelihood that nothing is common between sketches """ - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp=sequence_len_bp,n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) f_scaled = 1.0 / float(scaled) if mutation_rate == 1.0: return 1.0 @@ -251,12 +269,14 @@ def containment_to_distance( Containment --> distance CI (one step) """ sol1, sol2, point_estimate = None, None, None - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp = sequence_len_bp, n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) if containment == 0: - #point_estimate = 1.0 + # point_estimate = 1.0 point_estimate = sol1 = sol2 = 1.0 elif containment == 1: - #point_estimate = 0.0 + # point_estimate = 0.0 point_estimate = sol1 = sol2 = 0.0 else: point_estimate = 1.0 - containment ** (1.0 / ksize) @@ -273,25 +293,33 @@ def containment_to_distance( term_1 = (1.0 - f_scaled) / ( f_scaled * n_unique_kmers**3 * bias_factor**2 ) - term_2 = lambda pest: n_unique_kmers * exp_n_mutated( - n_unique_kmers, ksize, pest - ) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) - term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / ( - n_unique_kmers**2 - ) - var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) - - f1 = ( - lambda pest: (1 - pest) ** ksize - + z_alpha * np.sqrt(var_direct(pest)) - - containment - ) - f2 = ( - lambda pest: (1 - pest) ** ksize - - z_alpha * np.sqrt(var_direct(pest)) - - containment - ) + def term_2(pest): + return n_unique_kmers * exp_n_mutated( + n_unique_kmers, ksize, pest + ) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) + + def term_3(pest): + return ( + var_n_mutated(n_unique_kmers, ksize, pest) / n_unique_kmers**2 + ) + + def var_direct(pest): + return term_1 * term_2(pest) + term_3(pest) + + def f1(pest): + return ( + (1 - pest) ** ksize + + z_alpha * np.sqrt(var_direct(pest)) + - containment + ) + + def f2(pest): + return ( + (1 - pest) ** ksize + - z_alpha * np.sqrt(var_direct(pest)) + - containment + ) sol1 = brentq(f1, 0.0000001, 0.9999999) sol2 = brentq(f2, 0.0000001, 0.9999999) @@ -308,7 +336,13 @@ def containment_to_distance( prob_nothing_in_common = get_exp_probability_nothing_common( point_estimate, ksize, scaled, 
n_unique_kmers=n_unique_kmers ) - return ciANIResult(point_estimate, prob_nothing_in_common, dist_low=sol2, dist_high=sol1, p_threshold=prob_threshold) + return ciANIResult( + point_estimate, + prob_nothing_in_common, + dist_low=sol2, + dist_high=sol1, + p_threshold=prob_threshold, + ) def jaccard_to_distance( @@ -341,7 +375,9 @@ def jaccard_to_distance( useful for determining whether scaled is sufficient for these comparisons. """ error_lower_bound = None - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) if jaccard == 0: point_estimate = 1.0 error_lower_bound = 0.0 @@ -361,4 +397,10 @@ def jaccard_to_distance( prob_nothing_in_common = get_exp_probability_nothing_common( point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers ) - return jaccardANIResult(point_estimate, prob_nothing_in_common, jaccard_error=error_lower_bound, p_threshold=prob_threshold, je_threshold=err_threshold) + return jaccardANIResult( + point_estimate, + prob_nothing_in_common, + jaccard_error=error_lower_bound, + p_threshold=prob_threshold, + je_threshold=err_threshold, + ) diff --git a/src/sourmash/exceptions.py b/src/sourmash/exceptions.py index b2f18c12d2..002fbafdfc 100644 --- a/src/sourmash/exceptions.py +++ b/src/sourmash/exceptions.py @@ -1,7 +1,7 @@ from ._lowlevel import lib -__all__ = ['SourmashError'] +__all__ = ["SourmashError"] exceptions_by_code = {} @@ -16,13 +16,15 @@ def __init__(self, msg): def __str__(self): rv = self.message if self.rust_info is not None: - return u'%s\n\n%s' % (rv, self.rust_info) + return f"{rv}\n\n{self.rust_info}" return rv class IndexNotSupported(SourmashError): def __init__(self): - SourmashError.__init__(self, "This index format is not supported in this version of sourmash") + SourmashError.__init__( + self, "This index format is not supported in this version of sourmash" + ) class IndexNotLoaded(SourmashError): @@ -55,7 +57,7 @@ def _get_error_base(error_name): def _make_exceptions(): for attr in dir(lib): - if not attr.startswith('SOURMASH_ERROR_CODE_'): + if not attr.startswith("SOURMASH_ERROR_CODE_"): continue code = getattr(lib, attr) @@ -69,4 +71,5 @@ def _make_exceptions(): else: exceptions_by_code[code] = ValueError + _make_exceptions() diff --git a/src/sourmash/fig.py b/src/sourmash/fig.py index 4454ef64d9..9ca96f1aab 100644 --- a/src/sourmash/fig.py +++ b/src/sourmash/fig.py @@ -3,6 +3,7 @@ Make plots using the distance matrix+labels output by `sourmash compare`. """ from .logging import error, notify + try: import numpy import pylab @@ -10,18 +11,20 @@ except (RuntimeError, ImportError): pass + def load_matrix_and_labels(basefile): """Load the comparison matrix and associated labels. Returns a square numpy matrix & list of labels. """ - D = numpy.load(open(basefile, 'rb')) - labeltext = [x.strip() for x in open(basefile + '.labels.txt')] + D = numpy.load(open(basefile, "rb")) + labeltext = [x.strip() for x in open(basefile + ".labels.txt")] return (D, labeltext) -def plot_composite_matrix(D, labeltext, show_labels=True, - vmax=1.0, vmin=0.0, force=False): +def plot_composite_matrix( + D, labeltext, show_labels=True, vmax=1.0, vmin=0.0, force=False +): """Build a composite plot showing dendrogram + distance matrix/heatmap. Returns a matplotlib figure. @@ -30,25 +33,34 @@ def plot_composite_matrix(D, labeltext, show_labels=True, shown on the plot. 
""" if D.max() > 1.0 or D.min() < 0.0: - error('This matrix doesn\'t look like a distance matrix - min value {}, max value {}', D.min(), D.max()) + error( + "This matrix doesn't look like a distance matrix - min value {}, max value {}", + D.min(), + D.max(), + ) if not force: raise ValueError("not a distance matrix") else: - notify('force is set; scaling to [0, 1]') + notify("force is set; scaling to [0, 1]") D -= D.min() D /= D.max() if show_labels: - show_indices = True + pass fig = pylab.figure(figsize=(11, 8)) ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6]) # plot dendrogram - Y = sch.linkage(D, method='single') # centroid - - Z1 = sch.dendrogram(Y, orientation='left', labels=labeltext, - no_labels=not show_labels, get_leaves=True) + Y = sch.linkage(D, method="single") # centroid + + Z1 = sch.dendrogram( + Y, + orientation="left", + labels=labeltext, + no_labels=not show_labels, + get_leaves=True, + ) ax1.set_xticks([]) xstart = 0.45 @@ -58,8 +70,8 @@ def plot_composite_matrix(D, labeltext, show_labels=True, scale_xstart = xstart + width + 0.01 # re-order labels along rows, top to bottom - idx1 = Z1['leaves'] - reordered_labels = [ labeltext[i] for i in idx1 ] + idx1 = Z1["leaves"] + reordered_labels = [labeltext[i] for i in idx1] # reorder D by the clustering in the dendrogram D = D[idx1, :] @@ -68,8 +80,9 @@ def plot_composite_matrix(D, labeltext, show_labels=True, # show matrix axmatrix = fig.add_axes([xstart, 0.1, width, 0.6]) - im = axmatrix.matshow(D, aspect='auto', origin='lower', - cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax) + im = axmatrix.matshow( + D, aspect="auto", origin="lower", cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax + ) axmatrix.set_xticks([]) axmatrix.set_yticks([]) diff --git a/src/sourmash/hll.py b/src/sourmash/hll.py index c98ded5e8b..8a78049b34 100644 --- a/src/sourmash/hll.py +++ b/src/sourmash/hll.py @@ -32,7 +32,7 @@ def add_sequence(self, sequence, force=False): def add_kmer(self, kmer): "Add a kmer into the sketch." if len(kmer) != self.ksize: - raise ValueError("kmer to add is not {} in length".format(self.ksize)) + raise ValueError(f"kmer to add is not {self.ksize} in length") self.add_sequence(kmer) def add(self, h): diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 08068255e5..154f37c126 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -39,18 +39,23 @@ from abc import abstractmethod, ABC from collections import namedtuple, Counter -from sourmash.search import (make_jaccard_search_query, - make_containment_query, - calc_threshold_from_bp) +from sourmash.search import ( + make_jaccard_search_query, + make_containment_query, + calc_threshold_from_bp, +) from sourmash.manifest import CollectionManifest from sourmash.logging import debug_literal from sourmash.signature import load_signatures, save_signatures -from sourmash.minhash import (flatten_and_downsample_scaled, - flatten_and_downsample_num, - flatten_and_intersect_scaled) +from sourmash.minhash import ( + flatten_and_downsample_scaled, + flatten_and_downsample_num, + flatten_and_intersect_scaled, +) # generic return tuple for Index.search and Index.gather -IndexSearchResult = namedtuple('Result', 'score, signature, location') +IndexSearchResult = namedtuple("Result", "score, signature, location") + class Index(ABC): # this will be removed soon; see sourmash#1894. 
@@ -103,8 +108,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): @classmethod @abstractmethod - def load(cls, location, leaf_loader=None, storage=None, - print_version_warning=True): + def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): """ """ def find(self, search_fn, query, **kwargs): @@ -133,7 +137,7 @@ def prepare_subject(subj_mh): def prepare_query(query_mh, subj_mh): return flatten_and_downsample_scaled(query_mh, subj_mh.scaled) - else: # num + else: # num query_num = query_mh.num def prepare_subject(subj_mh): @@ -156,10 +160,7 @@ def prepare_query(query_mh, subj_mh): query_size = len(query_mh) subj_size = len(subj_mh) - score = search_fn.score_fn(query_size, - shared_size, - subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) if search_fn.passes(score): # note: here we yield the original signature, not the @@ -173,7 +174,9 @@ def search_abund(self, query, *, threshold=None, **kwargs): Results will be sorted by similarity, highest to lowest. """ if not query.minhash.track_abundance: - raise TypeError("'search_abund' requires query signature with abundance information") + raise TypeError( + "'search_abund' requires query signature with abundance information" + ) # check arguments if threshold is None: @@ -184,7 +187,9 @@ def search_abund(self, query, *, threshold=None, **kwargs): matches = [] for subj, loc in self.signatures_with_location(): if not subj.minhash.track_abundance: - raise TypeError("'search_abund' requires subject signatures with abundance information") + raise TypeError( + "'search_abund' requires subject signatures with abundance information" + ) score = query.similarity(subj, downsample=True) if score >= threshold: matches.append(IndexSearchResult(score, subj, loc)) @@ -193,9 +198,16 @@ def search_abund(self, query, *, threshold=None, **kwargs): matches.sort(key=lambda x: -x.score) return matches - def search(self, query, *, threshold=None, - do_containment=False, do_max_containment=False, - best_only=False, **kwargs): + def search( + self, + query, + *, + threshold=None, + do_containment=False, + do_max_containment=False, + best_only=False, + **kwargs, + ): """Return list of IndexSearchResult with similarity above 'threshold'. Results will be sorted by similarity, highest to lowest. @@ -211,10 +223,12 @@ def search(self, query, *, threshold=None, raise TypeError("'search' requires 'threshold'") threshold = float(threshold) - search_obj = make_jaccard_search_query(do_containment=do_containment, - do_max_containment=do_max_containment, - best_only=best_only, - threshold=threshold) + search_obj = make_jaccard_search_query( + do_containment=do_containment, + do_max_containment=do_max_containment, + best_only=best_only, + threshold=threshold, + ) # do the actual search: matches = list(self.find(search_obj, query, **kwargs)) @@ -228,17 +242,17 @@ def prefetch(self, query, threshold_bp, **kwargs): Generator. Returns 0 or more IndexSearchResult namedtuples. """ - if not self: # empty database? quit. + if not self: # empty database? quit. 
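        # Illustrative sketch (hypothetical names): prefetch() is a generator
        # of IndexSearchResult namedtuples, so a caller with any Index 'idx'
        # and a scaled SourmashSignature 'query' typically drives it as:
        #     for sr in idx.prefetch(query, threshold_bp=50_000):
        #         print(sr.score, sr.signature.name, sr.location)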
raise ValueError("no signatures to search") # default best_only to False - best_only = kwargs.get('best_only', False) + best_only = kwargs.get("best_only", False) - search_fn = make_containment_query(query.minhash, threshold_bp, - best_only=best_only) + search_fn = make_containment_query( + query.minhash, threshold_bp, best_only=best_only + ) - for sr in self.find(search_fn, query, **kwargs): - yield sr + yield from self.find(search_fn, query, **kwargs) def best_containment(self, query, threshold_bp=None, **kwargs): """Return the match with the best Jaccard containment in the Index. @@ -247,8 +261,7 @@ def best_containment(self, query, threshold_bp=None, **kwargs): """ results = self.prefetch(query, threshold_bp, best_only=True, **kwargs) - results = sorted(results, - key=lambda x: (-x.score, x.signature.md5sum())) + results = sorted(results, key=lambda x: (-x.score, x.signature.md5sum())) try: return next(iter(results)) @@ -277,8 +290,7 @@ def peek(self, query_mh, *, threshold_bp=0): return [] # if matches, calculate intersection & return. - intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, - query_mh) + intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, query_mh) return [result, intersect_mh] @@ -307,8 +319,15 @@ def counter_gather(self, query, threshold_bp, **kwargs): return counter @abstractmethod - def select(self, ksize=None, moltype=None, scaled=None, num=None, - abund=None, containment=None): + def select( + self, + ksize=None, + moltype=None, + scaled=None, + num=None, + abund=None, + containment=None, + ): """Return Index containing only signatures that match requirements. Current arguments can be any or all of: @@ -326,8 +345,17 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, """ -def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): +def select_signature( + ss, + *, + ksize=None, + moltype=None, + scaled=0, + num=0, + containment=False, + abund=None, + picklist=None, +): "Check that the given signature matches the specified requirements." # ksize match? if ksize and ksize != ss.minhash.ksize: @@ -372,6 +400,7 @@ class LinearIndex(Index): Concrete class; signatures held in memory; does not use manifests. """ + def __init__(self, _signatures=None, filename=None): self._signatures = [] if _signatures: @@ -395,7 +424,7 @@ def insert(self, node): self._signatures.append(node) def save(self, path): - with open(path, 'wt') as fp: + with open(path, "w") as fp: save_signatures(self.signatures(), fp) @classmethod @@ -404,7 +433,7 @@ def load(cls, location, filename=None): si = load_signatures(location, do_raise=True) if filename is None: - filename=location + filename = location lidx = LinearIndex(si, filename=filename) return lidx @@ -449,14 +478,12 @@ def __init__(self, db, selection_dict={}): def signatures(self): "Return the selected signatures." db = self.db.select(**self.selection_dict) - for ss in db.signatures(): - yield ss + yield from db.signatures() def signatures_with_location(self): "Return the selected signatures, with a location." db = self.db.select(**self.selection_dict) - for tup in db.signatures_with_location(): - yield tup + yield from db.signatures_with_location() def __bool__(self): try: @@ -502,10 +529,18 @@ class ZipFileLinearIndex(Index): Concrete class; signatures dynamically loaded from disk; uses manifests. 
""" + is_database = True - def __init__(self, storage, *, selection_dict=None, - traverse_yield_all=False, manifest=None, use_manifest=True): + def __init__( + self, + storage, + *, + selection_dict=None, + traverse_yield_all=False, + manifest=None, + use_manifest=True, + ): self.storage = storage self.selection_dict = selection_dict self.traverse_yield_all = traverse_yield_all @@ -514,7 +549,7 @@ def __init__(self, storage, *, selection_dict=None, # do we have a manifest already? if not, try loading. if use_manifest: if manifest is not None: - debug_literal('ZipFileLinearIndex using passed-in manifest') + debug_literal("ZipFileLinearIndex using passed-in manifest") self.manifest = manifest else: self._load_manifest() @@ -529,15 +564,16 @@ def __init__(self, storage, *, selection_dict=None, def _load_manifest(self): "Load a manifest if one exists" try: - manifest_data = self.storage.load('SOURMASH-MANIFEST.csv') + manifest_data = self.storage.load("SOURMASH-MANIFEST.csv") except (KeyError, FileNotFoundError): self.manifest = None else: - debug_literal(f'found manifest on load for {self.storage.path}') + debug_literal(f"found manifest on load for {self.storage.path}") # load manifest! from io import StringIO - manifest_data = manifest_data.decode('utf-8') + + manifest_data = manifest_data.decode("utf-8") manifest_fp = StringIO(manifest_data) self.manifest = CollectionManifest.load_from_csv(manifest_fp) @@ -584,8 +620,9 @@ def load(cls, location, traverse_yield_all=False, use_manifest=True): raise FileNotFoundError(location) storage = ZipStorage(location) - return cls(storage, traverse_yield_all=traverse_yield_all, - use_manifest=use_manifest) + return cls( + storage, traverse_yield_all=traverse_yield_all, use_manifest=use_manifest + ) def _signatures_with_internal(self): """Return an iterator of tuples (ss, internal_location). @@ -596,9 +633,11 @@ def _signatures_with_internal(self): # 'Storage' does not provide a way to list all the files, so :shrug:. for filename in self.storage._filenames(): # should we load this file? if it ends in .sig OR we are forcing: - if filename.endswith('.sig') or \ - filename.endswith('.sig.gz') or \ - self.traverse_yield_all: + if ( + filename.endswith(".sig") + or filename.endswith(".sig.gz") + or self.traverse_yield_all + ): sig_data = self.storage.load(filename) for ss in load_signatures(sig_data): yield ss, filename @@ -628,14 +667,19 @@ def signatures(self): # ad-hoc zipfiles that have no manifests.) for filename in storage._filenames(): # should we load this file? if it ends in .sig OR force: - if filename.endswith('.sig') or \ - filename.endswith('.sig.gz') or \ - self.traverse_yield_all: + if ( + filename.endswith(".sig") + or filename.endswith(".sig.gz") + or self.traverse_yield_all + ): if selection_dict: - select = lambda x: select_signature(x, - **selection_dict) + + def select(x): + return select_signature(x, **selection_dict) else: - select = lambda x: True + + def select(x): + return True data = self.storage.load(filename) for ss in load_signatures(data): @@ -651,11 +695,13 @@ def select(self, **kwargs): if manifest is not None: manifest = manifest.select_to_manifest(**kwargs) - return ZipFileLinearIndex(self.storage, - selection_dict=None, - traverse_yield_all=traverse_yield_all, - manifest=manifest, - use_manifest=True) + return ZipFileLinearIndex( + self.storage, + selection_dict=None, + traverse_yield_all=traverse_yield_all, + manifest=manifest, + use_manifest=True, + ) else: # no manifest? 
just pass along all the selection kwargs to # the new ZipFileLinearIndex. @@ -671,11 +717,13 @@ def select(self, **kwargs): d[k] = v kwargs = d - return ZipFileLinearIndex(self.storage, - selection_dict=kwargs, - traverse_yield_all=traverse_yield_all, - manifest=None, - use_manifest=False) + return ZipFileLinearIndex( + self.storage, + selection_dict=kwargs, + traverse_yield_all=traverse_yield_all, + manifest=None, + use_manifest=False, + ) class CounterGather: @@ -699,11 +747,12 @@ class CounterGather: duplicate md5s are collapsed inside the class, because we use the md5sum as a key into the dictionary used to store matches. """ + def __init__(self, query): "Constructor - takes a query SourmashSignature." query_mh = query.minhash if not query_mh.scaled: - raise ValueError('gather requires scaled signatures') + raise ValueError("gather requires scaled signatures") # track query self.orig_query_mh = query_mh.copy().flatten() @@ -746,8 +795,7 @@ def downsample(self, scaled): def signatures(self): "Return all signatures." - for ss in self.siglist.values(): - yield ss + yield from self.siglist.values() @property def union_found(self): @@ -763,8 +811,7 @@ def union_found(self): # for each match, intersect match with query & then add to found_mh. for ss in self.siglist.values(): - intersect_mh = flatten_and_intersect_scaled(ss.minhash, - orig_query_mh) + intersect_mh = flatten_and_intersect_scaled(ss.minhash, orig_query_mh) found_mh.add_many(intersect_mh) return found_mh @@ -784,7 +831,7 @@ def peek(self, cur_query_mh, *, threshold_bp=0): scaled = self.downsample(cur_query_mh.scaled) cur_query_mh = cur_query_mh.downsample(scaled=scaled) - if not cur_query_mh: # empty query? quit. + if not cur_query_mh: # empty query? quit. return [] # CTB: could probably remove this check unless debug requested. @@ -841,7 +888,7 @@ def consume(self, intersect_mh): # Prepare counter for finding the next match by decrementing # all hashes found in the current match in other datasets; # remove empty datasets from counter, too. - for (dataset_id, _) in most_common: + for dataset_id, _ in most_common: # CTB: note, remaining_mh may not be at correct scaled here. # this means that counters that _should_ be empty might not # _be_ empty in some situations. This does not @@ -849,8 +896,7 @@ def consume(self, intersect_mh): # 'counter' objects. The tradeoffs to fixing this would # need to be examined! (This could be fixed in self.downsample().) remaining_mh = siglist[dataset_id].minhash - intersect_count = intersect_mh.count_common(remaining_mh, - downsample=True) + intersect_count = intersect_mh.count_common(remaining_mh, downsample=True) if intersect_count: counter[dataset_id] -= intersect_count if counter[dataset_id] == 0: @@ -881,6 +927,7 @@ class MultiIndex(Index): Concrete class; signatures held in memory; builds and uses manifests. """ + def __init__(self, manifest, parent, *, prepend_location=False): """Constructor; takes manifest containing signatures, together with the top-level location. @@ -898,16 +945,16 @@ def location(self): def signatures(self): for row in self.manifest.rows: - yield row['signature'] + yield row["signature"] def signatures_with_location(self): for row in self.manifest.rows: - loc = row['internal_location'] + loc = row["internal_location"] # here, 'parent' may have been removed from internal_location # for directories; if so, add it back in. 
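        # For context, the CounterGather class above is consumed in
        # peek()/consume() rounds; a schematic sketch only (illustrative
        # names, and the remaining query must shrink by each intersection):
        #     cg = idx.counter_gather(query, threshold_bp=0)
        #     while (r := cg.peek(query_mh)):
        #         sr, isect = r
        #         cg.consume(isect)
        #         query_mh.remove_many(isect.hashes)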
if self.prepend_location: loc = os.path.join(self.parent, loc) - yield row['signature'], loc + yield row["signature"], loc def _signatures_with_internal(self): """Return an iterator of tuples (ss, location) @@ -916,8 +963,7 @@ def _signatures_with_internal(self): index. This is a special feature of this (in memory) class. """ for row in self.manifest.rows: - yield row['signature'], row['internal_location'] - + yield row["signature"], row["internal_location"] def __len__(self): if self.manifest is None: @@ -986,18 +1032,17 @@ def load_from_directory(cls, pathname, *, force=False): rel = os.path.relpath(thisfile, pathname) source_list.append(rel) - except (IOError, sourmash.exceptions.SourmashError) as exc: + except (OSError, sourmash.exceptions.SourmashError) as exc: if force: - continue # ignore error + continue # ignore error else: - raise ValueError(exc) # stop loading! + raise ValueError(exc) # stop loading! # did we load anything? if not, error if not index_list: raise ValueError(f"no signatures to load under directory '{pathname}'") - return cls.load(index_list, source_list, pathname, - prepend_location=True) + return cls.load(index_list, source_list, pathname, prepend_location=True) @classmethod def load_from_path(cls, pathname, force=False): @@ -1010,7 +1055,7 @@ def load_from_path(cls, pathname, force=False): if not os.path.exists(pathname): raise ValueError(f"'{pathname}' must exist.") - if os.path.isdir(pathname): # traverse + if os.path.isdir(pathname): # traverse return cls.load_from_directory(pathname, force=force) # load as a .sig/JSON file @@ -1020,7 +1065,7 @@ def load_from_path(cls, pathname, force=False): idx = LinearIndex.load(pathname) index_list = [idx] source_list = [pathname] - except (IOError, sourmash.exceptions.SourmashError): + except (OSError, sourmash.exceptions.SourmashError): if not force: raise ValueError(f"no signatures to load from '{pathname}'") return None @@ -1035,8 +1080,8 @@ def load_from_pathlist(cls, filename): including zip collections, etc; it uses 'load_file_as_index' underneath. """ - from ..sourmash_args import (load_pathlist_from_file, - load_file_as_index) + from ..sourmash_args import load_pathlist_from_file, load_file_as_index + idx_list = [] src_list = [] @@ -1056,8 +1101,9 @@ def save(self, *args): def select(self, **kwargs): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) - return MultiIndex(new_manifest, self.parent, - prepend_location=self.prepend_location) + return MultiIndex( + new_manifest, self.parent, prepend_location=self.prepend_location + ) class StandaloneManifestIndex(Index): @@ -1085,6 +1131,7 @@ class StandaloneManifestIndex(Index): objects. However, this class does not store any signatures in memory, unlike MultiIndex. """ + is_database = True def __init__(self, manifest, location, *, prefix=None): @@ -1119,8 +1166,7 @@ def location(self): def signatures_with_location(self): "Return an iterator over all signatures and their locations." - for ss, loc in self._signatures_with_internal(): - yield ss, loc + yield from self._signatures_with_internal() def signatures(self): "Return an iterator over all signatures." @@ -1140,7 +1186,7 @@ def _signatures_with_internal(self): picklist = self.manifest.to_picklist() for iloc in self.manifest.locations(): # prepend location with prefix? 
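            # e.g. with prefix='/path/to/collection', a row whose
            # internal_location is 'sigs/abc.sig' is loaded from
            # '/path/to/collection/sigs/abc.sig'; absolute internal
            # locations are used unchanged.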
- if not iloc.startswith('/') and self.prefix: + if not iloc.startswith("/") and self.prefix: iloc = os.path.join(self.prefix, iloc) idx = sourmash.load_file_as_index(iloc) @@ -1165,5 +1211,4 @@ def insert(self, *args): def select(self, **kwargs): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) - return StandaloneManifestIndex(new_manifest, self._location, - prefix=self.prefix) + return StandaloneManifestIndex(new_manifest, self._location, prefix=self.prefix) diff --git a/src/sourmash/index/revindex.py b/src/sourmash/index/revindex.py index 2f7074b53f..01f808783d 100644 --- a/src/sourmash/index/revindex.py +++ b/src/sourmash/index/revindex.py @@ -123,9 +123,9 @@ def signatures(self): for sig in sigs: yield sig - #if self._signatures: + # if self._signatures: # yield from self._signatures - #else: + # else: # raise NotImplementedError("Call into Rust and retrieve sigs") def __len__(self): @@ -156,81 +156,81 @@ def select(self, ksize=None, moltype=None, **kwargs): # TODO: deal with None/default values self.template = MinHash(ksize=ksize, moltype=moltype) -# def search(self, query, *args, **kwargs): -# """Return set of matches with similarity above 'threshold'. -# -# Results will be sorted by similarity, highest to lowest. -# -# Optional arguments: -# * do_containment: default False. If True, use Jaccard containment. -# * ignore_abundance: default False. If True, and query signature -# and database support k-mer abundances, ignore those abundances. -# -# Note, the "best only" hint is ignored by LCA_Database -# """ -# if not query.minhash: -# return [] -# -# # check arguments -# if "threshold" not in kwargs: -# raise TypeError("'search' requires 'threshold'") -# threshold = kwargs["threshold"] -# do_containment = kwargs.get("do_containment", False) -# ignore_abundance = kwargs.get("ignore_abundance", False) -# -# self._init_inner() -# -# size = ffi.new("uintptr_t *") -# results_ptr = self._methodcall( -# lib.revindex_search, -# query._get_objptr(), -# threshold, -# do_containment, -# ignore_abundance, -# size, -# ) -# -# size = size[0] -# if size == 0: -# return [] -# -# results = [] -# for i in range(size): -# match = SearchResult._from_objptr(results_ptr[i]) -# if match.score >= threshold: -# results.append(IndexSearchResult(match.score, match.signature, match.filename)) -# -# return results -# -# def gather(self, query, *args, **kwargs): -# "Return the match with the best Jaccard containment in the database." -# if not query.minhash: -# return [] -# -# self._init_inner() -# -# threshold_bp = kwargs.get("threshold_bp", 0.0) -# threshold = threshold_bp / (len(query.minhash) * self.scaled) -# -# results = [] -# size = ffi.new("uintptr_t *") -# results_ptr = self._methodcall( -# lib.revindex_gather, query._get_objptr(), threshold, True, True, size -# ) -# size = size[0] -# if size == 0: -# return [] -# -# results = [] -# for i in range(size): -# match = SearchResult._from_objptr(results_ptr[i]) -# if match.score >= threshold: -# results.append(IndexSearchResult(match.score, match.signature, match.filename)) -# -# results.sort(reverse=True, -# key=lambda x: (x.score, x.signature.md5sum())) -# -# return results[:1] + # def search(self, query, *args, **kwargs): + # """Return set of matches with similarity above 'threshold'. + # + # Results will be sorted by similarity, highest to lowest. + # + # Optional arguments: + # * do_containment: default False. If True, use Jaccard containment. + # * ignore_abundance: default False. 
If True, and query signature + # and database support k-mer abundances, ignore those abundances. + # + # Note, the "best only" hint is ignored by LCA_Database + # """ + # if not query.minhash: + # return [] + # + # # check arguments + # if "threshold" not in kwargs: + # raise TypeError("'search' requires 'threshold'") + # threshold = kwargs["threshold"] + # do_containment = kwargs.get("do_containment", False) + # ignore_abundance = kwargs.get("ignore_abundance", False) + # + # self._init_inner() + # + # size = ffi.new("uintptr_t *") + # results_ptr = self._methodcall( + # lib.revindex_search, + # query._get_objptr(), + # threshold, + # do_containment, + # ignore_abundance, + # size, + # ) + # + # size = size[0] + # if size == 0: + # return [] + # + # results = [] + # for i in range(size): + # match = SearchResult._from_objptr(results_ptr[i]) + # if match.score >= threshold: + # results.append(IndexSearchResult(match.score, match.signature, match.filename)) + # + # return results + # + # def gather(self, query, *args, **kwargs): + # "Return the match with the best Jaccard containment in the database." + # if not query.minhash: + # return [] + # + # self._init_inner() + # + # threshold_bp = kwargs.get("threshold_bp", 0.0) + # threshold = threshold_bp / (len(query.minhash) * self.scaled) + # + # results = [] + # size = ffi.new("uintptr_t *") + # results_ptr = self._methodcall( + # lib.revindex_gather, query._get_objptr(), threshold, True, True, size + # ) + # size = size[0] + # if size == 0: + # return [] + # + # results = [] + # for i in range(size): + # match = SearchResult._from_objptr(results_ptr[i]) + # if match.score >= threshold: + # results.append(IndexSearchResult(match.score, match.signature, match.filename)) + # + # results.sort(reverse=True, + # key=lambda x: (x.score, x.signature.md5sum())) + # + # return results[:1] @property def scaled(self): diff --git a/src/sourmash/index/sqlite_index.py b/src/sourmash/index/sqlite_index.py index b16eb00b59..458d40919d 100644 --- a/src/sourmash/index/sqlite_index.py +++ b/src/sourmash/index/sqlite_index.py @@ -95,9 +95,15 @@ # converters for unsigned 64-bit ints: if over MAX_SQLITE_INT, # convert to signed int. 
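The intent of these converters, as a stdlib-only sketch (BitArray from the bitstring package performs the equivalent 64-bit two's-complement reinterpretation; the helper names below are illustrative):

    MAX_SQLITE_INT = 2**63 - 1

    def to_signed64(x):
        # reinterpret a uint64 as int64 so SQLite can store it natively
        return x - 2**64 if x > MAX_SQLITE_INT else x

    def from_signed64(x):
        # recover the original uint64 on the way back out
        return x + 2**64 if x < 0 else x

    h = 2**64 - 1  # largest possible hash value
    assert from_signed64(to_signed64(h)) == h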
-MAX_SQLITE_INT = 2 ** 63 - 1 -convert_hash_to = lambda x: BitArray(uint=x, length=64).int if x > MAX_SQLITE_INT else x -convert_hash_from = lambda x: BitArray(int=x, length=64).uint if x < 0 else x +MAX_SQLITE_INT = 2**63 - 1 + + +def convert_hash_to(x): + return BitArray(uint=x, length=64).int if x > MAX_SQLITE_INT else x + + +def convert_hash_from(x): + return BitArray(int=x, length=64).uint if x < 0 else x def load_sqlite_index(filename, *, request_manifest=False): @@ -126,27 +132,29 @@ def load_sqlite_index(filename, *, request_manifest=False): is_manifest = False is_lca_db = False - if 'SqliteIndex' in internal_d: - v = internal_d['SqliteIndex'] - if v != '1.0': + if "SqliteIndex" in internal_d: + v = internal_d["SqliteIndex"] + if v != "1.0": raise IndexNotSupported is_index = True debug_literal("load_sqlite_index: it's an index!") - if is_index and 'SqliteLineage' in internal_d: - v = internal_d['SqliteLineage'] - if v != '1.0': + if is_index and "SqliteLineage" in internal_d: + v = internal_d["SqliteLineage"] + if v != "1.0": raise IndexNotSupported is_lca_db = True debug_literal("load_sqlite_index: it's got a lineage table!") - if 'SqliteManifest' in internal_d: - v = internal_d['SqliteManifest'] - if v != '1.0': + if "SqliteManifest" in internal_d: + v = internal_d["SqliteManifest"] + if v != "1.0": raise IndexNotSupported is_manifest = True - debug_literal(f"load_sqlite_index: it's a manifest! request_manifest: {request_manifest}") + debug_literal( + f"load_sqlite_index: it's a manifest! request_manifest: {request_manifest}" + ) # every Index is a Manifest! if is_index or is_lca_db: @@ -163,10 +171,10 @@ def load_sqlite_index(filename, *, request_manifest=False): debug_literal("load_sqlite_index: returning SqliteIndex") idx = SqliteIndex(filename) elif is_manifest: - managed_by_index=False + managed_by_index = False if is_index: assert request_manifest - managed_by_index=True + managed_by_index = True prefix = os.path.dirname(filename) mf = SqliteCollectionManifest(conn, managed_by_index=managed_by_index) @@ -178,7 +186,7 @@ def load_sqlite_index(filename, *, request_manifest=False): class SqliteIndex(Index): is_database = True - + # NOTE: we do not need _signatures_with_internal for this class # because it supplies a manifest directly :tada:. @@ -192,8 +200,7 @@ def __init__(self, dbfile, *, sqlite_manifest=None, conn=None): # build me a SQLite manifest class to use for selection. 
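        # Lifecycle sketch (illustrative; 'sig' is a scaled, abundance-free
        # SourmashSignature):
        #     idx = SqliteIndex.create("sigs.sqldb")
        #     idx.insert(sig)
        #     results = idx.search(sig, threshold=0.1)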
if sqlite_manifest is None: - sqlite_manifest = SqliteCollectionManifest(conn, - managed_by_index=True) + sqlite_manifest = SqliteCollectionManifest(conn, managed_by_index=True) self.manifest = sqlite_manifest self.conn = conn @@ -202,7 +209,9 @@ def __init__(self, dbfile, *, sqlite_manifest=None, conn=None): c.execute("SELECT DISTINCT scaled FROM sourmash_sketches") scaled_vals = c.fetchall() if len(scaled_vals) > 1: - raise ValueError("this database has multiple scaled values, which is not currently allowed") + raise ValueError( + "this database has multiple scaled values, which is not currently allowed" + ) if scaled_vals: self.scaled = scaled_vals[0][0] @@ -247,28 +256,35 @@ def create(cls, dbfile, *, append=False): def _create_tables(cls, c, *, ignore_exists=False): "Create sqlite tables for SqliteIndex" try: - sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', '1.0') + sqlite_utils.add_sourmash_internal(c, "SqliteIndex", "1.0") SqliteCollectionManifest._create_tables(c) - c.execute(""" + c.execute( + """ CREATE TABLE IF NOT EXISTS sourmash_hashes ( hashval INTEGER NOT NULL, sketch_id INTEGER NOT NULL, FOREIGN KEY (sketch_id) REFERENCES sourmash_sketches (id) ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_hashval_idx ON sourmash_hashes ( hashval, sketch_id ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_hashval_idx2 ON sourmash_hashes ( hashval ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_sketch_idx ON sourmash_hashes ( sketch_id ) @@ -312,18 +328,21 @@ def insert(self, ss, *, cursor=None, commit=True): raise ValueError("cannot store signatures with abundance in SqliteIndex") if self.scaled is not None and self.scaled != ss.minhash.scaled: - raise ValueError(f"this database can only store scaled values={self.scaled}") + raise ValueError( + f"this database can only store scaled values={self.scaled}" + ) elif self.scaled is None: self.scaled = ss.minhash.scaled # ok, first create and insert a manifest row - row = BaseCollectionManifest.make_manifest_row(ss, None, - include_signature=False) + row = BaseCollectionManifest.make_manifest_row( + ss, None, include_signature=False + ) self.manifest._insert_row(c, row, call_is_from_index=True) # retrieve ID of row for retrieving hashes: c.execute("SELECT last_insert_rowid()") - sketch_id, = c.fetchone() + (sketch_id,) = c.fetchone() # insert all the hashes hashes_to_sketch = [] @@ -331,8 +350,10 @@ def insert(self, ss, *, cursor=None, commit=True): hh = convert_hash_to(h) hashes_to_sketch.append((hh, sketch_id)) - c.executemany("INSERT INTO sourmash_hashes (hashval, sketch_id) VALUES (?, ?)", - hashes_to_sketch) + c.executemany( + "INSERT INTO sourmash_hashes (hashval, sketch_id) VALUES (?, ?)", + hashes_to_sketch, + ) if commit: self.conn.commit() @@ -366,30 +387,31 @@ def find(self, search_fn, query, **kwargs): picklist = None if self.manifest.selection_dict: - picklist = self.manifest.selection_dict.get('picklist') + picklist = self.manifest.selection_dict.get("picklist") c1 = self.conn.cursor() c2 = self.conn.cursor() - debug_literal('running _get_matching_sketches...') + debug_literal("running _get_matching_sketches...") t0 = time.time() - xx = self._get_matching_sketches(c1, query_mh.hashes, - query_mh._max_hash) + xx = self._get_matching_sketches(c1, query_mh.hashes, query_mh._max_hash) for sketch_id, n_matching_hashes in xx: - debug_literal(f"...got sketch {sketch_id}, with {n_matching_hashes} matching 
hashes in {time.time() - t0:.2f}") + debug_literal( + f"...got sketch {sketch_id}, with {n_matching_hashes} matching hashes in {time.time() - t0:.2f}" + ) # # first, estimate sketch size using sql results. # query_size = len(query_mh) - subj_size = self._load_sketch_size(c2, sketch_id, - query_mh._max_hash) + subj_size = self._load_sketch_size(c2, sketch_id, query_mh._max_hash) total_size = query_size + subj_size - n_matching_hashes shared_size = n_matching_hashes - score = search_fn.score_fn(query_size, shared_size, subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) - debug_literal(f"APPROX RESULT: score={score} qsize={query_size}, ssize={subj_size} total={total_size} overlap={shared_size}") + debug_literal( + f"APPROX RESULT: score={score} qsize={query_size}, ssize={subj_size} total={total_size} overlap={shared_size}" + ) # do we pass? if not search_fn.passes(score): @@ -415,8 +437,7 @@ def _select(self, *, num=0, track_abundance=False, **kwargs): # create manifest if needed manifest = self.manifest if manifest is None: - manifest = SqliteCollectionManifest(self.conn, - managed_by_index=True) + manifest = SqliteCollectionManifest(self.conn, managed_by_index=True) # modify manifest manifest = manifest.select_to_manifest(**kwargs) @@ -427,9 +448,7 @@ def select(self, *args, **kwargs): sqlite_manifest = self._select(*args, **kwargs) # return a new SqliteIndex with a new manifest, but same old conn. - return SqliteIndex(self.dbfile, - sqlite_manifest=sqlite_manifest, - conn=self.conn) + return SqliteIndex(self.dbfile, sqlite_manifest=sqlite_manifest, conn=self.conn) # # Actual SQL queries, etc. @@ -438,53 +457,77 @@ def select(self, *args, **kwargs): def _load_sketch_size(self, c1, sketch_id, max_hash): "Get sketch size for given sketch, downsampled by max_hash." if max_hash <= MAX_SQLITE_INT: - c1.execute(""" + c1.execute( + """ SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=? AND hashval >= 0 AND hashval <= ?""", - (sketch_id, max_hash)) + (sketch_id, max_hash), + ) else: - c1.execute('SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=?', - (sketch_id,)) + c1.execute( + "SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=?", + (sketch_id,), + ) - n_hashes, = c1.fetchone() + (n_hashes,) = c1.fetchone() return n_hashes def _load_sketch(self, c, sketch_id, *, match_scaled=None): "Load an individual sketch. If match_scaled is set, downsample." 
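        # e.g. a sketch stored at scaled=1000 and requested with
        # match_scaled=10000 is rebuilt at scaled = max(1000, 10000) = 10000,
        # and only hashvals <= mh._max_hash are pulled from sourmash_hashes.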
start = time.time() - c.execute(""" + c.execute( + """ SELECT id, name, scaled, ksize, filename, moltype, seed - FROM sourmash_sketches WHERE id=?""", (sketch_id,)) - debug_literal(f"load sketch {sketch_id}: got sketch info in {time.time() - start:.2f}") + FROM sourmash_sketches WHERE id=?""", + (sketch_id,), + ) + debug_literal( + f"load sketch {sketch_id}: got sketch info in {time.time() - start:.2f}" + ) sketch_id, name, scaled, ksize, filename, moltype, seed = c.fetchone() if match_scaled is not None: scaled = max(scaled, match_scaled) - is_protein = 1 if moltype=='protein' else 0 - is_dayhoff = 1 if moltype=='dayhoff' else 0 - is_hp = 1 if moltype=='hp' else 0 - - mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, - is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) - + is_protein = 1 if moltype == "protein" else 0 + is_dayhoff = 1 if moltype == "dayhoff" else 0 + is_hp = 1 if moltype == "hp" else 0 + + mh = MinHash( + n=0, + ksize=ksize, + scaled=scaled, + seed=seed, + is_protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + ) template_values = [sketch_id] hash_constraint_str = "" max_hash = mh._max_hash if max_hash <= MAX_SQLITE_INT: - hash_constraint_str = "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ? AND" + hash_constraint_str = ( + "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ? AND" + ) template_values.insert(0, max_hash) else: - debug_literal('NOT EMPLOYING hash_constraint_str') + debug_literal("NOT EMPLOYING hash_constraint_str") - debug_literal(f"finding hashes for sketch {sketch_id} in {time.time() - start:.2f}") - c.execute(f"SELECT hashval FROM sourmash_hashes WHERE {hash_constraint_str} sourmash_hashes.sketch_id=?", template_values) + debug_literal( + f"finding hashes for sketch {sketch_id} in {time.time() - start:.2f}" + ) + c.execute( + f"SELECT hashval FROM sourmash_hashes WHERE {hash_constraint_str} sourmash_hashes.sketch_id=?", + template_values, + ) - debug_literal(f"loading hashes for sketch {sketch_id} in {time.time() - start:.2f}") - for hashval, in c: + debug_literal( + f"loading hashes for sketch {sketch_id} in {time.time() - start:.2f}" + ) + for (hashval,) in c: hh = convert_hash_from(hashval) mh.add_hash(hh) @@ -495,29 +538,36 @@ def _load_sketch(self, c, sketch_id, *, match_scaled=None): def _load_sketches(self, c): "Load sketches based on manifest _id column." 
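        # e.g. a manifest row with moltype='protein' reconstructs
        # MinHash(..., is_protein=1, dayhoff=0, hp=0); moltype='DNA' leaves
        # all three flags at 0.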
for row in self.manifest.rows: - sketch_id = row['_id'] - assert row['num'] == 0 - - moltype = row['moltype'] - is_protein = 1 if moltype=='protein' else 0 - is_dayhoff = 1 if moltype=='dayhoff' else 0 - is_hp = 1 if moltype=='hp' else 0 - - ksize = row['ksize'] - scaled = row['scaled'] - seed = row['seed'] - - mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, - is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) + sketch_id = row["_id"] + assert row["num"] == 0 + + moltype = row["moltype"] + is_protein = 1 if moltype == "protein" else 0 + is_dayhoff = 1 if moltype == "dayhoff" else 0 + is_hp = 1 if moltype == "hp" else 0 + + ksize = row["ksize"] + scaled = row["scaled"] + seed = row["seed"] + + mh = MinHash( + n=0, + ksize=ksize, + scaled=scaled, + seed=seed, + is_protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + ) - c.execute("SELECT hashval FROM sourmash_hashes WHERE sketch_id=?", - (sketch_id,)) + c.execute( + "SELECT hashval FROM sourmash_hashes WHERE sketch_id=?", (sketch_id,) + ) - for hashval, in c: + for (hashval,) in c: mh.add_hash(convert_hash_from(hashval)) - ss = SourmashSignature(mh, name=row['name'], - filename=row['filename']) + ss = SourmashSignature(mh, name=row["name"], filename=row["filename"]) yield ss, self.dbfile, sketch_id def _get_matching_sketches(self, c, hashes, max_hash): @@ -529,11 +579,14 @@ def _get_matching_sketches(self, c, hashes, max_hash): because it slows things down in practice. """ c.execute("DROP TABLE IF EXISTS sourmash_hash_query") - c.execute("CREATE TEMPORARY TABLE sourmash_hash_query (hashval INTEGER PRIMARY KEY)") + c.execute( + "CREATE TEMPORARY TABLE sourmash_hash_query (hashval INTEGER PRIMARY KEY)" + ) - hashvals = [ (convert_hash_to(h),) for h in hashes ] - c.executemany("INSERT OR IGNORE INTO sourmash_hash_query (hashval) VALUES (?)", - hashvals) + hashvals = [(convert_hash_to(h),) for h in hashes] + c.executemany( + "INSERT OR IGNORE INTO sourmash_hash_query (hashval) VALUES (?)", hashvals + ) # # set up SELECT conditions @@ -550,15 +603,18 @@ def _get_matching_sketches(self, c, hashes, max_hash): template_values.append(max_hash) # format conditions - conditions.append('sourmash_hashes.hashval=sourmash_hash_query.hashval') + conditions.append("sourmash_hashes.hashval=sourmash_hash_query.hashval") conditions = " AND ".join(conditions) - c.execute(f""" + c.execute( + f""" SELECT DISTINCT sourmash_hashes.sketch_id,COUNT(sourmash_hashes.hashval) as CNT FROM sourmash_hashes, sourmash_hash_query WHERE {conditions} GROUP BY sourmash_hashes.sketch_id ORDER BY CNT DESC - """, template_values) + """, + template_values, + ) return c @@ -578,6 +634,7 @@ class SqliteCollectionManifest(BaseCollectionManifest): In the latter case, the SqliteCollectionManifest is created with managed_by_index set to True. """ + def __init__(self, conn, *, selection_dict=None, managed_by_index=False): """ Here, 'conn' should already be connected and configured. @@ -617,8 +674,9 @@ def create_or_open(cls, filename): @classmethod def load_from_manifest(cls, manifest, *, dbfile=":memory:", append=False): "Create a new sqlite manifest from an existing manifest object." 
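        # Usage sketch (illustrative): given an iterable of
        # (signature, location) pairs, create_manifest() below builds an
        # sqlite-backed manifest row by row via make_manifest_row():
        #     mf = SqliteCollectionManifest.create_manifest(pairs_iter)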
- return cls._create_manifest_from_rows(manifest.rows, location=dbfile, - append=append) + return cls._create_manifest_from_rows( + manifest.rows, location=dbfile, append=append + ) @classmethod def create_manifest(cls, locations_iter, *, include_signature=False): @@ -629,10 +687,10 @@ def create_manifest(cls, locations_iter, *, include_signature=False): Note: do NOT catch exceptions here, so this passes through load excs. Note: this method ignores 'include_signature'. """ + def rows_iter(): for ss, location in locations_iter: - row = cls.make_manifest_row(ss, location, - include_signature=False) + row = cls.make_manifest_row(ss, location, include_signature=False) yield row return cls._create_manifest_from_rows(rows_iter()) @@ -643,8 +701,9 @@ def _create_tables(cls, cursor): # this is a class method so that it can be used by SqliteIndex to # create manifest-compatible tables. - sqlite_utils.add_sourmash_internal(cursor, 'SqliteManifest', '1.0') - cursor.execute(""" + sqlite_utils.add_sourmash_internal(cursor, "SqliteManifest", "1.0") + cursor.execute( + """ CREATE TABLE sourmash_sketches (id INTEGER PRIMARY KEY, name TEXT, @@ -660,7 +719,8 @@ def _create_tables(cls, cursor): internal_location TEXT, UNIQUE(internal_location, md5sum) ) - """) + """ + ) def add_row(self, row): c = self.conn.cursor() @@ -674,18 +734,21 @@ def _insert_row(self, cursor, row, *, call_is_from_index=False): raise Exception("must use SqliteIndex.insert to add to this manifest") row = dict(row) - if 'seed' not in row: - row['seed'] = 42 + if "seed" not in row: + row["seed"] = 42 - cursor.execute(""" + cursor.execute( + """ INSERT OR IGNORE INTO sourmash_sketches (name, num, scaled, ksize, filename, md5sum, moltype, seed, n_hashes, with_abundance, internal_location) VALUES (:name, :num, :scaled, :ksize, :filename, :md5, :moltype, :seed, :n_hashes, :with_abundance, - :internal_location)""", row) + :internal_location)""", + row, + ) - self._num_rows = None # reset cache + self._num_rows = None # reset cache def __bool__(self): "Is this manifest empty?" @@ -700,7 +763,7 @@ def __bool__(self): def __eq__(self, other): "Check equality on a row-by-row basis. May fail on out-of-order rows." - for (a, b) in itertools.zip_longest(self.rows, other.rows): + for a, b in itertools.zip_longest(self.rows, other.rows): # ignore non-required keys. 
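# Illustrative sketch: _insert_row() above relies on sqlite3's named-placeholder
# binding, where a dict supplies the :name-style parameters and a missing
# 'seed' is defaulted first. A stdlib-only version of the pattern (hypothetical
# table and columns):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE sketches (name TEXT, ksize INTEGER, seed INTEGER)")

row = {"name": "sig1", "ksize": 31}
row.setdefault("seed", 42)  # same defaulting as _insert_row does for 'seed'

conn.execute(
    "INSERT INTO sketches (name, ksize, seed) VALUES (:name, :ksize, :seed)", row
)
print(conn.execute("SELECT * FROM sketches").fetchone())  # ('sig1', 31, 42)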
for k in self.required_keys: if a[k] != b[k]: @@ -749,21 +812,21 @@ def _make_select(self): picklist = None if self.selection_dict: select_d = self.selection_dict - if 'ksize' in select_d and select_d['ksize']: + if "ksize" in select_d and select_d["ksize"]: conditions.append("sourmash_sketches.ksize = ?") - values.append(select_d['ksize']) - if 'num' in select_d and select_d['num'] > 0: + values.append(select_d["ksize"]) + if "num" in select_d and select_d["num"] > 0: conditions.append("sourmash_sketches.num > 0") - if 'scaled' in select_d and select_d['scaled'] > 0: + if "scaled" in select_d and select_d["scaled"] > 0: conditions.append("sourmash_sketches.scaled > 0") - if 'containment' in select_d and select_d['containment']: + if "containment" in select_d and select_d["containment"]: conditions.append("sourmash_sketches.scaled > 0") - if 'moltype' in select_d and select_d['moltype'] is not None: - moltype = select_d['moltype'] - assert moltype in ('DNA', 'protein', 'dayhoff', 'hp'), moltype + if "moltype" in select_d and select_d["moltype"] is not None: + moltype = select_d["moltype"] + assert moltype in ("DNA", "protein", "dayhoff", "hp"), moltype conditions.append(f"sourmash_sketches.moltype = '{moltype}'") - picklist = select_d.get('picklist') + picklist = select_d.get("picklist") return conditions, values, picklist @@ -784,10 +847,10 @@ def select_to_manifest(self, **kwargs): new_mf = SqliteCollectionManifest(self.conn, selection_dict=kwargs) # if picklist, make sure we fill in 'found'. - picklist = kwargs.get('picklist') + picklist = kwargs.get("picklist") if picklist is not None: debug_literal("sqlite manifest: iterating through picklist") - _ = len(self) # this forces iteration through rows. + _ = len(self) # this forces iteration through rows. return new_mf @@ -803,19 +866,43 @@ def rows(self): conditions = "" debug_literal(f"sqlite manifest rows: executing select with '{conditions}'") - c1.execute(f""" + c1.execute( + f""" SELECT id, name, md5sum, num, scaled, ksize, filename, moltype, seed, n_hashes, internal_location FROM sourmash_sketches {conditions} - """, values) + """, + values, + ) debug_literal("sqlite manifest: entering row yield loop") - for (_id, name, md5sum, num, scaled, ksize, filename, moltype, - seed, n_hashes, iloc) in c1: - row = dict(num=num, scaled=scaled, name=name, filename=filename, - n_hashes=n_hashes, with_abundance=0, ksize=ksize, - md5=md5sum, internal_location=iloc, - moltype=moltype, md5short=md5sum[:8], - seed=seed, _id=_id) + for ( + _id, + name, + md5sum, + num, + scaled, + ksize, + filename, + moltype, + seed, + n_hashes, + iloc, + ) in c1: + row = dict( + num=num, + scaled=scaled, + name=name, + filename=filename, + n_hashes=n_hashes, + with_abundance=0, + ksize=ksize, + md5=md5sum, + internal_location=iloc, + moltype=moltype, + md5short=md5sum[:8], + seed=seed, + _id=_id, + ) if picklist is None or picklist.matches_manifest_row(row): yield row @@ -824,6 +911,7 @@ def filter_rows(self, row_filter_fn): This is done in memory, inserting each row one at a time. """ + def rows_iter(): for row in self.rows: if row_filter_fn(row): @@ -833,9 +921,11 @@ def rows_iter(): def filter_on_columns(self, col_filter_fn, col_names): "Create a new manifest based on column matches." 
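# Illustrative sketch: _make_select() above accumulates SQL fragments and their
# bound values in parallel lists. A standalone version of the pattern
# (hypothetical column names; note this sketch binds moltype as a parameter,
# where the original interpolates it after an assert):

def make_select(select_d):
    conditions, values = [], []
    if select_d.get("ksize"):
        conditions.append("sketches.ksize = ?")
        values.append(select_d["ksize"])
    if select_d.get("moltype") is not None:
        conditions.append("sketches.moltype = ?")
        values.append(select_d["moltype"])
    where = " AND ".join(conditions)
    return (f"WHERE {where}" if where else ""), values

print(make_select({"ksize": 31, "moltype": "DNA"}))
# -> ('WHERE sketches.ksize = ? AND sketches.moltype = ?', [31, 'DNA'])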
+ def row_filter_fn(row): - x = [ row[col] for col in col_names if row[col] is not None ] + x = [row[col] for col in col_names if row[col] is not None] return col_filter_fn(x) + return self.filter_rows(row_filter_fn) def locations(self): @@ -856,20 +946,22 @@ def locations(self): else: conditions = "" - c1.execute(f""" + c1.execute( + f""" SELECT DISTINCT internal_location FROM sourmash_sketches {conditions} - """, values) + """, + values, + ) - return ( iloc for iloc, in c1 ) + return (iloc for (iloc,) in c1) def __contains__(self, ss): "Check to see if signature 'ss' is in this manifest." md5 = ss.md5sum() c = self.conn.cursor() - c.execute('SELECT COUNT(*) FROM sourmash_sketches WHERE md5sum=?', - (md5,)) - val, = c.fetchone() + c.execute("SELECT COUNT(*) FROM sourmash_sketches WHERE md5sum=?", (md5,)) + (val,) = c.fetchone() if bool(val): picklist = self.picklist @@ -880,18 +972,19 @@ def __contains__(self, ss): def picklist(self): "Return the picklist, if any." if self.selection_dict: - return self.selection_dict.get('picklist') + return self.selection_dict.get("picklist") return None def to_picklist(self): "Convert this manifest to a picklist." - pl = SignaturePicklist('manifest') - pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + pl = SignaturePicklist("manifest") + pl.pickset = {pl._get_value_for_manifest_row(row) for row in self.rows} return pl @classmethod - def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", - append=False): + def _create_manifest_from_rows( + cls, rows_iter, *, location=":memory:", append=False + ): """Create a SqliteCollectionManifest from a rows iterator. Internal utility function. @@ -903,7 +996,9 @@ def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", mf = cls.create(location) except (sqlite3.OperationalError, sqlite3.DatabaseError) as exc: if not append: - raise Exception(f"cannot create sqlite3 db at '{location}'; exception: {str(exc)}") + raise Exception( + f"cannot create sqlite3 db at '{location}'; exception: {str(exc)}" + ) db = load_sqlite_index(location, request_manifest=True) mf = db.manifest @@ -920,6 +1015,7 @@ class LCA_SqliteDatabase(SqliteIndex): """ A wrapper class for SqliteIndex + lineage db => LCA_Database functionality. """ + is_database = True def __init__(self, dbfile, *, lineage_db=None, sqlite_manifest=None): @@ -929,10 +1025,12 @@ def __init__(self, dbfile, *, lineage_db=None, sqlite_manifest=None): c = self.conn.cursor() - c.execute('SELECT DISTINCT ksize, moltype FROM sourmash_sketches') + c.execute("SELECT DISTINCT ksize, moltype FROM sourmash_sketches") res = list(c) if len(res) > 1: - raise TypeError("can only have one ksize & moltype in an LCA_SqliteDatabase") + raise TypeError( + "can only have one ksize & moltype in an LCA_SqliteDatabase" + ) if len(res) == 0: raise ValueError("cannot load an LCA_SqliteDatabase") @@ -996,20 +1094,20 @@ def _build_index(self): lid_to_lineage = {} for row in mf.rows: - name = row['name'] + name = row["name"] if name: # this is a bit of a hack. we try identifiers _with_ and # _without_ versions, and take whichever works. There is # definitely a better way to do this, but I can't think # of one right now. 
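# Illustrative sketch: the comment above introduces a lookup that tries each
# identifier first with, then without, its trailing version. A tiny version of
# that fallback (hypothetical lineage mapping; slightly simplified from
# _build_index):

def lookup_lineage(name, lineage_db):
    ident = name.split(" ")[0]           # identifier is the first word
    lineage = lineage_db.get(ident)      # try the versioned form first
    if lineage is None:
        ident = ident.split(".")[0]      # then strip the '.N' version suffix
        lineage = lineage_db.get(ident)
    return ident, lineage

db = {"GCF_000005845": ("superkingdom", "Bacteria")}
print(lookup_lineage("GCF_000005845.2 Escherichia coli", db))
# -> ('GCF_000005845', ('superkingdom', 'Bacteria'))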
- ident = name.split(' ')[0] + ident = name.split(" ")[0] - lineage = lineage_db.get(ident) # try with identifier version - if lineage is None: # nope - remove version.x - ident = name.split('.')[0] + lineage = lineage_db.get(ident) # try with identifier version + if lineage is None: # nope - remove version.x + ident = name.split(".")[0] lineage = lineage_db.get(ident) - idx = row['_id'] # this is only present in sqlite manifests. + idx = row["_id"] # this is only present in sqlite manifests. ident_to_idx[ident] = idx if lineage: @@ -1038,16 +1136,16 @@ def insert(self, *args, **kwargs): def select(self, *args, **kwargs): sqlite_manifest = self._select(*args, **kwargs) - return LCA_SqliteDatabase(self.dbfile, - sqlite_manifest=sqlite_manifest, - lineage_db=self.lineage_db) + return LCA_SqliteDatabase( + self.dbfile, sqlite_manifest=sqlite_manifest, lineage_db=self.lineage_db + ) ### LCA_Database API/protocol. def downsample_scaled(self, scaled): "Downsample the scaled for querying." if scaled < self.scaled: - raise ValueError("cannot decrease scaled from {} to {}".format(self.scaled, scaled)) + raise ValueError(f"cannot decrease scaled from {self.scaled} to {scaled}") # CTB: maybe return a new LCA_Database? Right now this isn't how # the lca_db protocol works tho. @@ -1097,17 +1195,18 @@ def get_identifiers_for_hashval(self, hashval): class _SqliteIndexHashvalToIndex: """ - Internal wrapper class to retrieve keys and key/value pairs for + Internal wrapper class to retrieve keys and key/value pairs for hashval -> [ list of idx ]. """ + def __init__(self, sqlidx): self.sqlidx = sqlidx def __iter__(self): "Get all hashvals." c = self.sqlidx.conn.cursor() - c.execute('SELECT DISTINCT hashval FROM sourmash_hashes') - for hashval, in c: + c.execute("SELECT DISTINCT hashval FROM sourmash_hashes") + for (hashval,) in c: yield hashval def get(self, key, dv=None): @@ -1117,10 +1216,9 @@ def get(self, key, dv=None): hh = convert_hash_to(key) - c.execute('SELECT sketch_id FROM sourmash_hashes WHERE hashval=?', - (hh,)) + c.execute("SELECT sketch_id FROM sourmash_hashes WHERE hashval=?", (hh,)) - x = [ convert_hash_from(h) for h, in c ] + x = [convert_hash_from(h) for (h,) in c] return x or dv def __getitem__(self, key): diff --git a/src/sourmash/lca/__init__.py b/src/sourmash/lca/__init__.py index b2a9af2589..82b468c424 100644 --- a/src/sourmash/lca/__init__.py +++ b/src/sourmash/lca/__init__.py @@ -1,13 +1,18 @@ "LCA and reverse index utilities." from .lca_db import LCA_Database -from .lca_utils import (taxlist, zip_lineage, build_tree, find_lca, - gather_assignments, display_lineage, - count_lca_for_assignments) +from .lca_utils import ( + taxlist, + zip_lineage, + build_tree, + find_lca, + gather_assignments, + display_lineage, + count_lca_for_assignments, +) from .command_index import index from .command_classify import classify from .command_summarize import summarize_main from .command_rankinfo import rankinfo_main from .__main__ import main - diff --git a/src/sourmash/lca/__main__.py b/src/sourmash/lca/__main__.py index b02b891771..73faa36019 100644 --- a/src/sourmash/lca/__main__.py +++ b/src/sourmash/lca/__main__.py @@ -9,7 +9,7 @@ from .command_compare_csv import compare_csv from ..logging import set_quiet, error -usage=''' +usage = """ sourmash lca [] - work with taxonomic information. ** Commands can be: @@ -23,14 +23,15 @@ ** Use '-h' to get subcommand-specific help, e.g. 
sourmash lca index -h -''' +""" + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") return mainmethod(args) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/src/sourmash/lca/command_classify.py b/src/sourmash/lca/command_classify.py index cf5605be72..4ea5ae69ec 100644 --- a/src/sourmash/lca/command_classify.py +++ b/src/sourmash/lca/command_classify.py @@ -11,7 +11,7 @@ from . import lca_utils from .lca_utils import check_files_exist -DEFAULT_THRESHOLD=5 # how many counts of a taxid at min +DEFAULT_THRESHOLD = 5 # how many counts of a taxid at min def classify_signature(query_sig, dblist, threshold, majority): @@ -33,10 +33,9 @@ def classify_signature(query_sig, dblist, threshold, majority): shows up, and filter out low-abundance ones (under threshold). Then, determine the LCA of all of those. - """ + """ # gather assignments from across all the databases - assignments = lca_utils.gather_assignments(query_sig.minhash.hashes, - dblist) + assignments = lca_utils.gather_assignments(query_sig.minhash.hashes, dblist) # now convert to trees -> do LCA & counts counts = lca_utils.count_lca_for_assignments(assignments) @@ -59,20 +58,20 @@ def classify_signature(query_sig, dblist, threshold, majority): # update tree with this set of assignments lca_utils.build_tree([lca], tree) - status = 'nomatch' + status = "nomatch" if not tree: return [], status # now find lowest-common-ancestor of the resulting tree. lca, reason = lca_utils.find_lca(tree) - if reason == 0: # leaf node - debug('END', lca) - status = 'found' - else: # internal node => disagreement - debug('MULTI', lca) - status = 'disagree' + if reason == 0: # leaf node + debug("END", lca) + status = "found" + else: # internal node => disagreement + debug("MULTI", lca) + status = "disagree" - debug('lineage is:', lca) + debug("lineage is:", lca) return lca, status @@ -82,7 +81,7 @@ def classify(args): main single-genome classification function. """ if not args.db: - error('Error! must specify at least one LCA database with --db') + error("Error! must specify at least one LCA database with --db") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -98,7 +97,7 @@ def classify(args): dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled) # find all the queries - notify('finding query signatures...') + notify("finding query signatures...") inp_files = list(args.query) if args.query_from_file: more_files = sourmash_args.load_pathlist_from_file(args.query_from_file) @@ -108,7 +107,9 @@ def classify(args): sys.exit(-1) if not inp_files: - error('Error! must specify at least one query signature with --query or --query-from-file') + error( + "Error! must specify at least one query signature with --query or --query-from-file" + ) sys.exit(-1) # set up output @@ -117,7 +118,7 @@ def classify(args): with sourmash_args.FileOutputCSV(args.output) as outfp: csvfp = csv.writer(outfp) - csvfp.writerow(['ID','status'] + list(lca_utils.taxlist())) + csvfp.writerow(["ID", "status"] + list(lca_utils.taxlist())) # for each query, gather all the matches across databases total_count = 0 @@ -125,11 +126,10 @@ def classify(args): total_n = len(inp_files) for query_filename in inp_files: n += 1 - for query_sig in load_file_as_signatures(query_filename, - ksize=ksize): - notify(u'\r\033[K', end=u'') - notify(f'... 
classifying {query_sig} (file {n} of {total_n})', end='\r') - debug('classifying', query_sig) + for query_sig in load_file_as_signatures(query_filename, ksize=ksize): + notify("\r\033[K", end="") + notify(f"... classifying {query_sig} (file {n} of {total_n})", end="\r") + debug("classifying", query_sig) total_count += 1 # make sure we're looking at the same scaled value as database @@ -139,8 +139,9 @@ def classify(args): query_sig.minhash = downsample_mh # do the classification - lineage, status = classify_signature(query_sig, dblist, - args.threshold, args.majority) + lineage, status = classify_signature( + query_sig, dblist, args.threshold, args.majority + ) debug(lineage) # output each classification to the spreadsheet @@ -149,12 +150,12 @@ def classify(args): # when outputting to stdout, make output intelligible if not args.output: - notify(u'\r\033[K', end=u'') + notify("\r\033[K", end="") csvfp.writerow(row) - notify(u'\r\033[K', end=u'') - notify(f'classified {total_count} signatures total') + notify("\r\033[K", end="") + notify(f"classified {total_count} signatures total") -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(classify(sys.argv[1:])) diff --git a/src/sourmash/lca/command_compare_csv.py b/src/sourmash/lca/command_compare_csv.py index 99b7f8211a..c8018256f0 100644 --- a/src/sourmash/lca/command_compare_csv.py +++ b/src/sourmash/lca/command_compare_csv.py @@ -13,44 +13,50 @@ def compare_csv(args): if args.start_column < 2: - error('error, --start-column cannot be less than 2') + error("error, --start-column cannot be less than 2") sys.exit(-1) set_quiet(args.quiet, args.debug) # first, load classify-style spreadsheet - notify(f'loading classify output from: {args.csv1}') - assignments0, num_rows0 = load_taxonomy_assignments(args.csv1, - start_column=3, - force=args.force) + notify(f"loading classify output from: {args.csv1}") + assignments0, num_rows0 = load_taxonomy_assignments( + args.csv1, start_column=3, force=args.force + ) - notify(f'loaded {len(set(assignments0.values()))} distinct lineages, {num_rows0} rows') - notify('----') + notify( + f"loaded {len(set(assignments0.values()))} distinct lineages, {num_rows0} rows" + ) + notify("----") # next, load custom taxonomy spreadsheet - delimiter = ',' + delimiter = "," if args.tabs: - delimiter = '\t' + delimiter = "\t" - notify(f'loading custom spreadsheet from: {args.csv2}') - assignments, num_rows = load_taxonomy_assignments(args.csv2, - delimiter=delimiter, - start_column=args.start_column, - use_headers=not args.no_headers, - force=args.force) - notify(f'loaded {len(set(assignments.values()))} distinct lineages, {num_rows} rows') + notify(f"loading custom spreadsheet from: {args.csv2}") + assignments, num_rows = load_taxonomy_assignments( + args.csv2, + delimiter=delimiter, + start_column=args.start_column, + use_headers=not args.no_headers, + force=args.force, + ) + notify( + f"loaded {len(set(assignments.values()))} distinct lineages, {num_rows} rows" + ) # now, compute basic differences: missing_1 = set(assignments0.keys()) - set(assignments.keys()) missing_2 = set(assignments.keys()) - set(assignments0.keys()) if missing_2: - notify(f'missing {len(missing_2)} assignments in classify spreadsheet.') + notify(f"missing {len(missing_2)} assignments in classify spreadsheet.") if missing_1: - notify(f'missing {len(missing_1)} assignments in custom spreadsheet.') + notify(f"missing {len(missing_1)} assignments in custom spreadsheet.") if missing_1 or missing_2: - notify('(these will not be evaluated any 
further)') + notify("(these will not be evaluated any further)") else: - notify('note: all IDs are in both spreadsheets!') + notify("note: all IDs are in both spreadsheets!") # next, look at differences in lineages common = set(assignments0.keys()) @@ -71,7 +77,7 @@ def compare_csv(args): lca_utils.build_tree([v1], tree) lca, reason = lca_utils.find_lca(tree) - if reason == 0: # compatible lineages + if reason == 0: # compatible lineages n_compat += 1 print_results("{},compatible,{}", k, ";".join(zip_lineage(lca))) else: @@ -88,8 +94,8 @@ def compare_csv(args): if n_incompat: for rank in lca_utils.taxlist(): - notify(f'{incompat_rank[rank]} incompatible at rank {rank}') - + notify(f"{incompat_rank[rank]} incompatible at rank {rank}") -if __name__ == '__main__': + +if __name__ == "__main__": sys.exit(compare_csv(sys.argv[1:])) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 3ee13164a8..f75a0ec8f2 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -15,10 +15,16 @@ from sourmash.sourmash_args import DEFAULT_LOAD_K -def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, - use_headers=True, force=False, - split_identifiers=False, - keep_identifier_versions=False): +def load_taxonomy_assignments( + filename, + *, + delimiter=",", + start_column=2, + use_headers=True, + force=False, + split_identifiers=False, + keep_identifier_versions=False, +): """ Load a taxonomy assignment spreadsheet into a dictionary. @@ -26,34 +32,35 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, lineage tuples. """ from sourmash.tax.tax_utils import LineagePair + # parse spreadsheet! # CTB note: can't easily switch to FileInputCSV, because of # janky way we do/don't handle headers here. See issue #2198. - fp = open(filename, newline='') + fp = open(filename, newline="") r = csv.reader(fp, delimiter=delimiter) - row_headers = ['identifiers'] - row_headers += ['_skip_']*(start_column - 2) + row_headers = ["identifiers"] + row_headers += ["_skip_"] * (start_column - 2) row_headers += list(lca_utils.taxlist()) # first check that headers are interpretable. if use_headers: - notify('examining spreadsheet headers...') + notify("examining spreadsheet headers...") first_row = next(iter(r)) n_disagree = 0 - for (column, value) in zip(row_headers, first_row): - if column == '_skip_': + for column, value in zip(row_headers, first_row): + if column == "_skip_": continue if column.lower() != value.lower(): notify(f"** assuming column '{value}' is {column} in spreadsheet") n_disagree += 1 if n_disagree > 2: - error('whoa, too many assumptions. are the headers right?') - error('expecting {}', ",".join(row_headers)) + error("whoa, too many assumptions. are the headers right?") + error("expecting {}", ",".join(row_headers)) if not force: sys.exit(-1) - notify('...continue, because --force was specified.') + notify("...continue, because --force was specified.") # convert into a lineage pair assignments = {} @@ -61,27 +68,27 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, n_species = 0 n_strains = 0 for row in r: - if row and row[0].strip(): # want non-empty row + if row and row[0].strip(): # want non-empty row num_rows += 1 lineage = list(zip(row_headers, row)) - lineage = [ x for x in lineage if x[0] != '_skip_' ] + lineage = [x for x in lineage if x[0] != "_skip_"] ident = lineage[0][1] lineage = lineage[1:] # fold, spindle, and mutilate ident? 
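# Illustrative sketch: just below, each spreadsheet row is zipped against the
# rank headers, null names are replaced with 'unassigned', and trailing
# unassigned ranks are trimmed off. A standalone version of that cleanup (a
# local LineagePair stands in for sourmash.tax.tax_utils.LineagePair):

from collections import namedtuple

LineagePair = namedtuple("LineagePair", ["rank", "name"])
RANKS = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]

def clean_lineage(names):
    filled = [
        "unassigned" if n is None or n.strip() in ("[Blank]", "na", "null", "") else n
        for n in names
    ]
    lineage = [LineagePair(r, n) for r, n in zip(RANKS, filled)]
    while lineage and lineage[-1].name == "unassigned":
        lineage.pop()  # drop trailing unassigned ranks
    return tuple(lineage)

print(clean_lineage(["Bacteria", "Proteobacteria", "", ""]))
# -> (LineagePair(rank='superkingdom', name='Bacteria'),
#     LineagePair(rank='phylum', name='Proteobacteria'))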
if split_identifiers: - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not keep_identifier_versions: - ident = ident.split('.')[0] + ident = ident.split(".")[0] # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] + lineage = [(a, lca_utils.filter_null(b)) for (a, b) in lineage] + lineage = [LineagePair(a, b) for (a, b) in lineage] # remove end nulls - while lineage and lineage[-1].name == 'unassigned': + while lineage and lineage[-1].name == "unassigned": lineage = lineage[:-1] # store lineage tuple @@ -90,13 +97,13 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, if ident in assignments: if assignments[ident] != tuple(lineage): if not force: - raise Exception("multiple lineages for identifier {}".format(ident)) + raise Exception(f"multiple lineages for identifier {ident}") else: assignments[ident] = tuple(lineage) - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 @@ -106,35 +113,50 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, # any more, when building a large GTDB-based database :) --CTB if len(assignments) * 0.2 > n_species and len(assignments) > 50: if not force: - error('') + error("") error("ERROR: fewer than 20% of lineages have species-level resolution!?") - error("({} species assignments found, of {} assignments total)", - n_species, len(assignments)) + error( + "({} species assignments found, of {} assignments total)", + n_species, + len(assignments), + ) error("** If this is intentional, re-run the command with -f.") sys.exit(-1) return assignments, num_rows -def generate_report(record_duplicates, record_no_lineage, record_remnants, - unused_lineages, unused_identifiers, filename): +def generate_report( + record_duplicates, + record_no_lineage, + record_remnants, + unused_lineages, + unused_identifiers, + filename, +): """ Output a report of anomalies from building the index. """ - with open(filename, 'wt') as fp: - print(f'Duplicate signatures: {len(record_duplicates)}', file=fp) + with open(filename, "w") as fp: + print(f"Duplicate signatures: {len(record_duplicates)}", file=fp) fp.write("\n".join(record_duplicates)) fp.write("\n") - print(f'----\nUnused identifiers: {len(unused_identifiers)}', file=fp) + print(f"----\nUnused identifiers: {len(unused_identifiers)}", file=fp) fp.write("\n".join(unused_identifiers)) fp.write("\n") - print(f'----\nNo lineage provided for these identifiers: {len(record_no_lineage)}', file=fp) + print( + f"----\nNo lineage provided for these identifiers: {len(record_no_lineage)}", + file=fp, + ) fp.write("\n".join(record_no_lineage)) fp.write("\n") - print(f'----\nNo signatures found for these identifiers: {len(record_remnants)}', file=fp) - fp.write('\n'.join(record_remnants)) + print( + f"----\nNo signatures found for these identifiers: {len(record_remnants)}", + file=fp, + ) + fp.write("\n".join(record_remnants)) fp.write("\n") - print(f'----\nUnused lineages: {len(unused_lineages)}', file=fp) + print(f"----\nUnused lineages: {len(unused_lineages)}", file=fp) for lineage in unused_lineages: fp.write(";".join(lca_utils.zip_lineage(lineage))) fp.write("\n") @@ -145,7 +167,7 @@ def index(args): main function for building an LCA database. 
""" if args.start_column < 2: - error('error, --start-column cannot be less than 2') + error("error, --start-column cannot be less than 2") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -155,42 +177,50 @@ def index(args): if args.ksize is None: args.ksize = DEFAULT_LOAD_K - moltype = sourmash_args.calculate_moltype(args, default='DNA') + moltype = sourmash_args.calculate_moltype(args, default="DNA") picklist = sourmash_args.load_picklist(args) db_outfile = args.lca_db_out - if args.database_format == 'json': - if not (db_outfile.endswith('.lca.json') or \ - db_outfile.endswith('.lca.json.gz')): # logic -> db.save - db_outfile += '.lca.json' + if args.database_format == "json": + if not ( + db_outfile.endswith(".lca.json") or db_outfile.endswith(".lca.json.gz") + ): # logic -> db.save + db_outfile += ".lca.json" else: - assert args.database_format == 'sql' - if not db_outfile.endswith('.lca.sql'): - db_outfile += '.lca.sql' + assert args.database_format == "sql" + if not db_outfile.endswith(".lca.sql"): + db_outfile += ".lca.sql" if os.path.exists(db_outfile): error(f"ERROR: output file {db_outfile} already exists. Not overwriting.") sys.exit(-1) - notify(f'saving to LCA DB: {format(db_outfile)}') + notify(f"saving to LCA DB: {format(db_outfile)}") - notify(f'Building LCA database with ksize={args.ksize} scaled={args.scaled} moltype={moltype}.') + notify( + f"Building LCA database with ksize={args.ksize} scaled={args.scaled} moltype={moltype}." + ) # first, load taxonomy spreadsheet - delimiter = ',' + delimiter = "," if args.tabs: - delimiter = '\t' - assignments, num_rows = load_taxonomy_assignments(args.csv, - delimiter=delimiter, - start_column=args.start_column, - use_headers=not args.no_headers, - force=args.force, - split_identifiers=args.split_identifiers, - keep_identifier_versions=args.keep_identifier_versions + delimiter = "\t" + assignments, num_rows = load_taxonomy_assignments( + args.csv, + delimiter=delimiter, + start_column=args.start_column, + use_headers=not args.no_headers, + force=args.force, + split_identifiers=args.split_identifiers, + keep_identifier_versions=args.keep_identifier_versions, ) - notify(f'{len(assignments)} distinct identities in spreadsheet out of {num_rows} rows.') - notify(f'{len(set(assignments.values()))} distinct lineages in spreadsheet out of {num_rows} rows.') + notify( + f"{len(assignments)} distinct identities in spreadsheet out of {num_rows} rows." + ) + notify( + f"{len(set(assignments.values()))} distinct lineages in spreadsheet out of {num_rows} rows." + ) db = LCA_Database(args.ksize, args.scaled, moltype) @@ -216,18 +246,28 @@ def index(args): n_skipped = 0 for filename in inp_files: n += 1 - it = load_file_as_signatures(filename, ksize=args.ksize, - select_moltype=moltype, - picklist=picklist, - yield_all_files=args.force) + it = load_file_as_signatures( + filename, + ksize=args.ksize, + select_moltype=moltype, + picklist=picklist, + yield_all_files=args.force, + ) for sig in it: - notify(u'\r\033[K', end=u'') - notify(f'\r... loading signature {str(sig)[:30]} ({n} of {total_n}); skipped {n_skipped} so far', end='') + notify("\r\033[K", end="") + notify( + f"\r... loading signature {str(sig)[:30]} ({n} of {total_n}); skipped {n_skipped} so far", + end="", + ) debug(filename, sig) # block off duplicates. 
if sig.md5sum() in md5_to_name: - debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum()) + debug( + "WARNING: in file {}, duplicate md5sum: {}; skipping", + filename, + sig.md5sum(), + ) record_duplicates.add(sig.name) continue @@ -240,13 +280,13 @@ def index(args): ident = sig.filename orig_ident = ident - if args.split_identifiers: # hack for NCBI-style names, etc. + if args.split_identifiers: # hack for NCBI-style names, etc. # split on space... - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not args.keep_identifier_versions: # ...and on period. - ident = ident.split('.')[0] + ident = ident.split(".")[0] lineage = assignments.get(ident) @@ -257,7 +297,7 @@ def index(args): if args.split_identifiers: notify(f"(Identifier extracted from name: '{orig_ident})')") sys.exit(-1) - debug('(skipping, because --require-taxonomy was specified)') + debug("(skipping, because --require-taxonomy was specified)") n_skipped += 1 continue @@ -265,8 +305,12 @@ def index(args): try: db.insert(sig, ident=ident, lineage=lineage) except ValueError as e: - error("ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.", - sig, sig.md5sum()[:8], filename) + error( + "ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.", + sig, + sig.md5sum()[:8], + filename, + ) error("ERROR: {}", str(e)) sys.exit(-1) @@ -280,35 +324,43 @@ def index(args): # track lineage info - either no lineage, or this lineage used. else: - debug('WARNING: no lineage assignment for {}.', ident) + debug("WARNING: no lineage assignment for {}.", ident) record_no_lineage.append(ident) # end main add signatures loop if n_skipped: - notify(f'... loaded {total_n} signatures; skipped {n_skipped} because of --require-taxonomy.') + notify( + f"... loaded {total_n} signatures; skipped {n_skipped} because of --require-taxonomy." + ) else: - notify(f'... loaded {total_n} signatures.') + notify(f"... loaded {total_n} signatures.") # check -- did we find any signatures? if n == 0: - error('ERROR: no signatures found. ??') + error("ERROR: no signatures found. ??") sys.exit(1) # check -- did the signatures we found have any hashes? if not db.hashvals: - error('ERROR: no hash values found - are there any signatures?') + error("ERROR: no hash values found - are there any signatures?") sys.exit(1) - notify(f'loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}') + notify( + f"loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}" + ) if picklist: sourmash_args.report_picklist(args, picklist) # summarize: - notify(f'{len(record_used_lineages)} assigned lineages out of {len(set(assignments.values()))} distinct lineages in spreadsheet.') + notify( + f"{len(record_used_lineages)} assigned lineages out of {len(set(assignments.values()))} distinct lineages in spreadsheet." + ) unused_lineages = set(assignments.values()) - record_used_lineages - notify(f'{len(record_used_idents)} identifiers used out of {len(set(assignments))} distinct identifiers in spreadsheet.') + notify( + f"{len(record_used_idents)} identifiers used out of {len(set(assignments))} distinct identifiers in spreadsheet." 
+ ) assert record_used_idents.issubset(set(assignments)) unused_identifiers = set(assignments) - record_used_idents @@ -321,25 +373,34 @@ def index(args): # output a record of stuff if requested/available: if record_duplicates or record_no_lineage or record_remnants or unused_lineages: if record_duplicates: - notify(f'WARNING: {len(record_duplicates)} duplicate signatures.') + notify(f"WARNING: {len(record_duplicates)} duplicate signatures.") if record_no_lineage: - notify(f'WARNING: no lineage provided for {len(record_no_lineage)} signatures.') + notify( + f"WARNING: no lineage provided for {len(record_no_lineage)} signatures." + ) if record_remnants: - notify(f'WARNING: no signatures for {len(record_remnants)} spreadsheet rows.') + notify( + f"WARNING: no signatures for {len(record_remnants)} spreadsheet rows." + ) if unused_lineages: - notify(f'WARNING: {len(unused_lineages)} unused lineages.') + notify(f"WARNING: {len(unused_lineages)} unused lineages.") if unused_identifiers: - notify(f'WARNING: {len(unused_identifiers)} unused identifiers.') + notify(f"WARNING: {len(unused_identifiers)} unused identifiers.") if args.report: notify(f"generating a report and saving in '{args.report}'") - generate_report(record_duplicates, record_no_lineage, - record_remnants, unused_lineages, - unused_identifiers, args.report) + generate_report( + record_duplicates, + record_no_lineage, + record_remnants, + unused_lineages, + unused_identifiers, + args.report, + ) else: - notify('(You can use --report to generate a detailed report.)') + notify("(You can use --report to generate a detailed report.)") -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(index(sys.argv[1:])) diff --git a/src/sourmash/lca/command_rankinfo.py b/src/sourmash/lca/command_rankinfo.py index 8cd4c95a71..af0dbfa9d9 100644 --- a/src/sourmash/lca/command_rankinfo.py +++ b/src/sourmash/lca/command_rankinfo.py @@ -27,7 +27,6 @@ def make_lca_counts(dblist, min_num=0): # now convert to trees -> do LCA & counts counts = defaultdict(int) for hashval, lineages in assignments.items(): - # for each list of tuple_info [(rank, name), ...] build # a tree that lets us discover lowest-common-ancestor. debug(lineages) @@ -46,7 +45,7 @@ def rankinfo_main(args): rankinfo! """ if not args.db: - error('Error! must specify at least one LCA database with --db') + error("Error! 
must specify at least one LCA database with --db") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -74,8 +73,8 @@ def rankinfo_main(args): else: for rank in lca_utils.taxlist(): count = counts_by_rank.get(rank, 0) - print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.)) + print(f"{rank}: {count} ({count / total * 100.0:.1f}%)") -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(rankinfo_main(sys.argv[1:])) diff --git a/src/sourmash/lca/command_summarize.py b/src/sourmash/lca/command_summarize.py index c571d7e141..02b57e60e1 100644 --- a/src/sourmash/lca/command_summarize.py +++ b/src/sourmash/lca/command_summarize.py @@ -13,7 +13,7 @@ from sourmash.index import MultiIndex -DEFAULT_THRESHOLD=5 +DEFAULT_THRESHOLD = 5 def summarize(hashvals, dblist, threshold, ignore_abundance): @@ -32,7 +32,7 @@ def summarize(hashvals, dblist, threshold, ignore_abundance): # now convert to trees -> do LCA & counts if not ignore_abundance: counts = lca_utils.count_lca_for_assignments(assignments, hashvals) - else: # flatten + else: # flatten counts = lca_utils.count_lca_for_assignments(assignments, None) debug(counts.most_common()) @@ -69,9 +69,10 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance): idx = idx.select(ksize=ksize) for query_sig, query_filename in idx.signatures_with_location(): - notify(u'\r\033[K', end=u'') - notify(f'... loading {query_sig} (file {n} of {total_n})', - total_n, end='\r') + notify("\r\033[K", end="") + notify( + f"... loading {query_sig} (file {n} of {total_n})", total_n, end="\r" + ) total_count += 1 if ignore_abundance and query_sig.minhash.track_abundance: @@ -82,8 +83,8 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance): count_signature(query_sig, scaled, hashvals) yield query_filename, query_sig, hashvals - notify(u'\r\033[K', end=u'') - notify(f'loaded {total_count} signatures from {n} files total.') + notify("\r\033[K", end="") + notify(f"loaded {total_count} signatures from {n} files total.") def count_signature(sig, scaled, hashvals): @@ -104,32 +105,34 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None): Output results in ~human-readable format. """ - for (lineage, count) in lineage_counts.items(): + for lineage, count in lineage_counts.items(): if lineage: lineage = lca_utils.zip_lineage(lineage, truncate_empty=True) - lineage = ';'.join(lineage) + lineage = ";".join(lineage) else: - lineage = '(root)' + lineage = "(root)" - p = count / total_counts * 100. - p = '{:.1f}%'.format(p) + p = count / total_counts * 100.0 + p = f"{p:.1f}%" - print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig)) + print_results( + f"{p:5} {count:>5} {lineage} {filename}:{sig.md5sum()[:8]} {sig}" + ) -def output_csv(lineage_counts, total_counts, csv_fp, filename, sig, - write_header=True): + +def output_csv(lineage_counts, total_counts, csv_fp, filename, sig, write_header=True): """\ Output results in CSV. 
""" w = csv.writer(csv_fp) if write_header: - headers = ['count'] + list(lca_utils.taxlist()) - headers += ['filename', 'sig_name', 'sig_md5', 'total_counts'] + headers = ["count"] + list(lca_utils.taxlist()) + headers += ["filename", "sig_name", "sig_md5", "total_counts"] w.writerow(headers) - for (lineage, count) in lineage_counts.items(): - debug('lineage:', lineage) + for lineage, count in lineage_counts.items(): + debug("lineage:", lineage) row = [count] + lca_utils.zip_lineage(lineage, truncate_empty=False) row += [filename, sig.name, sig.md5sum(), total_counts] w.writerow(row) @@ -140,7 +143,7 @@ def summarize_main(args): main summarization function. """ if not args.db: - error('Error! must specify at least one LCA database with --db') + error("Error! must specify at least one LCA database with --db") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -160,10 +163,12 @@ def summarize_main(args): # load all the databases dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled) if ignore_abundance: - notify("Ignoring any k-mer abundances in query, since --ignore-abundance given.") + notify( + "Ignoring any k-mer abundances in query, since --ignore-abundance given." + ) # find all the queries - notify('finding query signatures...') + notify("finding query signatures...") inp_files = args.query if args.query_from_file: @@ -171,7 +176,7 @@ def summarize_main(args): inp_files.extend(more_files) if not inp_files: - error('Error! must specify at least one query signature with --query') + error("Error! must specify at least one query signature with --query") sys.exit(-1) if not check_files_exist(*inp_files): @@ -181,31 +186,37 @@ def summarize_main(args): csv_fp = None write_header = True if args.output: - csv_fp = open(args.output, 'w', newline='') + csv_fp = open(args.output, "w", newline="") try: - for filename, sig, hashvals in \ - load_singletons_and_count(inp_files, ksize, scaled, ignore_abundance): - + for filename, sig, hashvals in load_singletons_and_count( + inp_files, ksize, scaled, ignore_abundance + ): # get the full counted list of lineage counts in this signature - lineage_counts = summarize(hashvals, dblist, args.threshold, - ignore_abundance) + lineage_counts = summarize( + hashvals, dblist, args.threshold, ignore_abundance + ) if not ignore_abundance: total = float(sum(hashvals.values())) else: total = float(len(hashvals)) - output_results(lineage_counts, total, - filename=filename, sig=sig) + output_results(lineage_counts, total, filename=filename, sig=sig) if csv_fp: - output_csv(lineage_counts, total, csv_fp, filename, sig, - write_header=write_header) + output_csv( + lineage_counts, + total, + csv_fp, + filename, + sig, + write_header=write_header, + ) write_header = False finally: if csv_fp: csv_fp.close() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(summarize_main(sys.argv[1:])) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index daabe3cb70..78855c71b8 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -14,6 +14,7 @@ def cached_property(fun): """A memoize decorator for class properties.""" + @functools.wraps(fun) def get(self): try: @@ -24,6 +25,7 @@ def get(self): pass ret = self._cache[fun] = fun(self) return ret + return property(get) @@ -56,13 +58,14 @@ class LCA_Database(Index): `_hashval_to_idx` is a dictionary from individual hash values to sets of `idx`. """ + is_database = True # we set manifest to None to avoid implication of fast on-disk access to # sketches. 
This may be revisited later. manifest = None - def __init__(self, ksize, scaled, moltype='DNA'): + def __init__(self, ksize, scaled, moltype="DNA"): self.ksize = int(ksize) self.scaled = int(scaled) self.filename = None @@ -98,7 +101,7 @@ def _invalidate_cache(self): Internal method. """ - if hasattr(self, '_cache'): + if hasattr(self, "_cache"): del self._cache def _get_ident_index(self, ident, fail_on_duplicate=False): @@ -108,7 +111,7 @@ def _get_ident_index(self, ident, fail_on_duplicate=False): """ idx = self._ident_to_idx.get(ident) if fail_on_duplicate: - assert idx is None # should be no duplicate identities + assert idx is None # should be no duplicate identities if idx is None: idx = self._next_index @@ -153,10 +156,18 @@ def insert(self, sig, ident=None, lineage=None): minhash = sig.minhash if minhash.ksize != self.ksize: - raise ValueError("cannot insert signature with ksize {} into DB (ksize {})".format(minhash.ksize, self.ksize)) + raise ValueError( + "cannot insert signature with ksize {} into DB (ksize {})".format( + minhash.ksize, self.ksize + ) + ) if minhash.moltype != self.moltype: - raise ValueError("cannot insert signature with moltype {} into DB (moltype {})".format(minhash.moltype, self.moltype)) + raise ValueError( + "cannot insert signature with moltype {} into DB (moltype {})".format( + minhash.moltype, self.moltype + ) + ) # downsample to specified scaled; this has the side effect of # making sure they're all at the same scaled value! @@ -169,7 +180,7 @@ def insert(self, sig, ident=None, lineage=None): ident = str(sig) if ident in self._ident_to_name: - raise ValueError("signature '{}' is already in this LCA db.".format(ident)) + raise ValueError(f"signature '{ident}' is already in this LCA db.") # before adding, invalide any caching from @cached_property self._invalidate_cache() @@ -189,7 +200,7 @@ def insert(self, sig, ident=None, lineage=None): # map idx to lid as well. self._idx_to_lid[idx] = lid except TypeError: - raise ValueError('lineage cannot be used as a key?!') + raise ValueError("lineage cannot be used as a key?!") for hashval in minhash.hashes: self._hashval_to_idx[hashval].add(idx) @@ -197,7 +208,7 @@ def insert(self, sig, ident=None, lineage=None): return len(minhash) def __repr__(self): - return "LCA_Database('{}')".format(self.filename) + return f"LCA_Database('{self.filename}')" def signatures(self): """Return all of the signatures in this LCA database. @@ -224,8 +235,16 @@ def _signatures_with_internal(self): for idx, ss in self._signatures.items(): yield ss, idx - def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, - containment=False, picklist=None): + def select( + self, + ksize=None, + moltype=None, + num=0, + scaled=0, + abund=None, + containment=False, + picklist=None, + ): """Select a subset of signatures to search. 
As with SBTs, queries with higher scaled values than the database @@ -239,12 +258,18 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, raise ValueError("cannot use 'num' MinHashes to search LCA database") if scaled > self.scaled and not containment: - raise ValueError(f"cannot use scaled={scaled} on this database (scaled={self.scaled})") + raise ValueError( + f"cannot use scaled={scaled} on this database (scaled={self.scaled})" + ) if ksize is not None and self.ksize != ksize: - raise ValueError(f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}") + raise ValueError( + f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}" + ) if moltype is not None and moltype != self.moltype: - raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}") + raise ValueError( + f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}" + ) if abund: raise ValueError("LCA databases do not support sketches with abund=True") @@ -252,7 +277,9 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, if picklist is not None: self.picklists.append(picklist) if len(self.picklists) > 1: - raise ValueError("we do not (yet) support multiple picklists for LCA databases") + raise ValueError( + "we do not (yet) support multiple picklists for LCA databases" + ) return self @@ -266,24 +293,27 @@ def load(cls, db_name): from sourmash.tax.tax_utils import LineagePair if not os.path.isfile(db_name): - raise ValueError(f"'{db_name}' is not a file and cannot be loaded as an LCA database") + raise ValueError( + f"'{db_name}' is not a file and cannot be loaded as an LCA database" + ) try: from sourmash.index.sqlite_index import LCA_SqliteDatabase + return LCA_SqliteDatabase.load(db_name) except ValueError: pass xopen = open - if db_name.endswith('.gz'): + if db_name.endswith(".gz"): xopen = gzip.open - with xopen(db_name, 'rt') as fp: + with xopen(db_name, "rt") as fp: try: first_ch = fp.read(1) except ValueError: - first_ch = 'X' - if not first_ch or first_ch[0] != '{': + first_ch = "X" + if not first_ch or first_ch[0] != "{": raise ValueError(f"'{db_name}' is not an LCA database file.") fp.seek(0) @@ -295,41 +325,45 @@ def load(cls, db_name): pass if not load_d: - raise ValueError("cannot parse database file '{}' as JSON; invalid format.") + raise ValueError( + "cannot parse database file '{}' as JSON; invalid format." + ) version = None db_type = None try: - version = load_d.get('version') - db_type = load_d.get('type') + version = load_d.get("version") + db_type = load_d.get("type") except AttributeError: pass - if db_type != 'sourmash_lca': - raise ValueError("database file '{}' is not an LCA db.".format(db_name)) + if db_type != "sourmash_lca": + raise ValueError(f"database file '{db_name}' is not an LCA db.") version = float(version) - if version < 2.0 or 'lid_to_lineage' not in load_d: - raise ValueError("Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one.") - - ksize = int(load_d['ksize']) - scaled = int(load_d['scaled']) - moltype = load_d.get('moltype', 'DNA') - if moltype != 'DNA': + if version < 2.0 or "lid_to_lineage" not in load_d: + raise ValueError( + "Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one." 
+ ) + + ksize = int(load_d["ksize"]) + scaled = int(load_d["scaled"]) + moltype = load_d.get("moltype", "DNA") + if moltype != "DNA": assert ksize % 3 == 0 ksize = int(ksize / 3) db = cls(ksize, scaled, moltype) # convert lineage_dict to proper lineages (tuples of LineagePairs) - lid_to_lineage_2 = load_d['lid_to_lineage'] + lid_to_lineage_2 = load_d["lid_to_lineage"] lid_to_lineage = {} lineage_to_lid = {} for k, v in lid_to_lineage_2.items(): - v = dict( ((x[0], x[1]) for x in v) ) + v = dict((x[0], x[1]) for x in v) vv = [] for rank in taxlist(): - name = v.get(rank, '') + name = v.get(rank, "") vv.append(LineagePair(rank, name)) vv = tuple(vv) @@ -340,18 +374,18 @@ def load(cls, db_name): # convert hashval -> lineage index keys to integers (looks like # JSON doesn't have a 64 bit type so stores them as strings) - hashval_to_idx_2 = load_d['hashval_to_idx'] + hashval_to_idx_2 = load_d["hashval_to_idx"] hashval_to_idx = {} for k, v in hashval_to_idx_2.items(): hashval_to_idx[int(k)] = v db._hashval_to_idx = hashval_to_idx - db._ident_to_name = load_d['ident_to_name'] - db._ident_to_idx = load_d['ident_to_idx'] + db._ident_to_name = load_d["ident_to_name"] + db._ident_to_idx = load_d["ident_to_idx"] db._idx_to_lid = {} - for k, v in load_d['idx_to_lid'].items(): + for k, v in load_d["idx_to_lid"].items(): db._idx_to_lid[int(k)] = v if db._ident_to_idx: @@ -367,11 +401,11 @@ def load(cls, db_name): return db - def save(self, db_name, *, format='json'): - if format == 'sql': + def save(self, db_name, *, format="json"): + if format == "sql": self.save_to_sql(db_name) else: - assert format == 'json' + assert format == "json" self.save_to_json(db_name) def save_to_json(self, db_name): @@ -380,42 +414,45 @@ def save_to_json(self, db_name): Method specific to this class. 
""" if os.path.exists(db_name): - raise ValueError(f"LCA database {db_name} already exists; not overwriting or appending") + raise ValueError( + f"LCA database {db_name} already exists; not overwriting or appending" + ) xopen = open - if db_name.endswith('.gz'): + if db_name.endswith(".gz"): xopen = gzip.open - with xopen(db_name, 'wt') as fp: + with xopen(db_name, "wt") as fp: # use an OrderedDict to preserve output order save_d = OrderedDict() - save_d['version'] = '2.1' - save_d['type'] = 'sourmash_lca' - save_d['license'] = 'CC0' + save_d["version"] = "2.1" + save_d["type"] = "sourmash_lca" + save_d["license"] = "CC0" - if self.moltype != 'DNA': - ksize = self.ksize*3 + if self.moltype != "DNA": + ksize = self.ksize * 3 else: ksize = self.ksize - save_d['ksize'] = ksize - save_d['scaled'] = self.scaled - save_d['moltype'] = self.moltype + save_d["ksize"] = ksize + save_d["scaled"] = self.scaled + save_d["moltype"] = self.moltype # convert lineage internals from tuples to dictionaries d = OrderedDict() for k, v in self._lid_to_lineage.items(): - d[k] = dict([ (vv.rank, vv.name) for vv in v ]) - save_d['lid_to_lineage'] = d + d[k] = dict([(vv.rank, vv.name) for vv in v]) + save_d["lid_to_lineage"] = d # convert values from sets to lists, so that JSON knows how to save - save_d['hashval_to_idx'] = \ - dict((k, list(v)) for (k, v) in self._hashval_to_idx.items()) - - save_d['ident_to_name'] = self._ident_to_name - save_d['ident_to_idx'] = self._ident_to_idx - save_d['idx_to_lid'] = self._idx_to_lid - save_d['lid_to_lineage'] = self._lid_to_lineage - + save_d["hashval_to_idx"] = dict( + (k, list(v)) for (k, v) in self._hashval_to_idx.items() + ) + + save_d["ident_to_name"] = self._ident_to_name + save_d["ident_to_idx"] = self._ident_to_idx + save_d["idx_to_lid"] = self._idx_to_lid + save_d["lid_to_lineage"] = self._lid_to_lineage + json.dump(save_d, fp) def save_to_sql(self, dbname): @@ -424,11 +461,13 @@ def save_to_sql(self, dbname): from sourmash.tax.tax_utils import LineageDB if os.path.exists(dbname): - raise ValueError(f"LCA database {dbname} already exists; not overwriting or appending") + raise ValueError( + f"LCA database {dbname} already exists; not overwriting or appending" + ) # create a new in-memory lineage db... 
assignments = {} - available_ranks = set() # track ranks, too + available_ranks = set() # track ranks, too for ident, idx in self._ident_to_idx.items(): lid = self._idx_to_lid.get(idx) if lid is not None: @@ -454,7 +493,7 @@ def downsample_scaled(self, scaled): if scaled == self.scaled: return elif scaled < self.scaled: - raise ValueError("cannot decrease scaled from {} to {}".format(self.scaled, scaled)) + raise ValueError(f"cannot decrease scaled from {self.scaled} to {scaled}") self._invalidate_cache() @@ -513,22 +552,28 @@ def _signatures(self): is_protein = False is_hp = False is_dayhoff = False - if self.moltype == 'protein': + if self.moltype == "protein": is_protein = True - elif self.moltype == 'hp': + elif self.moltype == "hp": is_hp = True - elif self.moltype == 'dayhoff': + elif self.moltype == "dayhoff": is_dayhoff = True - minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled, - is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff) + minhash = MinHash( + n=0, + ksize=self.ksize, + scaled=self.scaled, + is_protein=is_protein, + hp=is_hp, + dayhoff=is_dayhoff, + ) - debug('creating signatures for LCA DB...') + debug("creating signatures for LCA DB...") mhd = defaultdict(minhash.copy_and_clear) temp_vals = defaultdict(list) # invert the hashval_to_idx dictionary - for (hashval, idlist) in self._hashval_to_idx.items(): + for hashval, idlist in self._hashval_to_idx.items(): for idx in idlist: temp_hashes = temp_vals[idx] temp_hashes.append(hashval) @@ -559,7 +604,7 @@ def _signatures(self): if passes_all_picklists(ss, self.picklists): sigd[idx] = ss - debug('=> {} signatures!', len(sigd)) + debug("=> {} signatures!", len(sigd)) return sigd def find(self, search_fn, query, **kwargs): @@ -582,9 +627,13 @@ def find(self, search_fn, query, **kwargs): if self.scaled > query_scaled: query_mh = query_mh.downsample(scaled=self.scaled) query_scaled = query_mh.scaled - prepare_subject = lambda x: x # identity + + def prepare_subject(x): + return x # identity else: - prepare_subject = lambda subj: subj.downsample(scaled=query_scaled) + + def prepare_subject(subj): + return subj.downsample(scaled=query_scaled) # collect matching hashes for the query: c = Counter() @@ -594,7 +643,7 @@ def find(self, search_fn, query, **kwargs): for idx in idx_list: c[idx] += 1 - debug('number of matching signatures for hashes: {}', len(c)) + debug("number of matching signatures for hashes: {}", len(c)) # for each match, in order of largest overlap, for idx, count in c.most_common(): @@ -604,7 +653,7 @@ def find(self, search_fn, query, **kwargs): # this piecemeal by iterating across all the hashes, instead. subj = self._signatures.get(idx) - if subj is None: # must be because of a picklist exclusion + if subj is None: # must be because of a picklist exclusion assert self.picklists continue @@ -616,8 +665,7 @@ def find(self, search_fn, query, **kwargs): shared_size = query_mh.count_common(subj_mh) total_size = len(query_mh + subj_mh) - score = search_fn.score_fn(query_size, shared_size, subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) # CTB note to self: even with JaccardSearchBestOnly, this will # still iterate over & score all signatures. We should come @@ -671,14 +719,14 @@ def load_databases(filenames, scaled=None, verbose=True): # load all the databases for db_name in filenames: if verbose: - notify(u'\r\033[K', end=u'') - notify(f'... loading database {format(db_name)}', end='\r') + notify("\r\033[K", end="") + notify(f"... 
loading database {format(db_name)}", end="\r") lca_db = LCA_Database.load(db_name) ksize_vals.add(lca_db.ksize) if len(ksize_vals) > 1: - raise Exception('multiple ksizes, quitting') + raise Exception("multiple ksizes, quitting") if scaled and scaled > lca_db.scaled: lca_db.downsample_scaled(scaled) @@ -686,7 +734,7 @@ def load_databases(filenames, scaled=None, verbose=True): moltype_vals.add(lca_db.moltype) if len(moltype_vals) > 1: - raise Exception('multiple moltypes, quitting') + raise Exception("multiple moltypes, quitting") dblist.append(lca_db) @@ -695,7 +743,9 @@ def load_databases(filenames, scaled=None, verbose=True): moltype = moltype_vals.pop() if verbose: - notify(u'\r\033[K', end=u'') - notify(f'loaded {len(dblist)} LCA databases. ksize={ksize}, scaled={scaled} moltype={moltype}') + notify("\r\033[K", end="") + notify( + f"loaded {len(dblist)} LCA databases. ksize={ksize}, scaled={scaled} moltype={moltype}" + ) return dblist, ksize, scaled diff --git a/src/sourmash/lca/lca_utils.py b/src/sourmash/lca/lca_utils.py index 8ee9340ed7..70b883bb7d 100644 --- a/src/sourmash/lca/lca_utils.py +++ b/src/sourmash/lca/lca_utils.py @@ -7,12 +7,23 @@ from .lca_db import LCA_Database, load_single_database, load_databases -__all__ = ['taxlist', 'zip_lineage', 'build_tree', 'find_lca', - 'load_single_database', 'load_databases', 'gather_assignments', - 'count_lca_for_assignments', 'LineagePair', 'display_lineage', - 'make_lineage', 'pop_to_rank', 'is_lineage_match'] - -try: # py2/py3 compat +__all__ = [ + "taxlist", + "zip_lineage", + "build_tree", + "find_lca", + "load_single_database", + "load_databases", + "gather_assignments", + "count_lca_for_assignments", + "LineagePair", + "display_lineage", + "make_lineage", + "pop_to_rank", + "is_lineage_match", +] + +try: # py2/py3 compat from itertools import zip_longest except ImportError: from itertools import izip_longest as zip_longest @@ -20,7 +31,7 @@ from sourmash.logging import notify, error, debug # type to store an element in a taxonomic lineage -LineagePair = namedtuple('LineagePair', ['rank', 'name']) +LineagePair = namedtuple("LineagePair", ["rank", "name"]) def check_files_exist(*files): @@ -32,8 +43,12 @@ def check_files_exist(*files): ret = False if len(not_found): - error('Error! Could not find the following files.' - ' Make sure the file paths are specified correctly.\n{}'.format('\n'.join(not_found))) + error( + "Error! Could not find the following files." + " Make sure the file paths are specified correctly.\n{}".format( + "\n".join(not_found) + ) + ) return ret @@ -43,11 +58,17 @@ def taxlist(include_strain=True): """ Provide an ordered list of taxonomic ranks. 
""" - for k in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', - 'species']: - yield k + yield from [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] if include_strain: - yield 'strain' + yield "strain" # produce an ordered list of tax names from lineage @@ -67,10 +88,11 @@ def zip_lineage(lineage, include_strain=True, truncate_empty=False): ['a', '', 'c', '', '', '', '', ''] """ - empty = LineagePair(None, '') + empty = LineagePair(None, "") - pairs = zip_longest(taxlist(include_strain=include_strain), - lineage, fillvalue=empty) + pairs = zip_longest( + taxlist(include_strain=include_strain), lineage, fillvalue=empty + ) pairs = list(pairs) # eliminate empty if so requested @@ -85,22 +107,30 @@ def zip_lineage(lineage, include_strain=True, truncate_empty=False): for taxrank, lineage_tup in pairs: # validate non-empty tax, e.g. superkingdom/phylum/class in order. if lineage_tup != empty and lineage_tup.rank != taxrank: - raise ValueError('incomplete lineage at {} - is {} instead'.format(taxrank, lineage_tup.rank)) + raise ValueError( + f"incomplete lineage at {taxrank} - is {lineage_tup.rank} instead" + ) row.append(lineage_tup.name) return row def display_lineage(lineage, include_strain=True, truncate_empty=True): - return ";".join(zip_lineage(lineage, - include_strain=include_strain, - truncate_empty=truncate_empty)) + return ";".join( + zip_lineage( + lineage, include_strain=include_strain, truncate_empty=truncate_empty + ) + ) # filter function toreplace blank/na/null with 'unassigned' -filter_null = lambda x: 'unassigned' if x is None or x.strip() in \ - ('[Blank]', 'na', 'null', '') else x -null_names = set(['[Blank]', 'na', 'null']) +def filter_null(x): + return ( + "unassigned" if x is None or x.strip() in ("[Blank]", "na", "null", "") else x + ) + + +null_names = set(["[Blank]", "na", "null"]) def build_tree(assignments, initial=None): @@ -142,13 +172,13 @@ def find_lca(tree): node = tree lineage = [] while 1: - if len(node) == 1: # descend to only child; track path + if len(node) == 1: # descend to only child; track path lineage_tup = next(iter(node.keys())) lineage.append(lineage_tup) node = node[lineage_tup] - elif len(node) == 0: # at leaf; end + elif len(node) == 0: # at leaf; end return tuple(lineage), 0 - else: # len(node) > 1 => confusion!! + else: # len(node) > 1 => confusion!! return tuple(lineage), len(node) @@ -231,14 +261,14 @@ def pop_to_rank(lin, rank): return tuple(lin) - def make_lineage(lineage): "Turn a ; or ,-separated set of lineages into a tuple of LineagePair objs." 
from sourmash.tax.tax_utils import LineagePair - lin = lineage.split(';') + + lin = lineage.split(";") if len(lin) == 1: - lin = lineage.split(',') - lin = [ LineagePair(rank, n) for (rank, n) in zip(taxlist(), lin) ] + lin = lineage.split(",") + lin = [LineagePair(rank, n) for (rank, n) in zip(taxlist(), lin)] lin = tuple(lin) return lin diff --git a/src/sourmash/logging.py b/src/sourmash/logging.py index 2915c43f78..ad885a7aee 100644 --- a/src/sourmash/logging.py +++ b/src/sourmash/logging.py @@ -3,6 +3,8 @@ _quiet = False _debug = False + + def set_quiet(val, print_debug=False): global _quiet, _debug _quiet = bool(val) @@ -22,10 +24,9 @@ def notify(s, *args, **kwargs): if _quiet: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s.format(*args, **kwargs), file=sys.stderr, - end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s.format(*args, **kwargs), file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() @@ -34,10 +35,9 @@ def debug(s, *args, **kwargs): if _quiet or not _debug: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s.format(*args, **kwargs), file=sys.stderr, - end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s.format(*args, **kwargs), file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() @@ -46,17 +46,17 @@ def debug_literal(s, *args, **kwargs): if _quiet or not _debug: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s, file=sys.stderr, end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s, file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() def error(s, *args, **kwargs): "A simple error logging function => stderr." 
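    # Note: unlike notify() and debug() above, error() does not check the
    # _quiet flag, so messages always reach stderr (see test_error_quiet below).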
- print(u'\r\033[K', end=u'', file=sys.stderr) + print("\r\033[K", end="", file=sys.stderr) print(s.format(*args, **kwargs), file=sys.stderr) - if kwargs.get('flush'): + if kwargs.get("flush"): sys.stderr.flush() @@ -67,13 +67,13 @@ def test_notify(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world') + notify("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world\n' in saveerr.getvalue() + assert "hello, world\n" in saveerr.getvalue() def test_notify_flush(): @@ -83,13 +83,13 @@ def test_notify_flush(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world', flush=True) + notify("hello, world", flush=True) finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() def test_notify_end(): @@ -99,13 +99,13 @@ def test_notify_end(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world', end=u'FOO') + notify("hello, world", end="FOO") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, worldFOO' in saveerr.getvalue() + assert "hello, worldFOO" in saveerr.getvalue() def test_notify_quiet(): @@ -115,13 +115,13 @@ def test_notify_quiet(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = True - notify(u'hello, world') + notify("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' not in saveerr.getvalue() + assert "hello, world" not in saveerr.getvalue() def test_error(): @@ -131,13 +131,13 @@ def test_error(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - error(u'hello, world') + error("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world\n' in saveerr.getvalue() + assert "hello, world\n" in saveerr.getvalue() def test_error_flush(): @@ -147,13 +147,13 @@ def test_error_flush(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - error(u'hello, world', flush=True) + error("hello, world", flush=True) finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() def test_error_quiet(): @@ -164,10 +164,10 @@ def test_error_quiet(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = True - error(u'hello, world') + error("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 466bfa8e7a..2f00f5c382 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -23,11 +23,21 @@ class BaseCollectionManifest: * 'locations()' returns all distinct locations for e.g. lazy loading * supports container protocol for signatures, e.g. 'if ss in manifest: ...' """ + # each manifest row must have the following, although they may be empty. 
- required_keys = ('internal_location', - 'md5', 'md5short', 'ksize', 'moltype', 'num', - 'scaled', 'n_hashes', 'with_abundance', - 'name', 'filename') + required_keys = ( + "internal_location", + "md5", + "md5short", + "ksize", + "moltype", + "num", + "scaled", + "n_hashes", + "with_abundance", + "name", + "filename", + ) @classmethod @abstractmethod @@ -42,12 +52,12 @@ def load_from_filename(cls, filename): return db # not a SQLite db? CTB: fix this to actually try loading this as .gz... - if filename.endswith('.gz'): + if filename.endswith(".gz"): xopen = gzip.open else: xopen = open - with xopen(filename, 'rt', newline="") as fp: + with xopen(filename, "rt", newline="") as fp: return cls.load_from_csv(fp) @classmethod @@ -55,10 +65,10 @@ def load_from_csv(cls, fp): "load a manifest from a CSV file." manifest_list = [] firstline = fp.readline().rstrip() - if not firstline.startswith('# SOURMASH-MANIFEST-VERSION: '): + if not firstline.startswith("# SOURMASH-MANIFEST-VERSION: "): raise ValueError("manifest is missing version header") - version = firstline[len('# SOURMASH-MANIFEST-VERSION: '):] + version = firstline[len("# SOURMASH-MANIFEST-VERSION: ") :] if float(version) != 1.0: raise ValueError(f"unknown manifest version number {version}") @@ -73,15 +83,15 @@ def load_from_csv(cls, fp): row = None # do row type conversion - introws = ('num', 'scaled', 'ksize', 'n_hashes') - boolrows = ('with_abundance',) + introws = ("num", "scaled", "ksize", "n_hashes") + boolrows = ("with_abundance",) for row in r: for k in introws: row[k] = int(row[k]) for k in boolrows: row[k] = bool(ast.literal_eval(str(row[k]))) - row['signature'] = None + row["signature"] = None manifest_list.append(row) return CollectionManifest(manifest_list) @@ -89,69 +99,71 @@ def load_from_csv(cls, fp): @classmethod def load_from_sql(cls, filename): from sourmash.index.sqlite_index import load_sqlite_index + db = load_sqlite_index(filename, request_manifest=True) if db is not None: return db.manifest return None - def write_to_filename(self, filename, *, database_format='csv', - ok_if_exists=False): - if database_format == 'csv': + def write_to_filename(self, filename, *, database_format="csv", ok_if_exists=False): + if database_format == "csv": from .sourmash_args import FileOutputCSV + if ok_if_exists or not os.path.exists(filename): with FileOutputCSV(filename) as fp: return self.write_to_csv(fp, write_header=True) elif os.path.exists(filename) and not ok_if_exists: raise Exception("output manifest already exists") - elif database_format == 'sql': + elif database_format == "sql": from sourmash.index.sqlite_index import SqliteCollectionManifest - SqliteCollectionManifest.load_from_manifest(self, dbfile=filename, - append=ok_if_exists) + + SqliteCollectionManifest.load_from_manifest( + self, dbfile=filename, append=ok_if_exists + ) @classmethod def write_csv_header(cls, fp): "write header for manifest CSV format" - fp.write('# SOURMASH-MANIFEST-VERSION: 1.0\n') + fp.write("# SOURMASH-MANIFEST-VERSION: 1.0\n") w = csv.DictWriter(fp, fieldnames=cls.required_keys) w.writeheader() def write_to_csv(self, fp, write_header=False): "write manifest CSV to specified file handle" - w = csv.DictWriter(fp, fieldnames=self.required_keys, - extrasaction='ignore') + w = csv.DictWriter(fp, fieldnames=self.required_keys, extrasaction="ignore") if write_header: self.write_csv_header(fp) for row in self.rows: # don't write signature! 
- if 'signature' in row: - del row['signature'] + if "signature" in row: + del row["signature"] w.writerow(row) @classmethod def make_manifest_row(cls, ss, location, *, include_signature=True): "make a manifest row dictionary." row = {} - row['md5'] = ss.md5sum() - row['md5short'] = row['md5'][:8] - row['ksize'] = ss.minhash.ksize - row['moltype'] = ss.minhash.moltype - row['num'] = ss.minhash.num - row['scaled'] = ss.minhash.scaled - row['n_hashes'] = len(ss.minhash) - row['with_abundance'] = 1 if ss.minhash.track_abundance else 0 - row['name'] = ss.name - row['filename'] = ss.filename - row['internal_location'] = location + row["md5"] = ss.md5sum() + row["md5short"] = row["md5"][:8] + row["ksize"] = ss.minhash.ksize + row["moltype"] = ss.minhash.moltype + row["num"] = ss.minhash.num + row["scaled"] = ss.minhash.scaled + row["n_hashes"] = len(ss.minhash) + row["with_abundance"] = 1 if ss.minhash.track_abundance else 0 + row["name"] = ss.name + row["filename"] = ss.filename + row["internal_location"] = location assert set(row.keys()) == set(cls.required_keys) # if requested, include the signature in the manifest. if include_signature: - row['signature'] = ss + row["signature"] = ss return row @classmethod @@ -164,8 +176,9 @@ def create_manifest(cls, locations_iter, *, include_signature=True): """ manifest_list = [] for ss, location in locations_iter: - row = cls.make_manifest_row(ss, location, - include_signature=include_signature) + row = cls.make_manifest_row( + ss, location, include_signature=include_signature + ) manifest_list.append(row) return cls(manifest_list) @@ -216,6 +229,7 @@ class CollectionManifest(BaseCollectionManifest): """ An in-memory manifest that simply stores the rows in a list. """ + def __init__(self, rows=[]): "Initialize from an iterable of metadata dictionaries." self.rows = [] @@ -237,7 +251,7 @@ def _add_rows(self, rows): # only iterate once, in case it's a generator for row in rows: self.rows.append(row) - md5set.add(row['md5']) + md5set.add(row["md5"]) def __iadd__(self, other): if self is other: @@ -258,7 +272,7 @@ def __len__(self): def __eq__(self, other): "Check equality on a row-by-row basis. May fail on out-of-order rows." - for (a, b) in itertools.zip_longest(self.rows, other.rows): + for a, b in itertools.zip_longest(self.rows, other.rows): if a is None or b is None: return False @@ -269,41 +283,49 @@ def __eq__(self, other): return True - def _select(self, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): + def _select( + self, + *, + ksize=None, + moltype=None, + scaled=0, + num=0, + containment=False, + abund=None, + picklist=None, + ): """Yield manifest rows for sigs that match the specified requirements. Internal method; call `select_to_manifest` instead. 
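        Example (hypothetical call): `m._select(ksize=31, moltype='DNA')`
        yields only rows where row['ksize'] == 31 and row['moltype'] == 'DNA'.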
""" matching_rows = self.rows if ksize: - matching_rows = ( row for row in matching_rows - if row['ksize'] == ksize ) + matching_rows = (row for row in matching_rows if row["ksize"] == ksize) if moltype: - matching_rows = ( row for row in matching_rows - if row['moltype'] == moltype ) + matching_rows = (row for row in matching_rows if row["moltype"] == moltype) if scaled or containment: if containment and not scaled: raise ValueError("'containment' requires 'scaled' in Index.select'") - matching_rows = ( row for row in matching_rows - if row['scaled'] and not row['num'] ) + matching_rows = ( + row for row in matching_rows if row["scaled"] and not row["num"] + ) if num: - matching_rows = ( row for row in matching_rows - if row['num'] and not row['scaled'] ) + matching_rows = ( + row for row in matching_rows if row["num"] and not row["scaled"] + ) if abund: # only need to concern ourselves if abundance is _required_ - matching_rows = ( row for row in matching_rows - if row['with_abundance'] ) + matching_rows = (row for row in matching_rows if row["with_abundance"]) if picklist: - matching_rows = ( row for row in matching_rows - if picklist.matches_manifest_row(row) ) + matching_rows = ( + row for row in matching_rows if picklist.matches_manifest_row(row) + ) # return only the internal filenames! - for row in matching_rows: - yield row + yield from matching_rows def select_to_manifest(self, **kwargs): "Do a 'select' and return a new CollectionManifest object." @@ -312,22 +334,24 @@ def select_to_manifest(self, **kwargs): def filter_rows(self, row_filter_fn): "Create a new manifest filtered through row_filter_fn." - new_rows = [ row for row in self.rows if row_filter_fn(row) ] + new_rows = [row for row in self.rows if row_filter_fn(row)] return CollectionManifest(new_rows) def filter_on_columns(self, col_filter_fn, col_names): "Create a new manifest based on column matches." + def row_filter_fn(row): - x = [ row[col] for col in col_names if row[col] is not None ] + x = [row[col] for col in col_names if row[col] is not None] return col_filter_fn(x) + return self.filter_rows(row_filter_fn) def locations(self): "Return all distinct locations." seen = set() for row in self.rows: - loc = row['internal_location'] + loc = row["internal_location"] # track/remove duplicates if loc not in seen: @@ -341,8 +365,8 @@ def __contains__(self, ss): def to_picklist(self): "Convert this manifest to a picklist." - pl = picklist.SignaturePicklist('manifest') + pl = picklist.SignaturePicklist("manifest") - pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + pl.pickset = {pl._get_value_for_manifest_row(row) for row in self.rows} return pl diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 360ca6165b..ffa879b64d 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -1,22 +1,26 @@ -# -*- coding: utf-8 -*- """ sourmash submodule that provides MinHash class and utility functions. class MinHash - core MinHash class. class FrozenMinHash - read-only MinHash class. 
""" -from __future__ import unicode_literals, division -from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_exact_prob +from .distance_utils import ( + jaccard_to_distance, + containment_to_distance, + set_size_exact_prob, +) from .logging import notify import numpy as np -__all__ = ['get_minhash_default_seed', - 'get_minhash_max_hash', - 'hash_murmur', - 'MinHash', - 'FrozenMinHash'] +__all__ = [ + "get_minhash_default_seed", + "get_minhash_max_hash", + "hash_murmur", + "MinHash", + "FrozenMinHash", +] from collections.abc import Mapping @@ -52,20 +56,14 @@ def _get_max_hash_for_scaled(scaled): elif scaled == 1: return get_minhash_max_hash() - return min( - int(round(get_minhash_max_hash() / scaled, 0)), - MINHASH_MAX_HASH - ) + return min(int(round(get_minhash_max_hash() / scaled, 0)), MINHASH_MAX_HASH) def _get_scaled_for_max_hash(max_hash): "Convert a 'max_hash' value into a 'scaled' value." if max_hash == 0: return 0 - return min( - int(round(get_minhash_max_hash() / max_hash, 0)), - MINHASH_MAX_HASH - ) + return min(int(round(get_minhash_max_hash() / max_hash, 0)), MINHASH_MAX_HASH) def to_bytes(s): @@ -75,7 +73,7 @@ def to_bytes(s): if isinstance(s, bytes): return s - if not isinstance(s, (str, bytes, int)): + if not isinstance(s, str | bytes | int): raise TypeError("Requires a string-like sequence") if isinstance(s, str): @@ -97,8 +95,7 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): def translate_codon(codon): "Translate a codon into an amino acid." try: - return rustcall(lib.sourmash_translate_codon, - to_bytes(codon)).decode('utf-8') + return rustcall(lib.sourmash_translate_codon, to_bytes(codon)).decode("utf-8") except SourmashError as e: raise ValueError(e.message) @@ -106,7 +103,7 @@ def translate_codon(codon): def flatten_and_downsample_scaled(mh, *scaled_vals): "Flatten MinHash object and downsample to max of scaled values." assert mh.scaled - assert all( (x > 0 for x in scaled_vals) ) + assert all(x > 0 for x in scaled_vals) mh = mh.flatten() scaled = max(scaled_vals) @@ -118,7 +115,7 @@ def flatten_and_downsample_scaled(mh, *scaled_vals): def flatten_and_downsample_num(mh, *num_vals): "Flatten MinHash object and downsample to min of num values." assert mh.num - assert all( (x > 0 for x in num_vals) ) + assert all(x > 0 for x in num_vals) mh = mh.flatten() num = min(num_vals) @@ -138,6 +135,7 @@ def flatten_and_intersect_scaled(mh1, mh2): class _HashesWrapper(Mapping): "A read-only view of the hashes contained by a MinHash object." + def __init__(self, h): self._data = h @@ -186,6 +184,7 @@ class MinHash(RustObject): >>> round(mh1.similarity(mh2), 2) 0.85 """ + __dealloc_func__ = lib.kmerminhash_free def __init__( @@ -236,13 +235,13 @@ def __init__( if dayhoff: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF - ksize = ksize*3 + ksize = ksize * 3 elif hp: hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP - ksize = ksize*3 + ksize = ksize * 3 elif is_protein: hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN - ksize = ksize*3 + ksize = ksize * 3 else: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA @@ -281,7 +280,7 @@ def __getstate__(self): # get a ksize that makes sense to the Rust layer. See #2262. 
return ( self.num, - self.ksize if self.is_dna else self.ksize*3, + self.ksize if self.is_dna else self.ksize * 3, self.is_protein, self.dayhoff, self.hp, @@ -294,16 +293,29 @@ def __getstate__(self): def __setstate__(self, tup): "support pickling via __getstate__/__setstate__" - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, - max_hash, seed) = tup + ( + n, + ksize, + is_protein, + dayhoff, + hp, + mins, + _, + track_abundance, + max_hash, + seed, + ) = tup self.__del__() hash_function = ( - lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else - lib.HASH_FUNCTIONS_MURMUR64_HP if hp else - lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else - lib.HASH_FUNCTIONS_MURMUR64_DNA + lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + if dayhoff + else lib.HASH_FUNCTIONS_MURMUR64_HP + if hp + else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_DNA ) scaled = _get_scaled_for_max_hash(max_hash) @@ -335,10 +347,11 @@ def copy_and_clear(self): def add_sequence(self, sequence, force=False): "Add a sequence into the sketch." - self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), - force) + self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) - def seq_to_hashes(self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_protein=False): + def seq_to_hashes( + self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_protein=False + ): """Convert sequence to hashes without adding to the sketch. If input sequence is DNA and this is a protein, dayhoff, or hp @@ -354,10 +367,20 @@ def seq_to_hashes(self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_ raise ValueError("cannot add protein sequence to DNA MinHash") if bad_kmers_as_zeroes and not force: - raise ValueError("cannot represent invalid kmers as 0 while force is not set to True") + raise ValueError( + "cannot represent invalid kmers as 0 while force is not set to True" + ) size = ffi.new("uintptr_t *") - hashes_ptr = self._methodcall(lib.kmerminhash_seq_to_hashes, to_bytes(sequence), len(sequence), force, bad_kmers_as_zeroes, is_protein, size) + hashes_ptr = self._methodcall( + lib.kmerminhash_seq_to_hashes, + to_bytes(sequence), + len(sequence), + force, + bad_kmers_as_zeroes, + is_protein, + size, + ) size = size[0] try: @@ -384,21 +407,24 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): bad_kmers_as_zeroes = True sequence = sequence.upper() - hashvals = self.seq_to_hashes(sequence, - force=force, is_protein=is_protein, - bad_kmers_as_zeroes=bad_kmers_as_zeroes) + hashvals = self.seq_to_hashes( + sequence, + force=force, + is_protein=is_protein, + bad_kmers_as_zeroes=bad_kmers_as_zeroes, + ) if bad_kmers_as_zeroes: - hashvals = [ None if h == 0 else h for h in hashvals ] + hashvals = [None if h == 0 else h for h in hashvals] ksize = self.ksize translate = False - if self.moltype == 'DNA': + if self.moltype == "DNA": pass elif is_protein: pass - else: # translate input DNA sequence => aa - assert self.moltype in ('protein', 'dayhoff', 'hp') + else: # translate input DNA sequence => aa + assert self.moltype in ("protein", "dayhoff", "hp") translate = True ksize = self.ksize * 3 @@ -415,13 +441,13 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): for frame in (0, 1, 2): # get forward k-mers for start in range(0, len(sequence) - ksize + 1 - frame, 3): - kmer = sequence[start + frame:start + frame + ksize] + kmer = sequence[start + frame : start + frame + ksize] yield kmer, hashvals[hash_i] hash_i += 1 # get rc 
k-mers for start in range(0, len(seqrc) - ksize + 1 - frame, 3): - kmer = seqrc[start + frame:start + frame + ksize] + kmer = seqrc[start + frame : start + frame + ksize] yield kmer, hashvals[hash_i] hash_i += 1 else: @@ -429,17 +455,17 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): n_kmers = len(sequence) - ksize + 1 assert n_kmers == len(hashvals) for i, hashval in zip(range(0, n_kmers), hashvals): - kmer = sequence[i:i+ksize] + kmer = sequence[i : i + ksize] yield kmer, hashval def add_kmer(self, kmer): "Add a kmer into the sketch." if self.is_dna: if len(kmer) != self.ksize: - raise ValueError("kmer to add is not {} in length".format(self.ksize)) + raise ValueError(f"kmer to add is not {self.ksize} in length") else: - if len(kmer) != self.ksize*3: - raise ValueError("kmer to add is not {} in length".format(self.ksize*3)) + if len(kmer) != self.ksize * 3: + raise ValueError(f"kmer to add is not {self.ksize * 3} in length") self.add_sequence(kmer) def add_many(self, hashes): @@ -468,9 +494,12 @@ def __len__(self): "Number of hashes." return self._methodcall(lib.kmerminhash_get_mins_size) - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use .hashes property instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use .hashes property instead.", + ) def get_mins(self, with_abundance=False): """Return list of hashes or if ``with_abundance`` a list of (hash, abund). @@ -480,10 +509,12 @@ def get_mins(self, with_abundance=False): return mins.keys() return mins - - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use .hashes property instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use .hashes property instead.", + ) def get_hashes(self): "Return the list of hashes." 
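        # deprecated shim -- prefer iterating over the .hashes mapping,
        # e.g. `list(mh.hashes)`, in new code.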
return self.hashes.keys() @@ -500,17 +531,18 @@ def hashes(self): abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds, size_abunds) size_abunds = size_abunds[0] assert size == size_abunds - result = dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + result = dict( + zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size)) + ) lib.kmerminhash_slice_free(abunds_ptr, size) return _HashesWrapper(result) else: d = ffi.unpack(mins_ptr, size) - return _HashesWrapper({ k : 1 for k in d }) + return _HashesWrapper({k: 1 for k in d}) finally: lib.kmerminhash_slice_free(mins_ptr, size) - @property def seed(self): return self._methodcall(lib.kmerminhash_seed) @@ -551,9 +583,12 @@ def ksize(self): return k @property - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use scaled instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use scaled instead.", + ) def max_hash(self): return self._methodcall(lib.kmerminhash_max_hash) @@ -574,7 +609,9 @@ def track_abundance(self, b): if b is False: self._methodcall(lib.kmerminhash_disable_abundance) elif len(self) > 0: - raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") + raise RuntimeError( + "Can only set track_abundance=True if the MinHash is empty" + ) else: self._methodcall(lib.kmerminhash_enable_abundance) @@ -604,7 +641,9 @@ def count_common(self, other, downsample=False): """ if not isinstance(other, MinHash): raise TypeError("Must be a MinHash!") - return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) + return self._methodcall( + lib.kmerminhash_count_common, other._get_objptr(), downsample + ) def intersection_and_union_size(self, other): "Calculate intersection and union sizes between `self` and `other`." @@ -614,8 +653,9 @@ def intersection_and_union_size(self, other): raise TypeError("incompatible MinHash objects") usize = ffi.new("uint64_t *") - common = self._methodcall(lib.kmerminhash_intersection_union_size, - other._get_objptr(), usize) + common = self._methodcall( + lib.kmerminhash_intersection_union_size, other._get_objptr(), usize + ) usize = ffi.unpack(usize, 1)[0] return common, usize @@ -628,11 +668,11 @@ def downsample(self, *, num=None, scaled=None): # at least one must be specified! if num is None and scaled is None: - raise ValueError('must specify either num or scaled to downsample') + raise ValueError("must specify either num or scaled to downsample") # both cannot be specified if num is not None and scaled is not None: - raise ValueError('cannot specify both num and scaled') + raise ValueError("cannot specify both num and scaled") if num is not None: # cannot downsample a scaled MinHash with num: @@ -644,13 +684,15 @@ def downsample(self, *, num=None, scaled=None): # acceptable num value? make sure to set max_hash to 0. max_hash = 0 - + elif scaled is not None: # cannot downsample a num MinHash with scaled if self.num: raise ValueError("cannot downsample a num MinHash using scaled") if self.scaled > scaled: - raise ValueError(f"new scaled {scaled} is lower than current sample scaled {self.scaled}") + raise ValueError( + f"new scaled {scaled} is lower than current sample scaled {self.scaled}" + ) # acceptable scaled value? reconfigure max_hash, keep num 0. max_hash = _get_max_hash_for_scaled(scaled) @@ -658,10 +700,14 @@ def downsample(self, *, num=None, scaled=None): # end checks! 
create new object: a = MinHash( - num, self.ksize, - is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, - track_abundance=self.track_abundance, seed=self.seed, - max_hash=max_hash + num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=self.track_abundance, + seed=self.seed, + max_hash=max_hash, ) # copy over hashes: if self.track_abundance: @@ -676,9 +722,14 @@ def flatten(self): if self.track_abundance: # create new object: a = MinHash( - self.num, self.ksize, - is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, - track_abundance=False, seed=self.seed, max_hash=self._max_hash + self.num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=False, + seed=self.seed, + max_hash=self._max_hash, ) a.add_many(self) @@ -688,11 +739,21 @@ def flatten(self): def jaccard(self, other, downsample=False): "Calculate Jaccard similarity of two MinHash objects." if self.num != other.num: - err = "must have same num: {} != {}".format(self.num, other.num) + err = f"must have same num: {self.num} != {other.num}" raise TypeError(err) - return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample) + return self._methodcall( + lib.kmerminhash_similarity, other._get_objptr(), True, downsample + ) - def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1e-3, err_threshold=1e-4): + def jaccard_ani( + self, + other, + *, + downsample=False, + jaccard=None, + prob_threshold=1e-3, + err_threshold=1e-4, + ): "Use jaccard to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -705,12 +766,18 @@ def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1 other_mh = other.downsample(scaled=scaled) if jaccard is None: jaccard = self_mh.similarity(other_mh, ignore_abundance=True) - avg_sketch_kmers = (len(self_mh) + len(other_mh))/2 - avg_n_kmers = round(avg_sketch_kmers * scaled) # would be better if hll estimate - see #1798 - j_aniresult = jaccard_to_distance(jaccard, self_mh.ksize, scaled, - n_unique_kmers=avg_n_kmers, - prob_threshold = prob_threshold, - err_threshold = err_threshold) + avg_sketch_kmers = (len(self_mh) + len(other_mh)) / 2 + avg_n_kmers = round( + avg_sketch_kmers * scaled + ) # would be better if hll estimate - see #1798 + j_aniresult = jaccard_to_distance( + jaccard, + self_mh.ksize, + scaled, + n_unique_kmers=avg_n_kmers, + prob_threshold=prob_threshold, + err_threshold=err_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): j_aniresult.size_is_inaccurate = True @@ -730,16 +797,20 @@ def similarity(self, other, ignore_abundance=False, downsample=False): See https://en.wikipedia.org/wiki/Cosine_similarity """ - return self._methodcall(lib.kmerminhash_similarity, - other._get_objptr(), - ignore_abundance, downsample) + return self._methodcall( + lib.kmerminhash_similarity, + other._get_objptr(), + ignore_abundance, + downsample, + ) def angular_similarity(self, other): "Calculate the angular similarity." 
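        # Both sketches must track abundance; an illustrative sketch:
        #   a = MinHash(n=0, ksize=31, scaled=1000, track_abundance=True)
        #   a.angular_similarity(b)  # cosine similarity over abundance vectors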
if not (self.track_abundance and other.track_abundance): - raise TypeError("Error: Angular (cosine) similarity requires both sketches to track hash abundance.") - return self._methodcall(lib.kmerminhash_angular_similarity, - other._get_objptr()) + raise TypeError( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + ) + return self._methodcall(lib.kmerminhash_angular_similarity, other._get_objptr()) def is_compatible(self, other): return self._methodcall(lib.kmerminhash_is_compatible, other._get_objptr()) @@ -749,12 +820,16 @@ def contained_by(self, other, downsample=False): Calculate how much of self is contained by other. """ if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) denom = len(self) if not denom: return 0.0 - total_denom = float(denom * self.scaled) # would be better if hll estimate - see #1798 - bias_factor = 1.0 - (1.0 - 1.0/self.scaled) ** total_denom + total_denom = float( + denom * self.scaled + ) # would be better if hll estimate - see #1798 + bias_factor = 1.0 - (1.0 - 1.0 / self.scaled) ** total_denom containment = self.count_common(other, downsample) / (denom * bias_factor) # debiasing containment can lead to vals outside of 0-1 range. constrain. if containment >= 1: @@ -764,8 +839,16 @@ def contained_by(self, other, downsample=False): else: return containment - - def containment_ani(self, other, *, downsample=False, containment=None, confidence=0.95, estimate_ci = False, prob_threshold=1e-3): + def containment_ani( + self, + other, + *, + downsample=False, + containment=None, + confidence=0.95, + estimate_ci=False, + prob_threshold=1e-3, + ): "Use self contained by other to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -778,11 +861,17 @@ def containment_ani(self, other, *, downsample=False, containment=None, confiden other_mh = other.downsample(scaled=scaled) if containment is None: containment = self_mh.contained_by(other_mh) - n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798 - - c_aniresult = containment_to_distance(containment, self_mh.ksize, self_mh.scaled, - n_unique_kmers=n_kmers, confidence=confidence, - estimate_ci = estimate_ci, prob_threshold=prob_threshold) + n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798 + + c_aniresult = containment_to_distance( + containment, + self_mh.ksize, + self_mh.scaled, + n_unique_kmers=n_kmers, + confidence=confidence, + estimate_ci=estimate_ci, + prob_threshold=prob_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): c_aniresult.size_is_inaccurate = True @@ -793,13 +882,19 @@ def max_containment(self, other, downsample=False): Calculate maximum containment. 
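        In sketch form: count_common(other) / min(len(self), len(other)),
        with the same scaled-based bias correction used in contained_by().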
""" if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) min_denom = min((len(self), len(other))) if not min_denom: return 0.0 - total_denom = float(min_denom * self.scaled) # would be better if hll estimate - see #1798 - bias_factor = 1.0 - (1.0 - 1.0/self.scaled) ** total_denom - max_containment = self.count_common(other, downsample) / (min_denom * bias_factor) + total_denom = float( + min_denom * self.scaled + ) # would be better if hll estimate - see #1798 + bias_factor = 1.0 - (1.0 - 1.0 / self.scaled) ** total_denom + max_containment = self.count_common(other, downsample) / ( + min_denom * bias_factor + ) # debiasing containment can lead to vals outside of 0-1 range. constrain. if max_containment >= 1: return 1.0 @@ -808,7 +903,16 @@ def max_containment(self, other, downsample=False): else: return max_containment - def max_containment_ani(self, other, *, downsample=False, max_containment=None, confidence=0.95, estimate_ci=False, prob_threshold=1e-3): + def max_containment_ani( + self, + other, + *, + downsample=False, + max_containment=None, + confidence=0.95, + estimate_ci=False, + prob_threshold=1e-3, + ): "Use max_containment to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -824,9 +928,15 @@ def max_containment_ani(self, other, *, downsample=False, max_containment=None, min_n_kmers = min(len(self_mh), len(other_mh)) n_kmers = min_n_kmers * scaled # would be better if hll estimate - see #1798 - c_aniresult = containment_to_distance(max_containment, self_mh.ksize, scaled, - n_unique_kmers=n_kmers,confidence=confidence, - estimate_ci = estimate_ci, prob_threshold=prob_threshold) + c_aniresult = containment_to_distance( + max_containment, + self_mh.ksize, + scaled, + n_unique_kmers=n_kmers, + confidence=confidence, + estimate_ci=estimate_ci, + prob_threshold=prob_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): c_aniresult.size_is_inaccurate = True @@ -838,12 +948,14 @@ def avg_containment(self, other, *, downsample=False): Note: this is average of the containments, *not* count_common/ avg_denom """ if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) c1 = self.contained_by(other, downsample) c2 = other.contained_by(self, downsample) - return (c1 + c2)/2 + return (c1 + c2) / 2 def avg_containment_ani(self, other, *, downsample=False, prob_threshold=1e-3): """ @@ -852,11 +964,15 @@ def avg_containment_ani(self, other, *, downsample=False, prob_threshold=1e-3): """ if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") - a1 = self.containment_ani(other, downsample=downsample, prob_threshold=prob_threshold).ani - a2 = other.containment_ani(self, downsample=downsample, prob_threshold=prob_threshold).ani + a1 = self.containment_ani( + other, downsample=downsample, prob_threshold=prob_threshold + ).ani + a2 = other.containment_ani( + self, downsample=downsample, prob_threshold=prob_threshold + ).ani if any([a1 is None, a2 is None]): return None - return (a1 + a2)/2 + return (a1 + a2) / 2 def __add__(self, other): if not isinstance(other, 
MinHash): @@ -864,11 +980,14 @@ def __add__(self, other): if self.num and other.num: if self.num != other.num: - raise TypeError(f"incompatible num values: self={self.num} other={other.num}") + raise TypeError( + f"incompatible num values: self={self.num} other={other.num}" + ) new_obj = self.to_mutable() new_obj += other return new_obj + __or__ = __add__ def __iadd__(self, other): @@ -890,6 +1009,7 @@ def intersection(self, other): ptr = self._methodcall(lib.kmerminhash_intersection, other._get_objptr()) return MinHash._from_objptr(ptr) + __and__ = intersection def set_abundances(self, values, clear=True): @@ -904,12 +1024,14 @@ def set_abundances(self, values, clear=True): abunds = [] for h, v in values.items(): - hashes.append(h) + hashes.append(h) if v < 0: raise ValueError("Abundance cannot be set to a negative value.") abunds.append(v) - self._methodcall(lib.kmerminhash_set_abundances, hashes, abunds, len(hashes), clear) + self._methodcall( + lib.kmerminhash_set_abundances, hashes, abunds, len(hashes), clear + ) else: raise RuntimeError( "Use track_abundance=True when constructing " @@ -921,15 +1043,15 @@ def add_protein(self, sequence): self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence)) @property - def moltype(self): # TODO: test in minhash tests + def moltype(self): # TODO: test in minhash tests if self.is_protein: - return 'protein' + return "protein" elif self.dayhoff: - return 'dayhoff' + return "dayhoff" elif self.hp: - return 'hp' + return "hp" else: - return 'DNA' + return "DNA" def to_mutable(self): "Return a copy of this MinHash that can be changed." @@ -954,7 +1076,7 @@ def inflate(self, from_mh): """ if not self.track_abundance and from_mh.track_abundance: orig_abunds = from_mh.hashes - abunds = { h: orig_abunds.get(h, 0) for h in self.hashes } + abunds = {h: orig_abunds.get(h, 0) for h in self.hashes} abund_mh = from_mh.copy_and_clear() @@ -963,7 +1085,9 @@ def inflate(self, from_mh): return abund_mh else: - raise ValueError("inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True") + raise ValueError( + "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" + ) @property def sum_abundances(self): @@ -995,9 +1119,11 @@ def unique_dataset_hashes(self): Approximate total number of hashes (num_hashes *scaled). """ if not self.scaled: - raise TypeError("can only approximate unique_dataset_hashes for scaled MinHashes") + raise TypeError( + "can only approximate unique_dataset_hashes for scaled MinHashes" + ) # TODO: replace set_size with HLL estimate when that gets implemented - return len(self) * self.scaled # + (self.ksize - 1) for bp estimation + return len(self) * self.scaled # + (self.ksize - 1) for bp estimation def size_is_accurate(self, relative_error=0.20, confidence=0.95): """ @@ -1008,41 +1134,47 @@ def size_is_accurate(self, relative_error=0.20, confidence=0.95): Returns True if probability is greater than or equal to the desired confidence. 
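        Example (hypothetical values): mh.size_is_accurate(relative_error=0.1,
        confidence=0.9) asks whether the size estimate is within 10% of the
        true value with probability >= 0.9.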
""" if not self.scaled: - raise TypeError("Error: can only estimate dataset size for scaled MinHashes") + raise TypeError( + "Error: can only estimate dataset size for scaled MinHashes" + ) if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]): - raise ValueError("Error: relative error and confidence values must be between 0 and 1.") - # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented - probability = set_size_exact_prob(self.unique_dataset_hashes, self.scaled, relative_error=relative_error) + raise ValueError( + "Error: relative error and confidence values must be between 0 and 1." + ) + # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented + probability = set_size_exact_prob( + self.unique_dataset_hashes, self.scaled, relative_error=relative_error + ) return probability >= confidence class FrozenMinHash(MinHash): def add_sequence(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_kmer(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_many(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def remove_many(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_hash(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_hash_with_abundance(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def clear(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def set_abundances(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_protein(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def downsample(self, *, num=None, scaled=None): if scaled and self.scaled == scaled: @@ -1062,10 +1194,10 @@ def flatten(self): return flat_mh def __iadd__(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def merge(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def to_mutable(self): "Return a copy of this MinHash that can be changed." 
@@ -1085,16 +1217,29 @@ def into_frozen(self): def __setstate__(self, tup): "support pickling via __getstate__/__setstate__" - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, - max_hash, seed) = tup + ( + n, + ksize, + is_protein, + dayhoff, + hp, + mins, + _, + track_abundance, + max_hash, + seed, + ) = tup self.__del__() hash_function = ( - lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else - lib.HASH_FUNCTIONS_MURMUR64_HP if hp else - lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else - lib.HASH_FUNCTIONS_MURMUR64_DNA + lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + if dayhoff + else lib.HASH_FUNCTIONS_MURMUR64_HP + if hp + else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_DNA ) scaled = _get_scaled_for_max_hash(max_hash) @@ -1108,4 +1253,5 @@ def __setstate__(self, tup): def __copy__(self): return self + copy = __copy__ diff --git a/src/sourmash/nodegraph.py b/src/sourmash/nodegraph.py index 8faa2eb874..3204e11b7e 100644 --- a/src/sourmash/nodegraph.py +++ b/src/sourmash/nodegraph.py @@ -88,6 +88,7 @@ def matches(self, mh): def to_khmer_nodegraph(self): import khmer + try: load_nodegraph = khmer.load_nodegraph except AttributeError: @@ -117,41 +118,44 @@ def extract_nodegraph_info(filename): ht_type = None occupied = None - uint_size = len(pack('I', 0)) - uchar_size = len(pack('B', 0)) - ulonglong_size = len(pack('Q', 0)) + uint_size = len(pack("I", 0)) + uchar_size = len(pack("B", 0)) + ulonglong_size = len(pack("Q", 0)) try: - with open(filename, 'rb') as nodegraph: - signature, = unpack('4s', nodegraph.read(4)) - version, = unpack('B', nodegraph.read(1)) - ht_type, = unpack('B', nodegraph.read(1)) - ksize, = unpack('I', nodegraph.read(uint_size)) - n_tables, = unpack('B', nodegraph.read(uchar_size)) - occupied, = unpack('Q', nodegraph.read(ulonglong_size)) - table_size, = unpack('Q', nodegraph.read(ulonglong_size)) + with open(filename, "rb") as nodegraph: + (signature,) = unpack("4s", nodegraph.read(4)) + (version,) = unpack("B", nodegraph.read(1)) + (ht_type,) = unpack("B", nodegraph.read(1)) + (ksize,) = unpack("I", nodegraph.read(uint_size)) + (n_tables,) = unpack("B", nodegraph.read(uchar_size)) + (occupied,) = unpack("Q", nodegraph.read(ulonglong_size)) + (table_size,) = unpack("Q", nodegraph.read(ulonglong_size)) if signature != b"OXLI": - raise ValueError("Node graph '{}' is missing file type " - "signature".format(filename) + str(signature)) + raise ValueError( + f"Node graph '{filename}' is missing file type " + "signature" + str(signature) + ) except: - raise ValueError("Node graph '{}' is corrupt ".format(filename)) + raise ValueError(f"Node graph '{filename}' is corrupt ") return ksize, round(table_size, -2), n_tables, version, ht_type, occupied -def calc_expected_collisions(graph, force=False, max_false_pos=.2): +def calc_expected_collisions(graph, force=False, max_false_pos=0.2): fp_all = graph.expected_collisions if fp_all > max_false_pos: print("**", file=sys.stderr) - print("** ERROR: the graph structure is too small for ", - file=sys.stderr) - print("** this data set. Increase data structure size.", - file=sys.stderr) + print("** ERROR: the graph structure is too small for ", file=sys.stderr) + print("** this data set. 
Increase data structure size.", file=sys.stderr) print("** Do not use these results!!", file=sys.stderr) print("**", file=sys.stderr) - print("** (estimated false positive rate of %.3f;" % fp_all, - file=sys.stderr, end=' ') + print( + "** (estimated false positive rate of %.3f;" % fp_all, + file=sys.stderr, + end=" ", + ) print("max recommended %.3f)" % max_false_pos, file=sys.stderr) print("**", file=sys.stderr) diff --git a/src/sourmash/np_utils.py b/src/sourmash/np_utils.py index 683f0be6f6..5c69a0bd5d 100644 --- a/src/sourmash/np_utils.py +++ b/src/sourmash/np_utils.py @@ -12,9 +12,11 @@ def to_memmap(array): """ import numpy as np - filename = tempfile.NamedTemporaryFile(prefix="array", suffix=".mmap", delete=False).name + filename = tempfile.NamedTemporaryFile( + prefix="array", suffix=".mmap", delete=False + ).name shape = array.shape - f = np.memmap(filename, mode='w+', shape=shape, dtype=array.dtype) + f = np.memmap(filename, mode="w+", shape=shape, dtype=array.dtype) f[:] = array[:] del f large_memmap = np.memmap(filename, dtype=array.dtype, shape=shape) diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index 8f43aca739..8a5652eb1a 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -17,29 +17,32 @@ preprocess = {} # exact matches -preprocess['name'] = lambda x: x -preprocess['md5'] = lambda x: x +preprocess["name"] = lambda x: x +preprocess["md5"] = lambda x: x # identifier matches/prefix foo - space delimited identifiers -preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0] -preprocess['ident'] = lambda x: x.split(' ')[0] +preprocess["identprefix"] = lambda x: x.split(" ")[0].split(".")[0] +preprocess["ident"] = lambda x: x.split(" ")[0] # match 8 characters -preprocess['md5prefix8'] = lambda x: x[:8] -preprocess['md5short'] = lambda x: x[:8] +preprocess["md5prefix8"] = lambda x: x[:8] +preprocess["md5short"] = lambda x: x[:8] + # all meta-coltypes use the same preprocessing of tuple => (ident, md5short) def combine_ident_md5(x): "preprocess (name, md5) tup into (ident, md5short) tup" name, md5 = x - ident = name.split(' ')[0] + ident = name.split(" ")[0] md5 = md5[:8] return (ident, md5) -preprocess['manifest'] = combine_ident_md5 -preprocess['prefetch'] = combine_ident_md5 -preprocess['gather'] = combine_ident_md5 -preprocess['search'] = combine_ident_md5 + + +preprocess["manifest"] = combine_ident_md5 +preprocess["prefetch"] = combine_ident_md5 +preprocess["gather"] = combine_ident_md5 +preprocess["search"] = combine_ident_md5 class PickStyle(Enum): @@ -74,12 +77,20 @@ class SignaturePicklist: blank in this case: e.g. use 'pickfile.csv::gather'. These "meta-coltypes" use composite selection on (ident, md5short) tuples. """ - meta_coltypes = ('manifest', 'gather', 'prefetch', 'search') - supported_coltypes = ('md5', 'md5prefix8', 'md5short', - 'name', 'ident', 'identprefix') - def __init__(self, coltype, *, pickfile=None, column_name=None, - pickstyle=PickStyle.INCLUDE): + meta_coltypes = ("manifest", "gather", "prefetch", "search") + supported_coltypes = ( + "md5", + "md5prefix8", + "md5short", + "name", + "ident", + "identprefix", + ) + + def __init__( + self, coltype, *, pickfile=None, column_name=None, pickstyle=PickStyle.INCLUDE + ): "create a picklist of column type 'coltype'." # first, check coltype... 
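        # Illustrative construction (hypothetical file name):
        #   SignaturePicklist('ident', pickfile='keep.csv', column_name='ident')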
@@ -96,10 +107,10 @@ def __init__(self, coltype, *, pickfile=None, column_name=None, if column_name: raise ValueError(f"no column name allowed for coltype '{coltype}'") - if coltype == 'prefetch': - column_name = '(match_name, match_md5)' + if coltype == "prefetch": + column_name = "(match_name, match_md5)" else: - column_name = '(name, md5)' + column_name = "(name, md5)" self.coltype = coltype self.pickfile = pickfile @@ -114,18 +125,20 @@ def __init__(self, coltype, *, pickfile=None, column_name=None, @classmethod def from_picklist_args(cls, argstr): "load a picklist from an argument string 'pickfile:col:coltype:style'" - picklist = argstr.split(':') + picklist = argstr.split(":") pickstyle = PickStyle.INCLUDE # pickstyle specified? if len(picklist) == 4: pickstyle_str = picklist.pop() - if pickstyle_str == 'include': + if pickstyle_str == "include": pickstyle = PickStyle.INCLUDE - elif pickstyle_str == 'exclude': + elif pickstyle_str == "exclude": pickstyle = PickStyle.EXCLUDE else: - raise ValueError(f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'") + raise ValueError( + f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'" + ) if len(picklist) != 3: raise ValueError(f"invalid picklist argument '{argstr}'") @@ -133,36 +146,39 @@ def from_picklist_args(cls, argstr): assert len(picklist) == 3 pickfile, column, coltype = picklist - return cls(coltype, pickfile=pickfile, column_name=column, - pickstyle=pickstyle) + return cls(coltype, pickfile=pickfile, column_name=column, pickstyle=pickstyle) def _get_sig_attribute(self, ss): "for a given SourmashSignature, return relevant picklist value." coltype = self.coltype - if coltype in self.meta_coltypes: # gather, prefetch, search, manifest + if coltype in self.meta_coltypes: # gather, prefetch, search, manifest q = (ss.name, ss.md5sum()) - elif coltype in ('md5', 'md5prefix8', 'md5short'): + elif coltype in ("md5", "md5prefix8", "md5short"): q = ss.md5sum() - elif coltype in ('name', 'ident', 'identprefix'): + elif coltype in ("name", "ident", "identprefix"): q = ss.name else: - raise ValueError(f"picklist get_sig_attribute {coltype} has unhandled branch") + raise ValueError( + f"picklist get_sig_attribute {coltype} has unhandled branch" + ) return q def _get_value_for_manifest_row(self, row): "return the picklist value from a manifest row" - if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest - q = (row['name'], row['md5']) + if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest + q = (row["name"], row["md5"]) else: - if self.coltype == 'md5': - colkey = 'md5' - elif self.coltype in ('md5prefix8', 'md5short'): - colkey = 'md5short' - elif self.coltype in ('name', 'ident', 'identprefix'): - colkey = 'name' + if self.coltype == "md5": + colkey = "md5" + elif self.coltype in ("md5prefix8", "md5short"): + colkey = "md5short" + elif self.coltype in ("name", "ident", "identprefix"): + colkey = "name" else: - raise ValueError(f"picklist get_value_for_row {colkey} has unhandled branch") + raise ValueError( + f"picklist get_value_for_row {colkey} has unhandled branch" + ) q = row.get(colkey) @@ -175,12 +191,12 @@ def _get_value_for_csv_row(self, row): "return the picklist value from a CSV pickfile row - supplied by user, typically" # customize for each type of meta_coltypes - if self.coltype == 'manifest': - q = (row['name'], row['md5']) - elif self.coltype == 'prefetch': - q = (row['match_name'], row['match_md5']) - elif 
self.coltype in ('gather', 'search'): - q = (row['name'], row['md5']) + if self.coltype == "manifest": + q = (row["name"], row["md5"]) + elif self.coltype == "prefetch": + q = (row["match_name"], row["match_md5"]) + elif self.coltype in ("gather", "search"): + q = (row["name"], row["md5"]) else: q = row[self.column_name] @@ -218,7 +234,9 @@ def load(self, *, allow_empty=False): self.pickfile = pickfile if not r.fieldnames: if not allow_empty: - raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'") + raise ValueError( + f"empty or improperly formatted pickfile '{pickfile}'" + ) else: return 0, 0 diff --git a/src/sourmash/plugins.py b/src/sourmash/plugins.py index 4c18f27533..0871154f2d 100644 --- a/src/sourmash/plugins.py +++ b/src/sourmash/plugins.py @@ -18,7 +18,7 @@ import itertools import argparse -from .logging import (debug_literal, error, notify, set_quiet) +from .logging import debug_literal, error, notify, set_quiet # cover for older versions of Python that don't support selection on load # (the 'group=' below). @@ -26,20 +26,22 @@ # load 'load_from' entry points. NOTE: this executes on import of this module. try: - _plugin_load_from = entry_points(group='sourmash.load_from') + _plugin_load_from = entry_points(group="sourmash.load_from") except TypeError: from importlib_metadata import entry_points - _plugin_load_from = entry_points(group='sourmash.load_from') + + _plugin_load_from = entry_points(group="sourmash.load_from") # load 'save_to' entry points as well. -_plugin_save_to = entry_points(group='sourmash.save_to') +_plugin_save_to = entry_points(group="sourmash.save_to") # aaaaand CLI entry points: -_plugin_cli = entry_points(group='sourmash.cli_script') +_plugin_cli = entry_points(group="sourmash.cli_script") _plugin_cli_once = False ### + def get_load_from_functions(): "Load the 'load_from' plugins and yield tuples (priority, name, fn)." debug_literal(f"load_from plugins: {_plugin_load_from}") @@ -49,11 +51,13 @@ def get_load_from_functions(): try: loader_fn = plugin.load() except (ModuleNotFoundError, AttributeError) as e: - debug_literal(f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}") + debug_literal( + f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}" + ) continue # get 'priority' if it is available - priority = getattr(loader_fn, 'priority', DEFAULT_LOAD_FROM_PRIORITY) + priority = getattr(loader_fn, "priority", DEFAULT_LOAD_FROM_PRIORITY) # retrieve name (which is specified by plugin?) name = plugin.name @@ -70,11 +74,13 @@ def get_save_to_functions(): try: save_cls = plugin.load() except (ModuleNotFoundError, AttributeError) as e: - debug_literal(f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}") + debug_literal( + f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}" + ) continue # get 'priority' if it is available - priority = getattr(save_cls, 'priority', DEFAULT_SAVE_TO_PRIORITY) + priority = getattr(save_cls, "priority", DEFAULT_SAVE_TO_PRIORITY) # retrieve name (which is specified by plugin?) name = plugin.name @@ -88,17 +94,16 @@ class CommandLinePlugin: Subclasses should call super().__init__(parser) and super().main(args). 
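    A minimal subclass sketch (names are illustrative):

        class Hello(CommandLinePlugin):
            command = 'hello'
            description = 'say hello'

            def __init__(self, parser):
                super().__init__(parser)
                parser.add_argument('name')

            def main(self, args):
                super().main(args)
                print(f"hello, {args.name}!")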
""" + command = None description = None def __init__(self, parser): parser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) parser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) def main(self, args): @@ -116,14 +121,18 @@ def get_cli_script_plugins(): script_cls = plugin.load() except (ModuleNotFoundError, AttributeError): if _plugin_cli_once is False: - error(f"ERROR: cannot find or load module for cli_script plugin '{name}'") + error( + f"ERROR: cannot find or load module for cli_script plugin '{name}'" + ) continue - command = getattr(script_cls, 'command', None) + command = getattr(script_cls, "command", None) if command is None: # print error message only once... if _plugin_cli_once is False: - error(f"ERROR: no command provided by cli_script plugin '{name}' from {mod}; skipping") + error( + f"ERROR: no command provided by cli_script plugin '{name}' from {mod}; skipping" + ) else: x.append(plugin) @@ -137,8 +146,8 @@ def get_cli_scripts_descriptions(): name = plugin.name script_cls = plugin.load() - command = getattr(script_cls, 'command') - description = getattr(script_cls, 'description', "") + command = getattr(script_cls, "command") + description = getattr(script_cls, "description", "") if description: description = description.splitlines()[0] if not description: @@ -155,18 +164,21 @@ def add_cli_scripts(parser): name = plugin.name script_cls = plugin.load() - usage = getattr(script_cls, 'usage', None) - description = getattr(script_cls, 'description', None) - epilog = getattr(script_cls, 'epilog', None) - formatter_class = getattr(script_cls, 'formatter_class', - argparse.HelpFormatter) - - subparser = parser.add_parser(script_cls.command, - usage=usage, - description=description, - epilog=epilog, - formatter_class=formatter_class) - debug_literal(f"cls_script plugin '{name}' adding command '{script_cls.command}'") + usage = getattr(script_cls, "usage", None) + description = getattr(script_cls, "description", None) + epilog = getattr(script_cls, "epilog", None) + formatter_class = getattr(script_cls, "formatter_class", argparse.HelpFormatter) + + subparser = parser.add_parser( + script_cls.command, + usage=usage, + description=description, + epilog=epilog, + formatter_class=formatter_class, + ) + debug_literal( + f"cls_script plugin '{name}' adding command '{script_cls.command}'" + ) obj = script_cls(subparser) d[script_cls.command] = obj @@ -174,9 +186,7 @@ def add_cli_scripts(parser): def list_all_plugins(): - plugins = itertools.chain(_plugin_load_from, - _plugin_save_to, - _plugin_cli) + plugins = itertools.chain(_plugin_load_from, _plugin_save_to, _plugin_cli) plugins = list(plugins) if not plugins: @@ -185,7 +195,9 @@ def list_all_plugins(): notify("") notify("the following plugins are installed:") notify("") - notify(f"{'plugin type':<20s} {'from python module':<30s} {'v':<5s} {'entry point name':<20s}") + notify( + f"{'plugin type':<20s} {'from python module':<30s} {'v':<5s} {'entry point name':<20s}" + ) notify(f"{'-'*20} {'-'*30} {'-'*5} {'-'*20}") for plugin in plugins: diff --git a/src/sourmash/save_load.py b/src/sourmash/save_load.py index f7109f0fb1..1f73c116c7 100644 --- a/src/sourmash/save_load.py +++ b/src/sourmash/save_load.py @@ -43,7 +43,7 @@ from .sbtmh import load_sbt_index from .lca.lca_db import load_single_database from . 
import signature as sigmod -from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) +from .index import LinearIndex, ZipFileLinearIndex, MultiIndex from .manifest import CollectionManifest @@ -74,16 +74,18 @@ def SaveSignaturesToLocation(location): with SaveSignaturesToLocation(filename_or_location) as save_sigs: save_sigs.add(sig_obj) """ - save_list = itertools.chain(_save_classes, - sourmash_plugins.get_save_to_functions()) - for priority, cls in sorted(save_list, key=lambda x:x[0]): + save_list = itertools.chain(_save_classes, sourmash_plugins.get_save_to_functions()) + for priority, cls in sorted(save_list, key=lambda x: x[0]): debug_literal(f"trying to match save function {cls}, priority={priority}") if cls.matches(location): debug_literal(f"{cls} is a match!") return cls(location) - raise Exception(f"cannot determine how to open location {location} for saving; this should never happen!?") + raise Exception( + f"cannot determine how to open location {location} for saving; this should never happen!?" + ) + ### Implementation machinery for _load_databases @@ -101,18 +103,19 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): plugin_fns = sourmash_plugins.get_load_from_functions() # aggregate with default load_from functions & sort by priority - load_from_functions = sorted(itertools.chain(_loader_functions, - plugin_fns)) - + load_from_functions = sorted(itertools.chain(_loader_functions, plugin_fns)) + # iterate through loader functions, sorted by priority; try them all. # Catch ValueError & IndexNotLoaded but nothing else. - for (priority, desc, load_fn) in load_from_functions: + for priority, desc, load_fn in load_from_functions: db = None try: - debug_literal(f"_load_databases: trying loader fn - priority {priority} - '{desc}'") - db = load_fn(filename, - traverse_yield_all=traverse_yield_all, - cache_size=cache_size) + debug_literal( + f"_load_databases: trying loader fn - priority {priority} - '{desc}'" + ) + db = load_fn( + filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size + ) except (ValueError, IndexNotLoaded): debug_literal(f"_load_databases: FAIL with ValueError: on fn {desc}.") debug_literal(traceback.format_exc()) @@ -126,16 +129,20 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if loaded: assert db is not None return db - + raise ValueError(f"Error while reading signatures from '{filename}'.") _loader_functions = [] + + def add_loader(name, priority): "decorator to add name/priority to _loader_functions" + def dec_priority(func): _loader_functions.append((priority, name, func)) return func + return dec_priority @@ -143,10 +150,10 @@ def dec_priority(func): def _load_stdin(filename, **kwargs): "Load collection from .sig file streamed in via stdin" db = None - if filename == '-': + if filename == "-": # load as LinearIndex, then pass into MultiIndex to generate a # manifest. - lidx = LinearIndex.load(sys.stdin, filename='-') + lidx = LinearIndex.load(sys.stdin, filename="-") db = MultiIndex.load((lidx,), (None,), parent="-") return db @@ -175,7 +182,7 @@ def _multiindex_load_from_pathlist(filename, **kwargs): @add_loader("load from path (file or directory)", 40) def _multiindex_load_from_path(filename, **kwargs): "Load collection from a directory." 
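
The `add_loader` decorator and the priority-sorted loop in `_load_database` above form a small loader registry: functions register under a `(priority, name)` pair, and the dispatcher tries them in order until one succeeds. Here is a self-contained sketch of that pattern; the names `_REGISTRY`, `load_any`, and `_load_dict` are illustrative only, not sourmash API.

```python
_REGISTRY = []

def add_loader(name, priority):
    "Decorator: register a loader function under (priority, name)."
    def dec_priority(func):
        _REGISTRY.append((priority, name, func))
        return func
    return dec_priority

@add_loader("demo: dict passthrough", 10)
def _load_dict(source, **kwargs):
    if not isinstance(source, dict):
        raise ValueError("not a dict")
    return source

def load_any(source, **kwargs):
    "Try registered loaders in priority order; first success wins."
    for priority, desc, load_fn in sorted(_REGISTRY, key=lambda t: t[0]):
        try:
            return load_fn(source, **kwargs)
        except ValueError:
            continue  # this loader declined; try the next one
    raise ValueError(f"no loader succeeded for {source!r}")

assert load_any({"x": 1}) == {"x": 1}
```
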
- traverse_yield_all = kwargs['traverse_yield_all'] + traverse_yield_all = kwargs["traverse_yield_all"] db = MultiIndex.load_from_path(filename, traverse_yield_all) return db @@ -184,7 +191,7 @@ def _multiindex_load_from_path(filename, **kwargs): @add_loader("load SBT", 60) def _load_sbt(filename, **kwargs): "Load collection from an SBT." - cache_size = kwargs.get('cache_size') + cache_size = kwargs.get("cache_size") try: db = load_sbt_index(filename, cache_size=cache_size) @@ -210,11 +217,12 @@ def _load_sqlite_db(filename, **kwargs): def _load_zipfile(filename, **kwargs): "Load collection from a .zip file." db = None - if filename.endswith('.zip'): - traverse_yield_all = kwargs['traverse_yield_all'] + if filename.endswith(".zip"): + traverse_yield_all = kwargs["traverse_yield_all"] try: - db = ZipFileLinearIndex.load(filename, - traverse_yield_all=traverse_yield_all) + db = ZipFileLinearIndex.load( + filename, traverse_yield_all=traverse_yield_all + ) except FileNotFoundError as exc: # turn this into an IndexNotLoaded => proper exception handling by # _load_database. @@ -236,13 +244,17 @@ def _error_on_fastaq(filename, **kwargs): pass if success: - raise Exception(f"Error while reading signatures from '{filename}' - got sequences instead! Is this a FASTA/FASTQ file?") + raise Exception( + f"Error while reading signatures from '{filename}' - got sequences instead! Is this a FASTA/FASTQ file?" + ) ### Implementation machinery for SaveSignaturesToLocation + class Base_SaveSignaturesToLocation: "Base signature saving class. Track location (if any) and count." + def __init__(self, location): self.location = location self.count = 0 @@ -288,14 +300,14 @@ def _get_signatures_from_rust(siglist): # Rust supports multiple. For now, go through serializing # and deserializing the signature! See issue #1167 for more. json_str = sourmash.save_signatures(siglist) - for ss in sourmash.signature.load_signatures(json_str): - yield ss + yield from sourmash.signature.load_signatures(json_str) class SaveSignatures_NoOutput(Base_SaveSignaturesToLocation): "Do not save signatures." + def __repr__(self): - return 'SaveSignatures_NoOutput()' + return "SaveSignatures_NoOutput()" @classmethod def matches(cls, location): @@ -310,6 +322,7 @@ def close(self): class SaveSignatures_Directory(Base_SaveSignaturesToLocation): "Save signatures within a directory, using md5sum names." + def __init__(self, location): super().__init__(location) @@ -320,7 +333,7 @@ def __repr__(self): def matches(cls, location): "anything ending in /" if location: - return location.endswith('/') + return location.endswith("/") def close(self): pass @@ -354,6 +367,7 @@ def add(self, ss): class SaveSignatures_SqliteIndex(Base_SaveSignaturesToLocation): "Save signatures within a directory, using md5sum names." + def __init__(self, location): super().__init__(location) self.location = location @@ -364,14 +378,14 @@ def __init__(self, location): def matches(cls, location): "anything ending in .sqldb" if location: - return location.endswith('.sqldb') + return location.endswith(".sqldb") def __repr__(self): return f"SaveSignatures_SqliteIndex('{self.location}')" def close(self): self.idx.commit() - self.cursor.execute('VACUUM') + self.cursor.execute("VACUUM") self.idx.close() def open(self): @@ -390,11 +404,12 @@ def add(self, add_sig): class SaveSignatures_SigFile(Base_SaveSignaturesToLocation): "Save signatures to a .sig JSON file." 
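
The `SaveSignatures_*` classes above all follow one shape: a classmethod `matches()` claims locations by suffix, and the chosen class is used as a context manager that counts what it saves. A minimal sketch of that dispatch, under the assumption of a single made-up `TextSaver` subclass (not a sourmash class):

```python
class BaseSaver:
    "Track a location and a count; subclasses override open/close/add."
    def __init__(self, location):
        self.location = location
        self.count = 0
    def __enter__(self):
        self.open()
        return self
    def __exit__(self, *exc):
        self.close()
    def open(self):
        pass
    def close(self):
        pass
    def add(self, item):
        self.count += 1

class TextSaver(BaseSaver):
    "Hypothetical saver claiming any location ending in '.txt'."
    @classmethod
    def matches(cls, location):
        return bool(location) and location.endswith(".txt")
    def open(self):
        self._fp = open(self.location, "w")
    def close(self):
        self._fp.close()
    def add(self, item):
        super().add(item)
        self._fp.write(f"{item}\n")

def saver_for(location, classes=(TextSaver,)):
    "Dispatch on matches(), as SaveSignaturesToLocation does above."
    for cls in classes:
        if cls.matches(location):
            return cls(location)
    raise Exception(f"cannot determine how to open location {location}")

with saver_for("demo.txt") as save:
    save.add("hello")
```
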
+ def __init__(self, location): super().__init__(location) self.keep = [] self.compress = 0 - if self.location.endswith('.gz'): + if self.location.endswith(".gz"): self.compress = 1 @classmethod @@ -409,12 +424,12 @@ def open(self): pass def close(self): - if self.location == '-': + if self.location == "-": sourmash.save_signatures(self.keep, sys.stdout) else: # text mode? encode in utf-8 mode = "w" - encoding = 'utf-8' + encoding = "utf-8" # compressed? bytes & binary. if self.compress: @@ -422,8 +437,7 @@ def close(self): mode = "wb" with open(self.location, mode, encoding=encoding) as fp: - sourmash.save_signatures(self.keep, fp, - compression=self.compress) + sourmash.save_signatures(self.keep, fp, compression=self.compress) def add(self, ss): super().add(ss) @@ -432,6 +446,7 @@ def add(self, ss): class SaveSignatures_ZipFile(Base_SaveSignaturesToLocation): "Save compressed signatures in an uncompressed Zip file." + def __init__(self, location): super().__init__(location) self.storage = None @@ -440,7 +455,7 @@ def __init__(self, location): def matches(cls, location): "anything ending in .zip" if location: - return location.endswith('.zip') + return location.endswith(".zip") def __repr__(self): return f"SaveSignatures_ZipFile('{self.location}')" @@ -454,8 +469,7 @@ def close(self): manifest.write_to_csv(manifest_fp, write_header=True) manifest_data = manifest_fp.getvalue().encode("utf-8") - self.storage.save(manifest_name, manifest_data, overwrite=True, - compress=True) + self.storage.save(manifest_name, manifest_data, overwrite=True, compress=True) self.storage.flush() self.storage.close() @@ -476,19 +490,21 @@ def open(self): raise ValueError(f"File '{self.location}' cannot be opened as a zip file.") if not storage.subdir: - storage.subdir = 'signatures' + storage.subdir = "signatures" # now, try to load manifest try: - manifest_data = storage.load('SOURMASH-MANIFEST.csv') + manifest_data = storage.load("SOURMASH-MANIFEST.csv") except (FileNotFoundError, KeyError): # if file already exists must have manifest... if not do_create: - raise ValueError(f"Cannot add to existing zipfile '{self.location}' without a manifest") + raise ValueError( + f"Cannot add to existing zipfile '{self.location}' without a manifest" + ) self.manifest_rows = [] else: # success! decode manifest_data, create manifest rows => append. 
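
The zipfile saver above keeps its manifest as CSV text inside the archive: rows are serialized to a `StringIO`, encoded to UTF-8 bytes for storage, and decoded back on load. A trimmed-down round-trip of that idea, using placeholder column names rather than the full sourmash manifest schema:

```python
import csv
from io import StringIO

rows = [{"name": "sig1", "md5": "abc123"}]

# serialize, then encode -- mirroring manifest_fp.getvalue().encode("utf-8")
buf = StringIO()
w = csv.DictWriter(buf, fieldnames=["name", "md5"])
w.writeheader()
w.writerows(rows)
manifest_data = buf.getvalue().encode("utf-8")  # bytes stored in the zip

# ...and decode on load, mirroring manifest_data.decode("utf-8") above
parsed = list(csv.DictReader(StringIO(manifest_data.decode("utf-8"))))
assert parsed[0]["md5"] == "abc123"
```
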
- manifest_data = manifest_data.decode('utf-8') + manifest_data = manifest_data.decode("utf-8") manifest_fp = StringIO(manifest_data) manifest = CollectionManifest.load_from_csv(manifest_fp) self.manifest_rows = list(manifest._select()) @@ -511,12 +527,13 @@ def add(self, add_sig): md5 = ss.md5sum() storage = self.storage - path = f'{storage.subdir}/{md5}.sig.gz' + path = f"{storage.subdir}/{md5}.sig.gz" location = storage.save(path, buf) # update manifest - row = CollectionManifest.make_manifest_row(ss, location, - include_signature=False) + row = CollectionManifest.make_manifest_row( + ss, location, include_signature=False + ) self.manifest_rows.append(row) super().add(ss) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 3ad36ebe1f..452ca29375 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -26,10 +26,10 @@ from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions STORAGES = { - 'FSStorage': FSStorage, - 'IPFSStorage': IPFSStorage, - 'RedisStorage': RedisStorage, - 'ZipStorage': ZipStorage, + "FSStorage": FSStorage, + "IPFSStorage": IPFSStorage, + "RedisStorage": RedisStorage, + "ZipStorage": ZipStorage, } @@ -103,7 +103,7 @@ def popitem(self): # we just need to select the maximum key/node id (key, _) = max(c for c in common if c[1] == count) except IndexError: - msg = '%s is empty' % self.__class__.__name__ + msg = "%s is empty" % self.__class__.__name__ raise KeyError(msg) from None else: value = self.pop(key) @@ -136,6 +136,7 @@ class SBT(Index): We use two dicts to store the tree structure: One for the internal nodes, and another for the leaves (datasets). """ + is_database = True def __init__(self, factory, *, d=2, storage=None, cache_size=None): @@ -162,6 +163,7 @@ def signatures(self): # if manifest, use it & load using direct path to storage. # this will be faster when using picklists. from .signature import load_one_signature + manifest = self.manifest # iteratively select picklists; no other selection criteria @@ -191,8 +193,16 @@ def _signatures_with_internal(self): ss = k.data yield ss, k._path - def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False, abund=None, picklist=None): + def select( + self, + ksize=None, + moltype=None, + num=0, + scaled=0, + containment=False, + abund=None, + picklist=None, + ): """Make sure this database matches the requested requirements. Will always raise ValueError if a requirement cannot be met. @@ -216,33 +226,45 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, # check ksize. if ksize is not None and db_mh.ksize != ksize: - raise ValueError(f"search ksize {ksize} is different from database ksize {db_mh.ksize}") + raise ValueError( + f"search ksize {ksize} is different from database ksize {db_mh.ksize}" + ) # check moltype. if moltype is not None and db_mh.moltype != moltype: - raise ValueError(f"search moltype {moltype} is different from database moltype {db_mh.moltype}") + raise ValueError( + f"search moltype {moltype} is different from database moltype {db_mh.moltype}" + ) # containment requires 'scaled'. if containment: if not scaled: raise ValueError("'containment' requires 'scaled' in SBT.select'") if not db_mh.scaled: - raise ValueError("cannot search this SBT for containment; signatures are not calculated with scaled") + raise ValueError( + "cannot search this SBT for containment; signatures are not calculated with scaled" + ) # 'num' and 'scaled' do not mix. 
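
The `select()` checks above encode a few invariants: containment needs `scaled` sketches, and `num` and `scaled` databases are mutually exclusive. A condensed standalone checker that mirrors just those error conditions (the function name and flat arguments are illustrative, not the sourmash signature):

```python
def check_select(db_num, db_scaled, *, num=0, scaled=0, containment=False):
    "Raise ValueError when a requested selection cannot be satisfied."
    if containment and not scaled:
        raise ValueError("'containment' requires 'scaled'")
    if containment and not db_scaled:
        raise ValueError("cannot search for containment; not a scaled database")
    if num and db_scaled:
        raise ValueError("database holds 'scaled' sketches, not 'num'")
    if scaled and db_num:
        raise ValueError("database holds 'num' sketches, not 'scaled'")

# a scaled database supports a scaled containment query:
check_select(db_num=0, db_scaled=1000, scaled=2000, containment=True)
```
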
if num: if not db_mh.num: - raise ValueError(f"this database was created with 'scaled' MinHash sketches, not 'num'") + raise ValueError( + "this database was created with 'scaled' MinHash sketches, not 'num'" + ) if num != db_mh.num: raise ValueError(f"num mismatch for SBT: num={num}, {db_mh.num}") if scaled: if not db_mh.scaled: - raise ValueError(f"this database was created with 'num' MinHash sketches, not 'scaled'") + raise ValueError( + "this database was created with 'num' MinHash sketches, not 'scaled'" + ) # we can downsample SBTs for containment operations. if scaled > db_mh.scaled and not containment: - raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}") + raise ValueError( + f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}" + ) if abund: raise ValueError("SBT indices do not support sketches with abund=True") @@ -269,9 +291,13 @@ def new_node_pos(self, node): next_internal_node = None if self.next_node <= min_leaf: for i in range(min_leaf): - if all((i not in self._nodes, + if all( + ( + i not in self._nodes, i not in self._leaves, - i not in self._missing_nodes)): + i not in self._missing_nodes, + ) + ): next_internal_node = i break @@ -285,7 +311,7 @@ def new_node_pos(self, node): def insert(self, signature): "Add a new SourmashSignature in to the SBT." from .sbtmh import SigLeaf - + leaf = SigLeaf(signature.md5sum(), signature) self.add_node(leaf) @@ -315,19 +341,19 @@ def add_node(self, node): c1, c2 = self.children(p.pos)[:2] self._leaves[c1.pos] = p.node - self._leaves[c2.pos] = node + self._leaves[c2.pos] = node del self._leaves[p.pos] for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self._leaves[pos] = node + self._leaves[pos] = node node.update(p.node) elif p.node is None: n = Node(self.factory, name="internal." + str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] - self._leaves[c1.pos] = node + self._leaves[c1.pos] = node node.update(n) else: # this branch should never be reached; put guard in to make sure! @@ -375,16 +401,15 @@ def _find_nodes(self, search_fn, *args, **kwargs): # apply search fn. If return false, truncate search. if search_fn(node_g, *args): - # leaf node? it's a match! if isinstance(node_g, Leaf): matches.append(node_g) # internal node? descend. elif isinstance(node_g, Node): - if kwargs.get('dfs', True): # defaults search to dfs + if kwargs.get("dfs", True): # defaults search to dfs for c in self.children(node_p): queue.insert(0, c.pos) - else: # bfs + else: # bfs queue.extend(c.pos for c in self.children(node_p)) if unload_data: @@ -423,8 +448,11 @@ def find(self, search_fn, query, **kwargs): # provide function to downsample leaf_node as well if scaled == tree_scaled: - downsample_node = lambda x: x + + def downsample_node(x): + return x else: + def downsample_node(node_mh): return node_mh.downsample(scaled=scaled) else: @@ -439,8 +467,11 @@ def downsample_node(node_mh): # provide function to downsample leaf nodes. 
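
The hunks above replace `downsample_node` lambdas with named closures. The underlying pattern is a small closure factory: return an identity function when query and tree are already at the same resolution, otherwise return a function that downsamples. A sketch, assuming only that the node object exposes a `downsample(scaled=...)` method as sourmash MinHash objects do:

```python
def make_downsampler(tree_scaled, scaled):
    "Return identity when no downsampling is needed, else a downsampler."
    if scaled == tree_scaled:
        def downsample_node(x):
            return x
    else:
        def downsample_node(node_mh):
            # assumption: node_mh has a downsample(scaled=...) method
            return node_mh.downsample(scaled=scaled)
    return downsample_node

class FakeMH:
    "Stand-in for a sketch object; only carries a scaled value."
    def __init__(self, scaled):
        self.scaled = scaled
    def downsample(self, *, scaled):
        return FakeMH(scaled)

assert make_downsampler(1000, 1000)("same") == "same"
assert make_downsampler(1000, 2000)(FakeMH(1000)).scaled == 2000
```
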
if min_num == a_leaf.data.minhash.num: - downsample_node = lambda x: x + + def downsample_node(x): + return x else: + def downsample_node(node_mh): return node_mh.downsample(num=min_num) @@ -469,23 +500,22 @@ def node_search(node, *args, **kwargs): else: # Node / Nodegraph by minhash comparison # no downsampling needed -- shared_size = node.data.matches(query_mh) - subj_size = node.metadata.get('min_n_below', -1) + subj_size = node.metadata.get("min_n_below", -1) if subj_size == -1: - raise ValueError("ERROR: no min_n_below on this tree, cannot search.") - total_size = subj_size # approximate; do not collect + raise ValueError( + "ERROR: no min_n_below on this tree, cannot search." + ) + total_size = subj_size # approximate; do not collect # calculate score (exact, if leaf; approximate, if not) - score = search_fn.score_fn(query_size, - shared_size, - subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) if search_fn.passes(score): - if is_leaf: # terminal node? keep. + if is_leaf: # terminal node? keep. if search_fn.collect(score, node.data): results[node.data] = score return True - else: # it's a good internal node, keep. + else: # it's a good internal node, keep. return True return False @@ -514,7 +544,7 @@ def _rebuild_node(self, pos=0): # this node was already build, skip return - node = Node(self.factory, name="internal.{}".format(pos)) + node = Node(self.factory, name=f"internal.{pos}") self._nodes[pos] = node for c in self.children(pos): if c.pos in self._missing_nodes or isinstance(c.node, Leaf): @@ -614,8 +644,8 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): full path to the new SBT description """ info = {} - info['d'] = self.d - info['version'] = 6 + info["d"] = self.d + info["version"] = 6 info["index_type"] = self.__class__.__name__ # TODO: check # choose between ZipStorage and FS (file system/directory) storage. @@ -623,22 +653,22 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): kind = None if not path.endswith(".sbt.json"): kind = "Zip" - if not path.endswith('.sbt.zip'): - path += '.sbt.zip' + if not path.endswith(".sbt.zip"): + path += ".sbt.zip" storage = ZipStorage(path, mode="w") backend = "FSStorage" - assert path[-8:] == '.sbt.zip' + assert path[-8:] == ".sbt.zip" name = os.path.basename(path[:-8]) # align the storage prefix with what we do for FSStorage, below. - subdir = '.sbt.{}'.format(name) + subdir = f".sbt.{name}" storage_args = FSStorage("", subdir, make_dirs=False).init_args() storage.save(subdir + "/", b"") storage.subdir = subdir index_filename = os.path.abspath(path) - else: # path.endswith('.sbt.json') - assert path.endswith('.sbt.json') + else: # path.endswith('.sbt.json') + assert path.endswith(".sbt.json") name = os.path.basename(path) name = name[:-9] index_filename = os.path.abspath(path) @@ -649,7 +679,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): location = os.path.dirname(index_filename) # align subdir names with what we do above for ZipStorage - subdir = '.sbt.{}'.format(name) + subdir = f".sbt.{name}" # when we go to default of FSStorage, use full location for # storage, e.g. 
location/.sbt.{name}/ @@ -659,13 +689,10 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): backend = [k for (k, v) in STORAGES.items() if v == type(storage)][0] storage_args = storage.init_args() - info['storage'] = { - 'backend': backend, - 'args': storage_args - } - info['factory'] = { - 'class': GraphFactory.__name__, - 'args': self.factory.init_args() + info["storage"] = {"backend": backend, "args": storage_args} + info["factory"] = { + "class": GraphFactory.__name__, + "args": self.factory.init_args(), } nodes = {} @@ -685,16 +712,16 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): data = { # TODO: start using md5sum instead? - 'filename': os.path.basename(node.name), - 'name': node.name + "filename": os.path.basename(node.name), + "name": node.name, } try: - node.metadata.pop('max_n_below') + node.metadata.pop("max_n_below") except (AttributeError, KeyError): pass - data['metadata'] = node.metadata + data["metadata"] = node.metadata if structure_only is False: # trigger data loading before saving to the new place @@ -703,27 +730,26 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): node.storage = storage if kind == "Zip": - new_name = node.save(os.path.join(subdir, data['filename'])) - assert new_name.startswith(subdir + '/') + new_name = node.save(os.path.join(subdir, data["filename"])) + assert new_name.startswith(subdir + "/") # strip off prefix - new_name = new_name[len(subdir) + 1:] - data['filename'] = new_name + new_name = new_name[len(subdir) + 1 :] + data["filename"] = new_name else: - data['filename'] = node.save(data['filename']) - + data["filename"] = node.save(data["filename"]) if isinstance(node, Node): nodes[i] = data else: leaves[i] = data - row = node.make_manifest_row(data['filename']) + row = node.make_manifest_row(data["filename"]) if row: manifest_rows.append(row) if n % 100 == 0: - notify(f"{format(n+1)} of {format(total_nodes)} nodes saved", end='\r') + notify(f"{format(n+1)} of {format(total_nodes)} nodes saved", end="\r") # now, save the index file and manifests. # @@ -736,8 +762,8 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): # (CTB: manifests are not yet supported for Redis and IPFS) # notify("Finished saving nodes, now saving SBT index file.") - info['nodes'] = nodes - info['signatures'] = leaves + info["nodes"] = nodes + info["signatures"] = leaves # finish constructing manifest object & save manifest = CollectionManifest(manifest_rows) @@ -749,39 +775,46 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): if kind == "Zip": manifest_name = os.path.join(storage.subdir, manifest_name) - manifest_path = storage.save(manifest_name, manifest_data, - overwrite=True, compress=True) + manifest_path = storage.save( + manifest_name, manifest_data, overwrite=True, compress=True + ) elif kind == "FS": manifest_name = manifest_name - manifest_path = storage.save(manifest_name, manifest_data, - overwrite=True) + manifest_path = storage.save(manifest_name, manifest_data, overwrite=True) else: manifest_path = None if manifest_path: - info['manifest_path'] = manifest_path + info["manifest_path"] = manifest_path # now, save index. tree_data = json.dumps(info).encode("utf-8") if kind == "Zip": - save_path = "{}.sbt.json".format(name) + save_path = f"{name}.sbt.json" storage.save(save_path, tree_data, overwrite=True) storage.flush() elif kind == "FS": storage.save(index_filename, tree_data, overwrite=True) else: # save tree locally. 
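
The suffix-driven choice in `save()` above (anything not ending in `.sbt.json` is coerced to a `.sbt.zip` path, with a matching `.sbt.<name>` storage subdir) is pure string logic, so it can be sketched without any I/O. Names here are illustrative:

```python
import os

def sbt_save_paths(path):
    "Mirror the Zip-vs-FS path normalization from SBT.save() above."
    if not path.endswith(".sbt.json"):
        if not path.endswith(".sbt.zip"):
            path += ".sbt.zip"
        name = os.path.basename(path[:-8])  # strip '.sbt.zip'
        kind = "Zip"
    else:
        name = os.path.basename(path)[:-9]  # strip '.sbt.json'
        kind = "FS"
    return kind, name, f".sbt.{name}"

assert sbt_save_paths("mydb") == ("Zip", "mydb", ".sbt.mydb")
assert sbt_save_paths("mydb.sbt.json") == ("FS", "mydb", ".sbt.mydb")
```
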
- with open(index_filename, 'wb') as tree_fp: + with open(index_filename, "wb") as tree_fp: tree_fp.write(tree_data) notify(f"Finished saving SBT index, available at {format(index_filename)}\n") return path - @classmethod - def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning=True, cache_size=None): + def load( + cls, + location, + *, + leaf_loader=None, + storage=None, + print_version_warning=True, + cache_size=None, + ): """Load an SBT description from a file. Parameters @@ -807,8 +840,8 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning if ZipStorage.can_open(location): storage = ZipStorage(location) else: - if not location.endswith('.sbt.zip'): - location2 = location + '.sbt.zip' + if not location.endswith(".sbt.zip"): + location2 = location + ".sbt.zip" if ZipStorage.can_open(location2): storage = ZipStorage(location2) @@ -828,12 +861,12 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning if sbt_name is None: dirname = os.path.dirname(os.path.abspath(location)) sbt_name = os.path.basename(location) - if sbt_name.endswith('.sbt.json'): + if sbt_name.endswith(".sbt.json"): sbt_name = sbt_name[:-9] sbt_fn = os.path.join(dirname, sbt_name) - if not sbt_fn.endswith('.sbt.json') and tempfile is None: - sbt_fn += '.sbt.json' + if not sbt_fn.endswith(".sbt.json") and tempfile is None: + sbt_fn += ".sbt.json" try: with open(sbt_fn) as fp: @@ -846,7 +879,7 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning version = 1 if isinstance(jnodes, Mapping): - version = jnodes['version'] + version = jnodes["version"] if leaf_loader is None: leaf_loader = Leaf.load @@ -865,26 +898,33 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning except KeyError: raise IndexNotSupported() - #if version >= 6: + # if version >= 6: # if jnodes.get("index_type", "SBT") == "LocalizedSBT": # loaders[6] = LocalizedSBT._load_v6 if version < 3 and storage is None: - storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name)) + storage = FSStorage(dirname, f".sbt.{sbt_name}") elif storage is None: - klass = STORAGES[jnodes['storage']['backend']] - if jnodes['storage']['backend'] == "FSStorage": - storage = FSStorage(dirname, jnodes['storage']['args']['path']) + klass = STORAGES[jnodes["storage"]["backend"]] + if jnodes["storage"]["backend"] == "FSStorage": + storage = FSStorage(dirname, jnodes["storage"]["args"]["path"]) elif storage is None: - storage = klass(**jnodes['storage']['args']) - - obj = loader(jnodes, leaf_loader, dirname, storage, print_version_warning=print_version_warning, cache_size=cache_size) + storage = klass(**jnodes["storage"]["args"]) + + obj = loader( + jnodes, + leaf_loader, + dirname, + storage, + print_version_warning=print_version_warning, + cache_size=cache_size, + ) obj._location = location - if 'manifest_path' in jnodes: - manifest_path = jnodes['manifest_path'] + if "manifest_path" in jnodes: + manifest_path = jnodes["manifest_path"] manifest_data = storage.load(manifest_path) - manifest_data = manifest_data.decode('utf-8') + manifest_data = manifest_data.decode("utf-8") manifest_fp = StringIO(manifest_data) obj.manifest = CollectionManifest.load_from_csv(manifest_fp) else: @@ -893,15 +933,22 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning return obj @staticmethod - def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - + def _load_v1( + jnodes, + leaf_loader, 
+ dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): if jnodes[0] is None: raise ValueError("Empty tree!") sbt_nodes = {} sbt_leaves = {} - sample_bf = os.path.join(dirname, jnodes[0]['filename']) + sample_bf = os.path.join(dirname, jnodes[0]["filename"]) ksize, tablesize, ntables = extract_nodegraph_info(sample_bf)[:3] factory = GraphFactory(ksize, tablesize, ntables) @@ -909,10 +956,10 @@ def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=Tru if jnode is None: continue - jnode['filename'] = os.path.join(dirname, jnode['filename']) + jnode["filename"] = os.path.join(dirname, jnode["filename"]) - if 'internal' in jnode['name']: - jnode['factory'] = factory + if "internal" in jnode["name"]: + jnode["factory"] = factory sbt_node = Node.load(jnode, storage) sbt_nodes[i] = sbt_node else: @@ -926,8 +973,17 @@ def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=Tru return tree @classmethod - def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v2( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if nodes[0] is None: raise ValueError("Empty tree!") @@ -935,7 +991,7 @@ def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - sample_bf = os.path.join(dirname, nodes[0]['filename']) + sample_bf = os.path.join(dirname, nodes[0]["filename"]) k, size, ntables = extract_nodegraph_info(sample_bf)[:3] factory = GraphFactory(k, size, ntables) @@ -943,25 +999,34 @@ def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning= if node is None: continue - node['filename'] = os.path.join(dirname, node['filename']) + node["filename"] = os.path.join(dirname, node["filename"]) - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: sbt_node = leaf_loader(node, storage) sbt_leaves[k] = sbt_node - tree = cls(factory, d=info['d'], cache_size=cache_size) + tree = cls(factory, d=info["d"], cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves return tree @classmethod - def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v3( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if not nodes: raise ValueError("Empty tree!") @@ -969,15 +1034,15 @@ def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): if node is None: continue - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: @@ -986,23 +1051,37 @@ def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning= max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], 
storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } if print_version_warning: - error("WARNING: this is an old index version, please run `sourmash migrate` to update it.") - error("WARNING: proceeding with execution, but it will take longer to finish!") + error( + "WARNING: this is an old index version, please run `sourmash migrate` to update it." + ) + error( + "WARNING: proceeding with execution, but it will take longer to finish!" + ) tree._fill_min_n_below() return tree @classmethod - def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v4( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if not nodes: raise ValueError("Empty tree!") @@ -1010,12 +1089,12 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: @@ -1024,20 +1103,30 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning= max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } tree.next_node = max_node return tree @classmethod - def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} - leaves = {int(k): v for (k, v) in info['leaves'].items()} + def _load_v5( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} + leaves = {int(k): v for (k, v) in info["leaves"].items()} if not leaves: raise ValueError("Empty tree!") @@ -1046,17 +1135,17 @@ def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves = {} if storage is None: - klass = STORAGES[info['storage']['backend']] - if info['storage']['backend'] == "FSStorage": - storage = FSStorage(dirname, info['storage']['args']['path']) + klass = STORAGES[info["storage"]["backend"]] + if info["storage"]["backend"] == "FSStorage": + storage = FSStorage(dirname, info["storage"]["args"]["path"]) elif storage is None: - storage = klass(**info['storage']['args']) + storage = klass(**info["storage"]["args"]) - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - node['factory'] = factory + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node @@ -1067,18 
+1156,28 @@ def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves[k] = sbt_leaf max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } return tree @classmethod - def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} - leaves = {int(k): v for (k, v) in info['signatures'].items()} + def _load_v6( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} + leaves = {int(k): v for (k, v) in info["signatures"].items()} if not leaves: raise ValueError("Empty tree!") @@ -1087,17 +1186,17 @@ def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves = {} if storage is None: - klass = STORAGES[info['storage']['backend']] - if info['storage']['backend'] == "FSStorage": - storage = FSStorage(dirname, info['storage']['args']['path']) + klass = STORAGES[info["storage"]["backend"]] + if info["storage"]["backend"] == "FSStorage": + storage = FSStorage(dirname, info["storage"]["args"]["path"]) elif storage is None: - storage = klass(**info['storage']['args']) + storage = klass(**info["storage"]["args"]) - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - node['factory'] = factory + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node @@ -1108,11 +1207,12 @@ def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves[k] = sbt_leaf max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } return tree @@ -1121,31 +1221,31 @@ def _fill_min_n_below(self): Propagate the smallest hash size below each node up the tree from the leaves. 
""" + def fill_min_n_below(node, *args, **kwargs): - original_min_n_below = node.metadata.get('min_n_below', sys.maxsize) + original_min_n_below = node.metadata.get("min_n_below", sys.maxsize) min_n_below = original_min_n_below - children = kwargs['children'] + children = kwargs["children"] for child in children: if child.node is not None: if isinstance(child.node, Leaf): min_n_below = min(len(child.node.data.minhash), min_n_below) else: - child_n = child.node.metadata.get('min_n_below', sys.maxsize) + child_n = child.node.metadata.get("min_n_below", sys.maxsize) min_n_below = min(child_n, min_n_below) if min_n_below == 0: min_n_below = 1 - node.metadata['min_n_below'] = min_n_below + node.metadata["min_n_below"] = min_n_below return original_min_n_below != min_n_below self._fill_up(fill_min_n_below) def _fill_internal(self): - def fill_nodegraphs(node, *args, **kwargs): - children = kwargs['children'] + children = kwargs["children"] for child in children: if child.node is not None: child.node.update(node) @@ -1191,28 +1291,29 @@ def _fill_up(self, search_fn, *args, **kwargs): processed += 1 if processed % 100 == 0: - debug("processed {}, in queue {}", processed, len(queue), sep='\r') + debug("processed {}, in queue {}", processed, len(queue), sep="\r") def __len__(self): return len(self._leaves) def print_dot(self): - print(""" + print( + """ digraph G { nodesep=0.3; ranksep=0.2; margin=0.1; node [shape=ellipse]; edge [arrowsize=0.8]; - """) + """ + ) for i, node in self._nodes.items(): if isinstance(node, Node): - print('"{}" [shape=box fillcolor=gray style=filled]'.format( - node.name)) + print(f'"{node.name}" [shape=box fillcolor=gray style=filled]') for j, child in self.children(i): if child is not None: - print('"{}" -> "{}"'.format(node.name, child.name)) + print(f'"{node.name}" -> "{child.name}"') print("}") def print(self): @@ -1225,8 +1326,9 @@ def print(self): depth = int(math.floor(math.log(node_p + 1, self.d))) print(" " * 4 * depth, node_g) if isinstance(node_g, Node): - stack.extend(c.pos for c in self.children(node_p) - if c.pos not in visited) + stack.extend( + c.pos for c in self.children(node_p) if c.pos not in visited + ) def __iter__(self): for i, node in self._nodes.items(): @@ -1274,14 +1376,14 @@ def combine(self, other): for pos in range(n_previous, n_next): if tree._nodes.get(pos, None) is not None: new_node = copy(tree._nodes[pos]) - new_node.name = "internal.{}".format(current_pos) + new_node.name = f"internal.{current_pos}" new_nodes[current_pos] = new_node elif tree._leaves.get(pos, None) is not None: new_node = copy(tree._leaves[pos]) new_leaves[current_pos] = new_node current_pos += 1 n_previous = n_next - n_next = n_previous + int(self.d ** level) + n_next = n_previous + int(self.d**level) current_pos = n_next # TODO: do we want to return a new tree, or merge into this one? 
@@ -1299,12 +1401,14 @@ def __init__(self, factory, name=None, path=None, storage=None): self._factory = factory self._data = None self._path = path - self.metadata = dict() + self.metadata = {} def __str__(self): - return '*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]'.format( - name=self.name, nb=self.data.n_occupied(), - fpr=calc_expected_collisions(self.data, True, 1.1)) + return "*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]".format( + name=self.name, + nb=self.data.n_occupied(), + fpr=calc_expected_collisions(self.data, True, 1.1), + ) def save(self, path): buf = self.data.to_bytes(compression=1) @@ -1332,21 +1436,22 @@ def unload(self): @staticmethod def load(info, storage=None): - new_node = Node(info['factory'], - name=info['name'], - path=info['filename'], - storage=storage) - new_node.metadata = info.get('metadata', {}) + new_node = Node( + info["factory"], name=info["name"], path=info["filename"], storage=storage + ) + new_node.metadata = info.get("metadata", {}) return new_node def update(self, parent): parent.data.update(self.data) - if 'min_n_below' in self.metadata: - min_n_below = min(parent.metadata.get('min_n_below', sys.maxsize), - self.metadata.get('min_n_below')) + if "min_n_below" in self.metadata: + min_n_below = min( + parent.metadata.get("min_n_below", sys.maxsize), + self.metadata.get("min_n_below"), + ) if min_n_below == 0: min_n_below = 1 - parent.metadata['min_n_below'] = min_n_below + parent.metadata["min_n_below"] = min_n_below class Leaf: @@ -1363,10 +1468,12 @@ def __init__(self, metadata, data=None, name=None, storage=None, path=None): self._path = path def __str__(self): - return '**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}'.format( - name=self.name, metadata=self.metadata, - nb=self.data.n_occupied(), - fpr=calc_expected_collisions(self.data, True, 1.1)) + return "**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}".format( + name=self.name, + metadata=self.metadata, + nb=self.data.n_occupied(), + fpr=calc_expected_collisions(self.data, True, 1.1), + ) def make_manifest_row(self, location): return None @@ -1397,10 +1504,9 @@ def update(self, parent): @classmethod def load(cls, info, storage=None): - return cls(info['metadata'], - name=info['name'], - path=info['filename'], - storage=storage) + return cls( + info["metadata"], name=info["name"], path=info["filename"], storage=storage + ) def filter_distance(filter_a, filter_b, n=1000): @@ -1428,9 +1534,15 @@ def filter_distance(filter_a, filter_b, n=1000): a = array(q, copy=False) b = array(p, copy=False) for i in map(lambda x: randint(0, len(a)), range(n)): - distance += sum(map(int, - [not bool((a[i] >> j) & 1) ^ bool((b[i] >> j) & 1) - for j in range(8)])) + distance += sum( + map( + int, + [ + not bool((a[i] >> j) & 1) ^ bool((b[i] >> j) & 1) + for j in range(8) + ], + ) + ) return distance / (8.0 * len(A) * n) @@ -1438,41 +1550,41 @@ def convert_cmd(name, backend): "Convert an SBT to use a different back end." 
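
`filter_distance()` above estimates how far apart two Bloom filters are by sampling n random byte positions and comparing bits rather than scanning every byte. A compact standalone version of that bit-sampling estimate, written in the plain XOR-popcount form (a sketch of the technique, not the exact expression used above):

```python
from random import randint

def sampled_bit_distance(a: bytes, b: bytes, n=1000):
    "Estimate the fraction of differing bits by sampling n byte positions."
    assert len(a) == len(b) and len(a) > 0
    diff = 0
    for _ in range(n):
        i = randint(0, len(a) - 1)
        diff += bin(a[i] ^ b[i]).count("1")  # differing bits in this byte
    return diff / (8.0 * n)

assert sampled_bit_distance(b"\x00" * 8, b"\xff" * 8) == 1.0
```
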
from .sbtmh import SigLeaf - options = backend.split('(') + options = backend.split("(") backend = options.pop(0) backend = backend.lower().strip("'") kwargs = {} if options: - print(options) - options = options[0].split(')') - options = [options.pop(0)] - #options = {} + print(options) + options = options[0].split(")") + options = [options.pop(0)] + # options = {} else: - options = [] + options = [] - if backend.lower() in ('ipfs', 'ipfsstorage'): + if backend.lower() in ("ipfs", "ipfsstorage"): backend = IPFSStorage - elif backend.lower() in ('redis', 'redisstorage'): + elif backend.lower() in ("redis", "redisstorage"): backend = RedisStorage - elif backend.lower() in ('zip', 'zipstorage'): + elif backend.lower() in ("zip", "zipstorage"): backend = ZipStorage - kwargs['mode'] = 'w' - elif backend.lower() in ('fs', 'fsstorage'): + kwargs["mode"] = "w" + elif backend.lower() in ("fs", "fsstorage"): backend = FSStorage if options: options = [os.path.dirname(options[0]), os.path.basename(options[0])] else: # this is the default for SBT v2 - tag = '.sbt.' + os.path.basename(name) - if tag.endswith('.sbt.json'): + tag = ".sbt." + os.path.basename(name) + if tag.endswith(".sbt.json"): tag = tag[:-9] path = os.path.dirname(name) options = [path, tag] else: - error('backend not recognized: {}'.format(backend)) + error(f"backend not recognized: {backend}") with backend(*options, **kwargs) as storage: sbt = SBT.load(name, leaf_loader=SigLeaf.load) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 42a4fceaa6..1b7a9e7d78 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -15,7 +15,6 @@ class Storage(ABC): - @abc.abstractmethod def save(self, path, content, *, overwrite=False): pass @@ -44,7 +43,6 @@ def can_open(self, location): class FSStorage(Storage): - def __init__(self, location, subdir, make_dirs=True): self.location = location self.subdir = subdir @@ -55,7 +53,7 @@ def __init__(self, location, subdir, make_dirs=True): os.makedirs(fullpath) def init_args(self): - return {'path': self.subdir} + return {"path": self.subdir} def save(self, path, content, overwrite=False): "Save a node/leaf." @@ -64,27 +62,27 @@ def save(self, path, content, overwrite=False): if os.path.exists(fullpath): # check for content, if same return path, - with open(fullpath, 'rb') as f: + with open(fullpath, "rb") as f: old_content = f.read() if old_content == content: return path if overwrite: - pass # fine to overwrite file! + pass # fine to overwrite file! 
else: # different content, need to find new path to save newpath = None n = 0 while newpath is None: - testpath = "{}_{}".format(fullpath, n) + testpath = f"{fullpath}_{n}" if os.path.exists(testpath): n += 1 else: # testpath is available, use it as newpath - newpath = "{}_{}".format(path, n) + newpath = f"{path}_{n}" fullpath = os.path.join(self.location, self.subdir, newpath) - with open(fullpath, 'wb') as f: + with open(fullpath, "wb") as f: f.write(content) return newpath @@ -95,7 +93,6 @@ def load(self, path): class ZipStorage(RustObject, Storage): - __dealloc_func__ = lib.zipstorage_free def __init__(self, path, *, mode="r"): @@ -146,7 +143,9 @@ def _filenames(self): def save(self, path, content, *, overwrite=False, compress=False): if self.__inner: - return self.__inner.save(path, content, overwrite=overwrite, compress=compress) + return self.__inner.save( + path, content, overwrite=overwrite, compress=compress + ) raise NotImplementedError() def load(self, path): @@ -155,7 +154,9 @@ def load(self, path): try: size = ffi.new("uintptr_t *") - rawbuf = self._methodcall(lib.zipstorage_load, to_bytes(path), len(path), size) + rawbuf = self._methodcall( + lib.zipstorage_load, to_bytes(path), len(path), size + ) size = size[0] rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, size), size) @@ -182,7 +183,7 @@ def list_sbts(self): return paths def init_args(self): - return {'path': self.path} + return {"path": self.path} def flush(self): if self.__inner: @@ -198,7 +199,6 @@ def can_open(location): class _RwZipStorage(Storage): - def __init__(self, path): self.path = os.path.abspath(path) @@ -212,14 +212,15 @@ def __init__(self, path): # so we need to check some things: if not os.path.exists(self.path): # If the file doesn't exist open it in write mode. - self.zipfile = zipfile.ZipFile(path, mode='w', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + path, mode="w", compression=zipfile.ZIP_STORED + ) else: # If it exists, open it in read mode and prepare a buffer for # new/duplicated items. During close() there are checks to see # how the original file needs to be updated (append new items, # deal with duplicates, and so on) - self.zipfile = zipfile.ZipFile(path, 'r') + self.zipfile = zipfile.ZipFile(path, "r") self.bufferzip = zipfile.ZipFile(BytesIO(), mode="w") self.subdir = "" @@ -250,7 +251,7 @@ def _generate_filename(self, zf, path, content): newpath = None n = 0 while newpath is None: - testpath = "{}_{}".format(path, n) + testpath = f"{path}_{n}" try: matches = self._content_matches(zf, testpath, content) if matches: @@ -260,7 +261,7 @@ def _generate_filename(self, zf, path, content): except KeyError: return testpath, True - assert 0 # should never get here! + assert 0 # should never get here! 
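
`_generate_filename()` above deduplicates by content: reuse an existing archive member when its stored bytes match, otherwise probe `path_0`, `path_1`, ... until a free or matching name turns up. The same logic against a plain dict standing in for the zipfile (illustrative names only):

```python
def generate_filename(store: dict, path: str, content: bytes):
    "Return (name, do_write), deduplicating by stored content."
    if path not in store:
        return path, True
    if store[path] == content:
        return path, False  # identical content already stored; reuse
    n = 0
    while True:
        testpath = f"{path}_{n}"
        if testpath not in store:
            return testpath, True
        if store[testpath] == content:
            return testpath, False
        n += 1

store = {"a.sig": b"old"}
assert generate_filename(store, "a.sig", b"old") == ("a.sig", False)
assert generate_filename(store, "a.sig", b"new") == ("a.sig_0", True)
```
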
def _write_to_zf(self, zf, path, content, *, compress=False): compress_type = zipfile.ZIP_STORED @@ -272,9 +273,9 @@ def _write_to_zf(self, zf, path, content, *, compress=False): # set permissions zi = zf.getinfo(path) - perms = 0o444 << 16 # give a+r access - if path.endswith('/'): - perms = 0o755 << 16 # directories get u+rwx, a+rx + perms = 0o444 << 16 # give a+r access + if path.endswith("/"): + perms = 0o755 << 16 # directories get u+rwx, a+rx zi.external_attr = perms def save(self, path, content, *, overwrite=False, compress=False): @@ -287,15 +288,15 @@ def save(self, path, content, *, overwrite=False, compress=False): newpath, do_write = self._generate_filename(self.zipfile, path, content) if do_write: try: - self._write_to_zf(self.zipfile, newpath, content, - compress=compress) + self._write_to_zf(self.zipfile, newpath, content, compress=compress) except (ValueError, RuntimeError): # Can't write in the zipfile, write in buffer instead # CTB: do we need to generate a new filename wrt to the # bufferzip, too? Not sure this code is working as intended... if self.bufferzip: - self._write_to_zf(self.bufferzip, newpath, content, - compress=compress) + self._write_to_zf( + self.bufferzip, newpath, content, compress=compress + ) else: # Throw error, can't write the data raise ValueError("can't write data") @@ -326,7 +327,7 @@ def close(self): # might not have self.zipfile if was invalid zipfile and __init__ # failed. - if hasattr(self, 'zipfile'): + if hasattr(self, "zipfile"): if self.zipfile is not None or self.bufferzip is not None: self.flush(keep_closed=True) self.zipfile.close() @@ -341,8 +342,9 @@ def flush(self, *, keep_closed=False): if self.zipfile is not None: self.zipfile.close() if not keep_closed: - self.zipfile = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) else: # The complicated one. Need to consider: # - Is there data in the buffer? 
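
`_write_to_zf()` above writes a member with the chosen compression and then patches its permissions via `external_attr` on the `ZipInfo` (the high 16 bits carry the Unix mode). A minimal runnable form of that pattern using the standard library only:

```python
import zipfile
from io import BytesIO

buf = BytesIO()
with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_STORED) as zf:
    # per-member compression override, as the save() path above allows:
    zf.writestr("signatures/demo.sig", b"{}", compress_type=zipfile.ZIP_DEFLATED)
    zi = zf.getinfo("signatures/demo.sig")
    zi.external_attr = 0o444 << 16  # give a+r access, as in the diff above
```

Mutating the `ZipInfo` before the archive closes works because the central directory is written from those records on close.
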
@@ -367,7 +369,9 @@ def flush(self, *, keep_closed=False): if item in duplicated or item in buffer_names: # we prioritize writing data from the buffer to the # final file - self._write_to_zf(final_file, item, self.bufferzip.read(item)) + self._write_to_zf( + final_file, item, self.bufferzip.read(item) + ) else: # it is only in the zipfile, so write from it self._write_to_zf(final_file, item, self.zipfile.read(item)) @@ -379,8 +383,9 @@ def flush(self, *, keep_closed=False): os.unlink(self.path) shutil.move(tempfile.name, self.path) if not keep_closed: - self.zipfile = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) elif new_data: # Since there is no duplicated data, we can # reopen self.zipfile in append mode and write the new data @@ -388,8 +393,9 @@ def flush(self, *, keep_closed=False): if keep_closed: raise Exception("unexpected error") else: - zf = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + zf = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) for item in new_data: self._write_to_zf(zf, item, self.bufferzip.read(item)) self.zipfile = zf @@ -405,9 +411,9 @@ def __del__(self): class IPFSStorage(Storage): - def __init__(self, pin_on_add=True, **kwargs): import ipfshttpclient + self.ipfs_args = kwargs self.pin_on_add = pin_on_add self.api = ipfshttpclient.connect(**self.ipfs_args) @@ -444,9 +450,9 @@ def __exit__(self, type, value, traceback): class RedisStorage(Storage): - def __init__(self, **kwargs): import redis + self.redis_args = kwargs self.conn = redis.Redis(**self.redis_args) diff --git a/src/sourmash/sbtmh.py b/src/sourmash/sbtmh.py index 6cb9cc0135..3fa7aa23f2 100644 --- a/src/sourmash/sbtmh.py +++ b/src/sourmash/sbtmh.py @@ -7,9 +7,12 @@ def load_sbt_index(filename, *, print_version_warning=True, cache_size=None): "Load and return an SBT index." - return SBT.load(filename, leaf_loader=SigLeaf.load, - print_version_warning=print_version_warning, - cache_size=cache_size) + return SBT.load( + filename, + leaf_loader=SigLeaf.load, + print_version_warning=print_version_warning, + cache_size=cache_size, + ) def create_sbt_index(bloom_filter_size=1e5, n_children=2): @@ -29,21 +32,18 @@ def search_sbt_index(tree, query, threshold): for match_sig, similarity in search_sbt_index(tree, query, threshold): ... 
""" - for (score, match, _) in tree.search(query, threshold=threshold, - unload_data=True): + for score, match, _ in tree.search(query, threshold=threshold, unload_data=True): yield match, score class SigLeaf(Leaf): def __str__(self): - return '**Leaf:{name} -> {metadata}'.format( - name=self.name, metadata=self.metadata) + return f"**Leaf:{self.name} -> {self.metadata}" def make_manifest_row(self, loc): from .index import CollectionManifest - row = CollectionManifest.make_manifest_row(self.data, - loc, - include_signature=0) + + row = CollectionManifest.make_manifest_row(self.data, loc, include_signature=0) return row def save(self, path): @@ -58,13 +58,13 @@ def save(self, path): def update(self, parent): mh = self.data.minhash parent.data.update(mh) - min_n_below = parent.metadata.get('min_n_below', sys.maxsize) + min_n_below = parent.metadata.get("min_n_below", sys.maxsize) min_n_below = min(len(mh), min_n_below) if min_n_below == 0: min_n_below = 1 - parent.metadata['min_n_below'] = min_n_below + parent.metadata["min_n_below"] = min_n_below @property def data(self): diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 7b2db8008f..f730d1daf5 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -4,10 +4,10 @@ import csv import numpy as np from enum import Enum -import numpy as np from dataclasses import dataclass -from .signature import SourmashSignature, MinHash +from .minhash import MinHash +from .signature import SourmashSignature from .sketchcomparison import FracMinHashComparison, NumMinHashComparison @@ -42,11 +42,9 @@ class SearchType(Enum): MAX_CONTAINMENT = 3 -def make_jaccard_search_query(*, - do_containment=False, - do_max_containment=False, - best_only=False, - threshold=None): +def make_jaccard_search_query( + *, do_containment=False, do_max_containment=False, best_only=False, threshold=None +): """\ Make a "flat" search object for Jaccard search & containment. """ @@ -81,11 +79,9 @@ def make_containment_query(query_mh, threshold_bp, *, best_only=True): threshold, _ = calc_threshold_from_bp(threshold_bp, scaled, len(query_mh)) if best_only: - search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, - threshold=threshold) + search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, threshold=threshold) else: - search_obj = JaccardSearch(SearchType.CONTAINMENT, - threshold=threshold) + search_obj = JaccardSearch(SearchType.CONTAINMENT, threshold=threshold) return search_obj @@ -94,6 +90,7 @@ class JaccardSearch: """ A class used by Index classes for searching/gathering. """ + def __init__(self, search_type, threshold=None): "Constructor. Takes type of search, and optional threshold." score_fn = None @@ -148,15 +145,13 @@ def score_jaccard(self, query_size, shared_size, subject_size, total_size): return 0 return shared_size / total_size - def score_containment(self, query_size, shared_size, subject_size, - total_size): + def score_containment(self, query_size, shared_size, subject_size, total_size): "Calculate Jaccard containment." if query_size == 0: return 0 return shared_size / query_size - def score_max_containment(self, query_size, shared_size, subject_size, - total_size): + def score_max_containment(self, query_size, shared_size, subject_size, total_size): "Calculate Jaccard max containment." 
min_denom = min(query_size, subject_size) if min_denom == 0: @@ -166,11 +161,13 @@ def score_max_containment(self, query_size, shared_size, subject_size, class JaccardSearchBestOnly(JaccardSearch): "A subclass of JaccardSearch that implements best-only." + def collect(self, score, match): "Raise the threshold to the best match found so far." self.threshold = max(self.threshold, score) return True + @dataclass class BaseResult: """ @@ -179,10 +176,11 @@ class BaseResult: properly initialize a SketchComparison, this class doesn't actually do anything other than define some functions needed by *Result classes. """ + query: SourmashSignature match: SourmashSignature filename: str = None - ignore_abundance: bool = False # optionally ignore abundances + ignore_abundance: bool = False # optionally ignore abundances # need these for scaled result comparisons estimate_ani_ci: bool = False ani_confidence: float = 0.95 @@ -196,18 +194,24 @@ def init_result(self): self.mh2 = self.match.minhash def build_fracminhashcomparison(self): - self.cmp = FracMinHashComparison(self.mh1, self.mh2, cmp_scaled=self.cmp_scaled, - threshold_bp=self.threshold_bp, - ignore_abundance=self.ignore_abundance, - estimate_ani_ci=self.estimate_ani_ci, - ani_confidence=self.ani_confidence) + self.cmp = FracMinHashComparison( + self.mh1, + self.mh2, + cmp_scaled=self.cmp_scaled, + threshold_bp=self.threshold_bp, + ignore_abundance=self.ignore_abundance, + estimate_ani_ci=self.estimate_ani_ci, + ani_confidence=self.ani_confidence, + ) self.cmp_scaled = self.cmp.cmp_scaled self.query_scaled = self.mh1.scaled self.match_scaled = self.mh2.scaled self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def build_numminhashcomparison(self, cmp_num=None): - self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance) + self.cmp = NumMinHashComparison( + self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance + ) self.cmp_num = self.cmp.cmp_num self.query_num = self.mh1.num self.match_num = self.mh2.num @@ -230,7 +234,7 @@ def get_cmpinfo(self): self.filename = self.match_filename self.match_md5 = self.match.md5sum() # set these from self.match_* - self.md5= self.match_md5 + self.md5 = self.match_md5 self.name = self.match_name # could define in PrefetchResult instead, same reasoning as above self.query_abundance = self.mh1.track_abundance @@ -248,8 +252,9 @@ def shorten_md5(self, md5): def to_write(self, columns=[]): # convert comparison attrs into a dictionary # that can be used by csv dictwriter - info = {k: v for k, v in self.__dict__.items() - if k in columns and v is not None} + info = { + k: v for k, v in self.__dict__.items() if k in columns and v is not None + } return info def init_dictwriter(self, csv_handle): @@ -279,13 +284,22 @@ class SearchResult(BaseResult): """ SearchResult class supports 'sourmash search' operations. 
""" + similarity: float = None cmp_num: int = None searchtype: SearchType = None - #columns for standard SearchResult output - search_write_cols = ['similarity', 'md5', 'filename', 'name', # here we use 'filename' - 'query_filename', 'query_name', 'query_md5', 'ani'] + # columns for standard SearchResult output + search_write_cols = [ + "similarity", + "md5", + "filename", + "name", # here we use 'filename' + "query_filename", + "query_name", + "query_md5", + "ani", + ] ci_cols = ["ani_low", "ani_high"] @@ -297,10 +311,10 @@ def init_sigcomparison(self): self.build_fracminhashcomparison() elif any([self.mh1.num, self.mh2.num]): self.build_numminhashcomparison(cmp_num=self.cmp_num) - self.get_cmpinfo() # grab comparison metadata + self.get_cmpinfo() # grab comparison metadata def __post_init__(self): - self.init_sigcomparison() # build sketch comparison + self.init_sigcomparison() # build sketch comparison self.check_similarity() if self.cmp_scaled is not None and self.searchtype is not None: self.estimate_search_ani() @@ -317,11 +331,13 @@ def check_similarity(self): raise ValueError("Error: Must provide 'similarity' for SearchResult.") def estimate_search_ani(self): - #future: could estimate ANI from abund searches if we want (use query containment?) + # future: could estimate ANI from abund searches if we want (use query containment?) if self.cmp_scaled is None: raise TypeError("Error: ANI can only be estimated from scaled signatures.") if self.searchtype == SearchType.CONTAINMENT: - self.cmp.estimate_ani_from_mh1_containment_in_mh2(containment = self.similarity) + self.cmp.estimate_ani_from_mh1_containment_in_mh2( + containment=self.similarity + ) self.ani = self.cmp.ani_from_mh1_containment_in_mh2 if self.estimate_ani_ci: self.ani_low = self.cmp.ani_from_mh1_containment_in_mh2_low @@ -347,16 +363,38 @@ class PrefetchResult(BaseResult): """ # current prefetch columns - prefetch_write_cols = ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match', - 'f_match_query', 'match_filename', 'match_name', # here we use 'match_filename' - 'match_md5', 'match_bp', 'query_filename', 'query_name', - 'query_md5', 'query_bp', 'ksize', 'moltype', 'scaled', - 'query_n_hashes', 'query_abundance', 'query_containment_ani', - 'match_containment_ani', 'average_containment_ani', 'max_containment_ani', - 'potential_false_negative'] #'match_abundance' - - ci_cols = ["query_containment_ani_low", "query_containment_ani_high", - "match_containment_ani_low", "match_containment_ani_high"] + prefetch_write_cols = [ + "intersect_bp", + "jaccard", + "max_containment", + "f_query_match", + "f_match_query", + "match_filename", + "match_name", # here we use 'match_filename' + "match_md5", + "match_bp", + "query_filename", + "query_name", + "query_md5", + "query_bp", + "ksize", + "moltype", + "scaled", + "query_n_hashes", + "query_abundance", + "query_containment_ani", + "match_containment_ani", + "average_containment_ani", + "max_containment_ani", + "potential_false_negative", + ] #'match_abundance' + + ci_cols = [ + "query_containment_ani_low", + "query_containment_ani_high", + "match_containment_ani_low", + "match_containment_ani_high", + ] prefetch_write_cols_ci = prefetch_write_cols + ci_cols @@ -366,8 +404,10 @@ def init_sigcomparison(self): if all([self.mh1.scaled, self.mh2.scaled]): self.build_fracminhashcomparison() else: - raise TypeError("Error: prefetch and gather results must be between scaled signatures.") - self.get_cmpinfo() # grab comparison metadata + raise TypeError( + "Error: prefetch and gather 
results must be between scaled signatures." + ) + self.get_cmpinfo() # grab comparison metadata self.intersect_bp = self.cmp.total_unique_intersect_hashes self.max_containment = self.cmp.max_containment self.query_bp = self.mh1.unique_dataset_hashes @@ -394,8 +434,12 @@ def handle_ani_ci(self): def build_prefetch_result(self): # unique prefetch values self.jaccard = self.cmp.jaccard - self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh) - self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh) + self.f_query_match = ( + self.cmp.mh2_containment_in_mh1 + ) # db_mh.contained_by(query_mh) + self.f_match_query = ( + self.cmp.mh1_containment_in_mh2 + ) # query_mh.contained_by(db_mh) # set write columns for prefetch result self.write_cols = self.prefetch_write_cols if self.estimate_ani_ci: @@ -433,50 +477,80 @@ class GatherResult(PrefetchResult): sum_weighted_found: int = None total_weighted_hashes: int = None - gather_write_cols = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', - 'f_unique_weighted','average_abund', - 'median_abund', 'std_abund', 'filename', - 'name', 'md5', - 'f_match_orig', 'unique_intersect_bp', - 'gather_result_rank', 'remaining_bp', - 'query_filename', 'query_name', 'query_md5', - 'query_bp', 'ksize', 'moltype', 'scaled', - 'query_n_hashes', 'query_abundance', - 'query_containment_ani', - 'match_containment_ani', - 'average_containment_ani', - 'max_containment_ani', - 'potential_false_negative', - 'n_unique_weighted_found', - 'sum_weighted_found', - 'total_weighted_hashes'] - - ci_cols = ["query_containment_ani_low", "query_containment_ani_high", - "match_containment_ani_low", "match_containment_ani_high"] + gather_write_cols = [ + "intersect_bp", + "f_orig_query", + "f_match", + "f_unique_to_query", + "f_unique_weighted", + "average_abund", + "median_abund", + "std_abund", + "filename", + "name", + "md5", + "f_match_orig", + "unique_intersect_bp", + "gather_result_rank", + "remaining_bp", + "query_filename", + "query_name", + "query_md5", + "query_bp", + "ksize", + "moltype", + "scaled", + "query_n_hashes", + "query_abundance", + "query_containment_ani", + "match_containment_ani", + "average_containment_ani", + "max_containment_ani", + "potential_false_negative", + "n_unique_weighted_found", + "sum_weighted_found", + "total_weighted_hashes", + ] + + ci_cols = [ + "query_containment_ani_low", + "query_containment_ani_high", + "match_containment_ani_low", + "match_containment_ani_high", + ] gather_write_cols_ci = gather_write_cols + ci_cols def init_gathersketchcomparison(self): # compare remaining gather hashes with match. Force at cmp_scaled. Force match flatten(), bc we don't need abunds. 
- self.gather_comparison = FracMinHashComparison(self.gather_querymh, self.match.minhash.flatten()) + self.gather_comparison = FracMinHashComparison( + self.gather_querymh, self.match.minhash.flatten() + ) def check_gatherresult_input(self): # check we have what we need: if self.cmp_scaled is None: - raise ValueError("Error: must provide comparison scaled value ('cmp_scaled') for GatherResult") + raise ValueError( + "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult" + ) if self.gather_querymh is None: - raise ValueError("Error: must provide current gather sketch (remaining hashes) for GatherResult") + raise ValueError( + "Error: must provide current gather sketch (remaining hashes) for GatherResult" + ) if self.gather_result_rank is None: raise ValueError("Error: must provide 'gather_result_rank' to GatherResult") - if not self.total_weighted_hashes: # catch total_weighted_hashes = 0 as well - raise ValueError("Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult") + if not self.total_weighted_hashes: # catch total_weighted_hashes = 0 as well + raise ValueError( + "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" + ) if not self.orig_query_abunds: - raise ValueError("Error: must provide original query abundances ('orig_query_abunds') to GatherResult") + raise ValueError( + "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" + ) def build_gather_result(self): # build gather-specific attributes - + # the 'query' that is passed into gather is all _matched_ hashes, after subtracting noident_mh # this affects estimation of original query information, and requires us to pass in orig_query_len and orig_query_abunds. # we also need to overwrite self.query_bp, self.query_n_hashes, and self.query_abundance @@ -486,43 +560,70 @@ def build_gather_result(self): # calculate intersection with query hashes: self.unique_intersect_bp = self.gather_comparison.total_unique_intersect_hashes - + # calculate fraction of subject match with orig query self.f_match_orig = self.cmp.mh2_containment_in_mh1 # calculate fractions wrt first denominator - genome size - self.f_match = self.gather_comparison.mh2_containment_in_mh1 # unique match containment + self.f_match = ( + self.gather_comparison.mh2_containment_in_mh1 + ) # unique match containment self.f_orig_query = len(self.cmp.intersect_mh) / self.orig_query_len - assert self.gather_comparison.intersect_mh.contained_by(self.gather_comparison.mh1_cmp) == 1.0 - + assert ( + self.gather_comparison.intersect_mh.contained_by( + self.gather_comparison.mh1_cmp + ) + == 1.0 + ) + # calculate fractions wrt second denominator - metagenome size - assert self.gather_comparison.intersect_mh.contained_by(self.gather_comparison.mh2_cmp) == 1.0 - self.f_unique_to_query = len(self.gather_comparison.intersect_mh)/self.orig_query_len + assert ( + self.gather_comparison.intersect_mh.contained_by( + self.gather_comparison.mh2_cmp + ) + == 1.0 + ) + self.f_unique_to_query = ( + len(self.gather_comparison.intersect_mh) / self.orig_query_len + ) # here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled) - self.remaining_bp = (self.gather_comparison.mh1_cmp.unique_dataset_hashes - self.gather_comparison.total_unique_intersect_hashes) + self.remaining_bp = ( + self.gather_comparison.mh1_cmp.unique_dataset_hashes + - self.gather_comparison.total_unique_intersect_hashes + ) # calculate stats on abundances, if desired. 
self.average_abund, self.median_abund, self.std_abund = None, None, None if not self.ignore_abundance: - self.query_weighted_unique_intersection = self.gather_comparison.weighted_intersection(from_abundD = self.orig_query_abunds) + self.query_weighted_unique_intersection = ( + self.gather_comparison.weighted_intersection( + from_abundD=self.orig_query_abunds + ) + ) self.average_abund = self.query_weighted_unique_intersection.mean_abundance self.median_abund = self.query_weighted_unique_intersection.median_abundance self.std_abund = self.query_weighted_unique_intersection.std_abundance # 'query' will be flattened by default. reset track abundance if we have abunds - self.query_abundance = self.query_weighted_unique_intersection.track_abundance + self.query_abundance = ( + self.query_weighted_unique_intersection.track_abundance + ) # calculate scores weighted by abundances - self.n_unique_weighted_found = self.query_weighted_unique_intersection.sum_abundances - self.f_unique_weighted = self.n_unique_weighted_found / self.total_weighted_hashes + self.n_unique_weighted_found = ( + self.query_weighted_unique_intersection.sum_abundances + ) + self.f_unique_weighted = ( + self.n_unique_weighted_found / self.total_weighted_hashes + ) else: self.f_unique_weighted = self.f_unique_to_query self.query_abundance = False def __post_init__(self): self.check_gatherresult_input() - self.init_sigcomparison() # initialize original sketch vs match sketch comparison (inherited from PrefetchResult) - self.init_gathersketchcomparison() # initialize remaining gather sketch vs match sketch comparison - self.build_gather_result() # build gather-specific attributes + self.init_sigcomparison() # initialize original sketch vs match sketch comparison (inherited from PrefetchResult) + self.init_gathersketchcomparison() # initialize remaining gather sketch vs match sketch comparison + self.build_gather_result() # build gather-specific attributes # set write columns for prefetch result self.write_cols = self.gather_write_cols if self.estimate_ani_ci: @@ -550,8 +651,12 @@ def prefetchresultdict(self): if self.estimate_ani_ci: prefetch_cols = self.prefetch_write_cols_ci self.jaccard = self.cmp.jaccard - self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh) - self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh) + self.f_query_match = ( + self.cmp.mh2_containment_in_mh1 + ) # db_mh.contained_by(query_mh) + self.f_match_query = ( + self.cmp.mh1_containment_in_mh2 + ) # query_mh.contained_by(db_mh) self.prep_prefetch_result() return self.to_write(columns=prefetch_cols) @@ -560,14 +665,14 @@ def format_bp(bp): "Pretty-print bp information." bp = float(bp) if bp < 500: - return '{:.0f} bp'.format(bp) + return f"{bp:.0f} bp" elif bp <= 500e3: - return '{:.1f} kbp'.format(round(bp / 1e3, 1)) + return f"{round(bp / 1e3, 1):.1f} kbp" elif bp < 500e6: - return '{:.1f} Mbp'.format(round(bp / 1e6, 1)) + return f"{round(bp / 1e6, 1):.1f} Mbp" elif bp < 500e9: - return '{:.1f} Gbp'.format(round(bp / 1e9, 1)) - return '???' + return f"{round(bp / 1e9, 1):.1f} Gbp" + return "???" 
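The f-string rewrite of `format_bp()` just above is purely cosmetic, so its output is unchanged. A minimal sanity check, assuming an installed sourmash (the thresholds are taken directly from the function body):

```python
# Behavior check for the reformatted format_bp() helper.
# Assumes sourmash is importable; thresholds mirror the function above.
from sourmash.search import format_bp

assert format_bp(450) == "450 bp"       # < 500: print raw bp
assert format_bp(12_500) == "12.5 kbp"  # <= 500e3: kilobases
assert format_bp(3.2e6) == "3.2 Mbp"    # < 500e6: megabases
assert format_bp(6.1e9) == "6.1 Gbp"    # < 500e9: gigabases
```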
def search_databases_with_flat_query(query, databases, **kwargs): @@ -576,7 +681,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): for db in databases: search_iter = db.search(query, **kwargs) - for (score, match, filename) in search_iter: + for score, match, filename in search_iter: md5 = match.md5sum() if md5 not in found_md5: results.append((score, match, filename)) @@ -589,22 +694,27 @@ def search_databases_with_flat_query(query, databases, **kwargs): # repetitive/not optimal - would it be better to produce SearchResult from db.search? estimate_ani_ci = False search_type = SearchType.JACCARD - if kwargs.get('do_containment'): + if kwargs.get("do_containment"): search_type = SearchType.CONTAINMENT - if kwargs.get('estimate_ani_ci'): + if kwargs.get("estimate_ani_ci"): estimate_ani_ci = True - elif kwargs.get('do_max_containment'): + elif kwargs.get("do_max_containment"): search_type = SearchType.MAX_CONTAINMENT - if kwargs.get('estimate_ani_ci'): + if kwargs.get("estimate_ani_ci"): estimate_ani_ci = True x = [] - for (score, match, filename) in results: - x.append(SearchResult(query, match, - similarity=score, - filename = filename, - searchtype=search_type, - estimate_ani_ci=estimate_ani_ci)) + for score, match, filename in results: + x.append( + SearchResult( + query, + match, + similarity=score, + filename=filename, + searchtype=search_type, + estimate_ani_ci=estimate_ani_ci, + ) + ) return x @@ -612,12 +722,14 @@ def search_databases_with_abund_query(query, databases, **kwargs): results = [] found_md5 = set() - if kwargs.get('do_containment') or kwargs.get('do_max_containment'): + if kwargs.get("do_containment") or kwargs.get("do_max_containment"): raise TypeError("containment searches cannot be done with abund sketches") for db in databases: - search_iter = db.search_abund(query, **kwargs) # could return SearchResult here instead of tuple? - for (score, match, filename) in search_iter: + search_iter = db.search_abund( + query, **kwargs + ) # could return SearchResult here instead of tuple? + for score, match, filename in search_iter: md5 = match.md5sum() if md5 not in found_md5: results.append((score, match, filename)) @@ -627,16 +739,16 @@ def search_databases_with_abund_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] - for (score, match, filename) in results: - x.append(SearchResult(query, match, - similarity=score, - filename = filename)) + for score, match, filename in results: + x.append(SearchResult(query, match, similarity=score, filename=filename)) return x + ### ### gather code ### + def _find_best(counters, query, threshold_bp): """ Search for the best containment, return precisely one match. @@ -667,8 +779,17 @@ def _find_best(counters, query, threshold_bp): class GatherDatabases: "Iterator object for doing gather/min-set-cov." - def __init__(self, query, counters, *, - threshold_bp=0, ignore_abundance=False, noident_mh=None, ident_mh=None, estimate_ani_ci=False): + def __init__( + self, + query, + counters, + *, + threshold_bp=0, + ignore_abundance=False, + noident_mh=None, + ident_mh=None, + estimate_ani_ci=False, + ): # track original query information for later usage? track_abundance = query.minhash.track_abundance and not ignore_abundance self.orig_query = query @@ -683,7 +804,7 @@ def __init__(self, query, counters, *, if track_abundance: orig_query_abunds = query_hashes else: - orig_query_abunds = { k: 1 for k in query_hashes } + orig_query_abunds = {k: 1 for k in query_hashes} # adjust for not found... 
if noident_mh is None: # create empty @@ -702,7 +823,7 @@ def __init__(self, query, counters, *, query = query.to_mutable() query.minhash = orig_query_mh - cmp_scaled = query.minhash.scaled # initialize with resolution of query + cmp_scaled = query.minhash.scaled # initialize with resolution of query self.result_n = 0 self.query = query @@ -713,10 +834,12 @@ def __init__(self, query, counters, *, self.orig_query_mh = orig_query_mh self.orig_query_abunds = orig_query_abunds - self.cmp_scaled = 0 # initialize with something very low! + self.cmp_scaled = 0 # initialize with something very low! self._update_scaled(cmp_scaled) - self.estimate_ani_ci = estimate_ani_ci # by default, do not report ANI confidence intervals + self.estimate_ani_ci = ( + estimate_ani_ci # by default, do not report ANI confidence intervals + ) def _update_scaled(self, scaled): max_scaled = max(self.cmp_scaled, scaled) @@ -729,10 +852,12 @@ def _update_scaled(self, scaled): # NOTE: orig_query_abunds can be used w/o downsampling orig_query_abunds = self.orig_query_abunds - self.noident_query_sum_abunds = sum(( orig_query_abunds[k] \ - for k in self.noident_mh.hashes )) - self.total_weighted_hashes = sum(( orig_query_abunds[k] \ - for k in self.orig_query_mh.hashes )) + self.noident_query_sum_abunds = sum( + orig_query_abunds[k] for k in self.noident_mh.hashes + ) + self.total_weighted_hashes = sum( + orig_query_abunds[k] for k in self.orig_query_mh.hashes + ) self.total_weighted_hashes += self.noident_query_sum_abunds if max_scaled != scaled: @@ -753,7 +878,6 @@ def __next__(self): # may be changed: counters = self.counters - cmp_scaled = self.cmp_scaled # will not be changed:: threshold_bp = self.threshold_bp @@ -762,7 +886,7 @@ def __next__(self): # find the best match! best_result, intersect_mh = _find_best(counters, query, threshold_bp) - if not best_result: # no matches at all for this cutoff! + if not best_result: # no matches at all for this cutoff! 
raise StopIteration best_match = best_result.signature @@ -794,24 +918,26 @@ def __next__(self): # compute weighted information for remaining query hashes query_hashes = set(new_query_mh.hashes) - n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) + n_weighted_missed = sum(orig_query_abunds[k] for k in query_hashes) n_weighted_missed += self.noident_query_sum_abunds sum_weighted_found = total_weighted_hashes - n_weighted_missed # build a GatherResult - result = GatherResult(self.orig_query, best_match, - cmp_scaled=scaled, - filename=filename, - gather_result_rank=self.result_n, - gather_querymh=query.minhash, - ignore_abundance=not self.track_abundance, - threshold_bp=threshold_bp, - orig_query_len=orig_query_len, - orig_query_abunds=self.orig_query_abunds, - estimate_ani_ci=self.estimate_ani_ci, - sum_weighted_found=sum_weighted_found, - total_weighted_hashes=total_weighted_hashes, - ) + result = GatherResult( + self.orig_query, + best_match, + cmp_scaled=scaled, + filename=filename, + gather_result_rank=self.result_n, + gather_querymh=query.minhash, + ignore_abundance=not self.track_abundance, + threshold_bp=threshold_bp, + orig_query_len=orig_query_len, + orig_query_abunds=self.orig_query_abunds, + estimate_ani_ci=self.estimate_ani_ci, + sum_weighted_found=sum_weighted_found, + total_weighted_hashes=total_weighted_hashes, + ) self.result_n += 1 self.query = new_query @@ -823,6 +949,7 @@ def __next__(self): ### prefetch code ### + def prefetch_database(query, database, threshold_bp, *, estimate_ani_ci=False): """ Find all matches to `query_mh` >= `threshold_bp` in `database`. @@ -830,7 +957,14 @@ def prefetch_database(query, database, threshold_bp, *, estimate_ani_ci=False): scaled = query.minhash.scaled assert scaled # iterate over all signatures in database, find matches - for result in database.prefetch(query, threshold_bp): # future: could return PrefetchResult directly here - result = PrefetchResult(query, result.signature, threshold_bp=threshold_bp, estimate_ani_ci=estimate_ani_ci) + for result in database.prefetch( + query, threshold_bp + ): # future: could return PrefetchResult directly here + result = PrefetchResult( + query, + result.signature, + threshold_bp=threshold_bp, + estimate_ani_ci=estimate_ani_ci, + ) assert result.pass_threshold yield result diff --git a/src/sourmash/sig/__init__.py b/src/sourmash/sig/__init__.py index 0fafe39246..441c8fa37f 100644 --- a/src/sourmash/sig/__init__.py +++ b/src/sourmash/sig/__init__.py @@ -1,2 +1,2 @@ -from .__main__ import * # bring all functions into top-level +from .__main__ import * # bring all functions into top-level from . 
import grep diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index d10e8745f9..1a89d6239f 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -1,47 +1,55 @@ """ Command-line entry point for 'python -m sourmash.sig' """ -__all__ = ["cat", - "split", - "describe", - "manifest", - "overlap", - "merge", - "intersect", - "inflate", - "subtract", - "rename", - "extract", - "filter", - "flatten", - "downsample", - "ingest", - "export", - "kmers", - "fileinfo", - "check", - "collect"] +__all__ = [ + "cat", + "split", + "describe", + "manifest", + "overlap", + "merge", + "intersect", + "inflate", + "subtract", + "rename", + "extract", + "filter", + "flatten", + "downsample", + "ingest", + "export", + "kmers", + "fileinfo", + "check", + "collect", +] import sys import csv import json import os from collections import defaultdict, namedtuple, Counter -import json import re import screed import sourmash from sourmash.sourmash_args import FileOutput -from sourmash.logging import (set_quiet, error, notify, print_results, debug, - debug_literal, _debug) +from sourmash.logging import ( + set_quiet, + error, + notify, + print_results, + debug, + debug_literal, + _debug, +) from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled from sourmash.manifest import CollectionManifest -usage=''' +usage = """ sourmash signature [] - manipulate/work with signature files. ** Commands can be: @@ -67,15 +75,19 @@ ** Use '-h' to get subcommand-specific help, e.g. sourmash signature merge -h -''' +""" def _check_abundance_compatibility(sig1, sig2): if sig1.minhash.track_abundance != sig2.minhash.track_abundance: - raise ValueError("incompatible signatures: track_abundance is {} in first sig, {} in second".format(sig1.minhash.track_abundance, sig2.minhash.track_abundance)) + raise ValueError( + "incompatible signatures: track_abundance is {} in first sig, {} in second".format( + sig1.minhash.track_abundance, sig2.minhash.track_abundance + ) + ) -def _extend_signatures_with_from_file(args, *, target_attr='signatures'): +def _extend_signatures_with_from_file(args, *, target_attr="signatures"): # extend input signatures with --from-file if args.from_file: more_files = sourmash_args.load_pathlist_from_file(args.from_file) @@ -109,7 +121,7 @@ def cat(args): picklist = sourmash_args.load_picklist(args) pattern_search = sourmash_args.load_include_exclude_db_patterns(args) - encountered_md5sums = defaultdict(int) # used by --unique + encountered_md5sums = defaultdict(int) # used by --unique # open output for saving sigs save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) @@ -119,14 +131,16 @@ def cat(args): # start loading! 
progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force, - pattern=pattern_search) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + pattern=pattern_search, + ) for ss, sigloc in loader: md5 = ss.md5sum() encountered_md5sums[md5] += 1 @@ -135,19 +149,19 @@ def cat(args): save_sigs.add(ss) - notify(f'loaded {len(save_sigs)} signatures total.') + notify(f"loaded {len(save_sigs)} signatures total.") if picklist: sourmash_args.report_picklist(args, picklist) save_sigs.close() - notify(f'output {len(save_sigs)} signatures') + notify(f"output {len(save_sigs)} signatures") - multiple_md5 = [ 1 for cnt in encountered_md5sums.values() if cnt > 1 ] + multiple_md5 = [1 for cnt in encountered_md5sums.values() if cnt > 1] if multiple_md5: - notify(f'encountered {sum(multiple_md5)} MinHashes multiple times') + notify(f"encountered {sum(multiple_md5)} MinHashes multiple times") if args.unique: - notify('...and removed the duplicates, because --unique was specified.') + notify("...and removed the duplicates, because --unique was specified.") def split(args): @@ -160,50 +174,59 @@ def split(args): _extend_signatures_with_from_file(args) output_names = set() - output_scaled_template = '{md5sum}.k={ksize}.scaled={scaled}.{moltype}.dup={dup}.{basename}' + args.extension - output_num_template = '{md5sum}.k={ksize}.num={num}.{moltype}.dup={dup}.{basename}' + args.extension + output_scaled_template = ( + "{md5sum}.k={ksize}.scaled={scaled}.{moltype}.dup={dup}.{basename}" + + args.extension + ) + output_num_template = ( + "{md5sum}.k={ksize}.num={num}.{moltype}.dup={dup}.{basename}" + args.extension + ) if args.output_dir: if not os.path.exists(args.output_dir): - notify(f'Creating --output-dir {args.output_dir}') + notify(f"Creating --output-dir {args.output_dir}") os.mkdir(args.output_dir) progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for sig, sigloc in loader: # save each file individually -- md5sum = sig.md5sum()[:8] minhash = sig.minhash basename = os.path.basename(sig.filename) - if not basename or basename == '-': - basename = 'none' - - params = dict(basename=basename, - md5sum=md5sum, - scaled=minhash.scaled, - ksize=minhash.ksize, - num=minhash.num, - moltype=minhash.moltype) + if not basename or basename == "-": + basename = "none" + + params = dict( + basename=basename, + md5sum=md5sum, + scaled=minhash.scaled, + ksize=minhash.ksize, + num=minhash.num, + moltype=minhash.moltype, + ) if minhash.scaled: output_template = output_scaled_template - else: # num + else: # num assert minhash.num output_template = output_num_template # figure out if this is duplicate, build unique filename n = 0 - params['dup'] = n + params["dup"] = n output_name = output_template.format(**params) while output_name in output_names: - params['dup'] = n + params["dup"] = n output_name = 
output_template.format(**params) n += 1 @@ -218,9 +241,9 @@ def split(args): # save! with sourmash_args.SaveSignaturesToLocation(output_name) as save_sigs: save_sigs.add(sig) - notify(f'writing sig to {output_name}') + notify(f"writing sig to {output_name}") - notify(f'loaded and split {len(progress)} signatures total.') + notify(f"loaded and split {len(progress)} signatures total.") if picklist: sourmash_args.report_picklist(args, picklist) @@ -242,24 +265,39 @@ def describe(args): csv_obj = sourmash_args.FileOutputCSV(args.csv) csv_fp = csv_obj.open() - w = csv.DictWriter(csv_fp, - ['signature_file', 'md5', 'ksize', 'moltype', - 'num', 'scaled', 'n_hashes', 'seed', - 'with_abundance', 'name', 'filename', 'license', - 'sum_hashes'], - extrasaction='ignore') + w = csv.DictWriter( + csv_fp, + [ + "signature_file", + "md5", + "ksize", + "moltype", + "num", + "scaled", + "n_hashes", + "seed", + "with_abundance", + "name", + "filename", + "license", + "sum_hashes", + ], + extrasaction="ignore", + ) w.writeheader() # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force, - pattern=pattern_search) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + pattern=pattern_search, + ) for sig, location in loader: # extract info, write as appropriate. @@ -285,7 +323,8 @@ def describe(args): if w: w.writerow(locals()) - print_results('''\ + print_results( + """\ --- signature filename: {location} signature: {p_name} @@ -295,7 +334,9 @@ def describe(args): size: {n_hashes} sum hashes: {sum_hashes} signature license: {license} -''', **locals()) +""", + **locals(), + ) if csv_obj: csv_obj.close() @@ -311,9 +352,10 @@ def manifest(args): set_quiet(args.quiet, args.debug) try: - loader = sourmash_args.load_file_as_index(args.location, - yield_all_files=args.force) - except ValueError as exc: + loader = sourmash_args.load_file_as_index( + args.location, yield_all_files=args.force + ) + except ValueError: error(f"Cannot open '{args.location}' as a sourmash signature collection.") error("Use -d/--debug for details.") sys.exit(-1) @@ -325,12 +367,11 @@ def manifest(args): else: debug("sig manifest: forcing rebuild.") - manifest = sourmash_args.get_manifest(loader, require=True, - rebuild=rebuild) + manifest = sourmash_args.get_manifest(loader, require=True, rebuild=rebuild) - manifest.write_to_filename(args.output, - database_format=args.manifest_format, - ok_if_exists=args.force) + manifest.write_to_filename( + args.output, database_format=args.manifest_format, ok_if_exists=args.force + ) notify(f"manifest contains {len(manifest)} signatures total.") notify(f"wrote manifest to '{args.output}' ({args.manifest_format})") @@ -343,12 +384,14 @@ def overlap(args): moltype = sourmash_args.calculate_moltype(args) - sig1 = sourmash.load_one_signature(args.signature1, ksize=args.ksize, - select_moltype=moltype) - sig2 = sourmash.load_one_signature(args.signature2, ksize=args.ksize, - select_moltype=moltype) + sig1 = sourmash.load_one_signature( + args.signature1, ksize=args.ksize, select_moltype=moltype + ) + sig2 = sourmash.load_one_signature( + args.signature2, ksize=args.ksize, select_moltype=moltype + ) - notify(f'loaded one signature each from {args.signature1} and 
{args.signature2}') + notify(f"loaded one signature each from {args.signature1} and {args.signature2}") try: similarity = sig1.similarity(sig2) @@ -384,7 +427,8 @@ def overlap(args): disjoint_2 = len(hashes_2 - hashes_1) num_union = len(hashes_1.union(hashes_2)) - print('''\ + print( + """\ first signature: signature filename: {sig1_file} signature: {name1} @@ -408,7 +452,8 @@ def overlap(args): only in first: {disjoint_1} only in second: {disjoint_2} total (union): {num_union} -'''.format(**locals())) +""".format(**locals()) + ) def merge(args): @@ -425,13 +470,15 @@ def merge(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for sigobj, sigloc in loader: # first signature? initialize a bunch of stuff @@ -452,8 +499,12 @@ def merge(args): mh.merge(sigobj_mh) except (TypeError, ValueError) as exc: - error("ERROR when merging signature '{}' ({}) from file {}", - sigobj, sigobj.md5sum()[:8], sigloc) + error( + "ERROR when merging signature '{}' ({}) from file {}", + sigobj, + sigobj.md5sum()[:8], + sigloc, + ) error(str(exc)) sys.exit(-1) @@ -466,7 +517,7 @@ def merge(args): with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: save_sigs.add(merged_sigobj) - notify(f'loaded and merged {len(progress)} signatures') + notify(f"loaded and merged {len(progress)} signatures") if picklist: sourmash_args.report_picklist(args, picklist) @@ -488,13 +539,15 @@ def intersect(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for sigobj, sigloc in loader: if first_sig is None: @@ -519,10 +572,10 @@ def intersect(args): # borrow abundances from a signature? if args.abundances_from: - notify(f'loading signature from {args.abundances_from}, keeping abundances') - abund_sig = sourmash.load_one_signature(args.abundances_from, - ksize=args.ksize, - select_moltype=moltype) + notify(f"loading signature from {args.abundances_from}, keeping abundances") + abund_sig = sourmash.load_one_signature( + args.abundances_from, ksize=args.ksize, select_moltype=moltype + ) if not abund_sig.minhash.track_abundance: error("--track-abundance not set on loaded signature?! 
exiting.") sys.exit(-1) @@ -533,7 +586,7 @@ def intersect(args): with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: save_sigs.add(intersect_sigobj) - notify(f'loaded and intersected {len(progress)} signatures') + notify(f"loaded and intersected {len(progress)} signatures") if picklist: sourmash_args.report_picklist(args, picklist) @@ -546,9 +599,9 @@ def inflate(args): moltype = sourmash_args.calculate_moltype(args) picklist = sourmash_args.load_picklist(args) - inflate_sig = sourmash_args.load_query_signature(args.signature_from, - ksize=args.ksize, - select_moltype=moltype) + inflate_sig = sourmash_args.load_query_signature( + args.signature_from, ksize=args.ksize, select_moltype=moltype + ) inflate_from_mh = inflate_sig.minhash ksize = inflate_from_mh.ksize moltype = inflate_from_mh.moltype @@ -560,19 +613,20 @@ def inflate(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.other_sigs, - ksize=ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.other_sigs, + ksize=ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: for sigobj, sigloc in loader: inflated_mh = sigobj.minhash.inflate(inflate_from_mh) - inflated_sigobj = sourmash.SourmashSignature(inflated_mh, - name=sigobj.name) + inflated_sigobj = sourmash.SourmashSignature(inflated_mh, name=sigobj.name) save_sigs.add(inflated_sigobj) @@ -580,7 +634,7 @@ def inflate(args): error("no signatures to inflate!?") sys.exit(-1) - notify(f'loaded and intersected {len(save_sigs)} signatures') + notify(f"loaded and intersected {len(save_sigs)} signatures") if picklist: sourmash_args.report_picklist(args, picklist) @@ -593,38 +647,41 @@ def subtract(args): moltype = sourmash_args.calculate_moltype(args) from_sigfile = args.signature_from - from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize, select_moltype=moltype) + from_sigobj = sourmash.load_one_signature( + from_sigfile, ksize=args.ksize, select_moltype=moltype + ) - if args.abundances_from: # it's ok to work with abund signatures if -A. + if args.abundances_from: # it's ok to work with abund signatures if -A. args.flatten = True from_mh = from_sigobj.minhash if from_mh.track_abundance and not args.flatten: - error('Cannot use subtract on signatures with abundance tracking, sorry!') + error("Cannot use subtract on signatures with abundance tracking, sorry!") sys.exit(1) subtract_mins = set(from_mh.hashes) - notify(f'loaded signature from {from_sigfile}...', end='\r') + notify(f"loaded signature from {from_sigfile}...", end="\r") progress = sourmash_args.SignatureLoadingProgress() for sigfile in args.subtraction_sigs: - for sigobj in sourmash_args.load_file_as_signatures(sigfile, - ksize=args.ksize, - select_moltype=moltype, - progress=progress): + for sigobj in sourmash_args.load_file_as_signatures( + sigfile, ksize=args.ksize, select_moltype=moltype, progress=progress + ): if not sigobj.minhash.is_compatible(from_mh): error("incompatible minhashes; specify -k and/or molecule type.") sys.exit(-1) if sigobj.minhash.track_abundance and not args.flatten: - error('Cannot use subtract on signatures with abundance tracking, sorry!') + error( + "Cannot use subtract on signatures with abundance tracking, sorry!" 
+ ) sys.exit(1) subtract_mins -= set(sigobj.minhash.hashes) - notify(f'loaded and subtracted signatures from {sigfile}...', end='\r') + notify(f"loaded and subtracted signatures from {sigfile}...", end="\r") if not len(progress): error("no signatures to subtract!?") @@ -636,10 +693,10 @@ def subtract(args): # borrow abundances from somewhere? if args.abundances_from: - notify(f'loading signature from {args.abundances_from}, keeping abundances') - abund_sig = sourmash.load_one_signature(args.abundances_from, - ksize=args.ksize, - select_moltype=moltype) + notify(f"loading signature from {args.abundances_from}, keeping abundances") + abund_sig = sourmash.load_one_signature( + args.abundances_from, ksize=args.ksize, select_moltype=moltype + ) if not abund_sig.minhash.track_abundance: error("--track-abundance not set on loaded signature?! exiting.") sys.exit(-1) @@ -651,7 +708,7 @@ def subtract(args): with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: save_sigs.add(subtract_sigobj) - notify(f'loaded and subtracted {len(progress)} signatures') + notify(f"loaded and subtracted {len(progress)} signatures") def rename(args): @@ -669,14 +726,16 @@ def rename(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force, - pattern=pattern_search) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + pattern=pattern_search, + ) for sigobj, sigloc in loader: sigobj = sigobj.to_mutable() @@ -703,14 +762,15 @@ def extract(args): # further filtering on md5 or name? filter_fn = None if args.md5 is not None or args.name is not None: + def filter_fn(row): # match? keep = False if args.name: - name = row['name'] or row['filename'] + name = row["name"] or row["filename"] if args.name in name: keep = True - if args.md5 and args.md5 in row['md5']: + if args.md5 and args.md5 in row["md5"]: keep = True return keep @@ -722,13 +782,11 @@ def filter_fn(row): # start loading! total_rows_examined = 0 for filename in args.signatures: - idx = sourmash_args.load_file_as_index(filename, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) idx = idx.select(ksize=args.ksize, moltype=moltype) - idx = sourmash_args.apply_picklist_and_pattern(idx, picklist, - pattern_search) + idx = sourmash_args.apply_picklist_and_pattern(idx, picklist, pattern_search) manifest = sourmash_args.get_manifest(idx) total_rows_examined += len(manifest) @@ -743,7 +801,9 @@ def filter_fn(row): try: idx = idx.select(picklist=sub_picklist) except ValueError: - error("** This input collection doesn't support 'extract' with picklists or patterns.") + error( + "** This input collection doesn't support 'extract' with picklists or patterns." + ) error("** EXITING.") error("**") error("** You can use 'sourmash sig cat' with a picklist or pattern,") @@ -779,31 +839,29 @@ def filter(args): save_sigs.open() for filename in args.signatures: - siglist = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - progress=progress) + siglist = sourmash_args.load_file_as_signatures( + filename, ksize=args.ksize, select_moltype=moltype, progress=progress + ) siglist = list(siglist) # select! 
if args.md5 is not None: - siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] + siglist = [ss for ss in siglist if args.md5 in ss.md5sum()] if args.name is not None: - siglist = [ ss for ss in siglist if args.name in str(ss) ] + siglist = [ss for ss in siglist if args.name in str(ss)] for ss in siglist: mh = ss.minhash if not mh.track_abundance: - notify(f'ignoring signature {ss} - track_abundance not set.') + notify(f"ignoring signature {ss} - track_abundance not set.") continue abunds = mh.hashes abunds2 = {} for k, v in abunds.items(): if v >= args.min_abundance: - if args.max_abundance is None or \ - v <= args.max_abundance: - abunds2[k] = v + if args.max_abundance is None or v <= args.max_abundance: + abunds2[k] = v filtered_mh = mh.copy_and_clear() filtered_mh.set_abundances(abunds2) @@ -833,22 +891,24 @@ def flatten(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for ss, sigloc in loader: # select! if args.md5 is not None: if args.md5 not in ss.md5sum(): - continue # skip + continue # skip if args.name is not None: if args.name not in ss.name: - continue # skip + continue # skip ss = ss.to_mutable() ss.minhash = ss.minhash.flatten() @@ -872,11 +932,11 @@ def downsample(args): _extend_signatures_with_from_file(args) if not args.num_hashes and not args.scaled: - error('ERROR: must specify either --num or --scaled value') + error("ERROR: must specify either --num or --scaled value") sys.exit(-1) if args.num_hashes and args.scaled: - error('ERROR: cannot specify both --num and --scaled') + error("ERROR: cannot specify both --num and --scaled") sys.exit(-1) # open output for saving sigs @@ -885,13 +945,15 @@ def downsample(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for ss, sigloc in loader: sigobj = ss.to_mutable() mh = sigobj.minhash @@ -906,7 +968,9 @@ def downsample(args): max_hash = _get_max_hash_for_scaled(args.scaled) mins = mh.hashes if max(mins) < max_hash: - raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") + raise ValueError( + "this num MinHash does not have enough hashes to convert it into a scaled MinHash." 
+ ) mh_new = mh.copy() _set_num_scaled(mh_new, 0, args.scaled) @@ -923,7 +987,6 @@ def downsample(args): mh_new = mh.copy() _set_num_scaled(mh_new, args.num_hashes, 0) - sigobj.minhash = mh_new save_sigs.add(sigobj) @@ -944,7 +1007,7 @@ def ingest(args): siglist = [] if args.csv: for filename in args.filenames: - with open(filename, newline='') as csv_fp: + with open(filename, newline="") as csv_fp: reader = csv.reader(csv_fp) siglist = [] for row in reader: @@ -952,34 +1015,34 @@ def ingest(args): hashseed = int(row[1]) # only support a limited import type, for now ;) - assert hashfn == 'murmur64' + assert hashfn == "murmur64" assert hashseed == 42 _, _, ksize, name, hashes = row ksize = int(ksize) hashes = hashes.strip() - hashes = list(map(int, hashes.split(' ' ))) + hashes = list(map(int, hashes.split(" "))) e = sourmash.MinHash(len(hashes), ksize) e.add_many(hashes) s = sourmash.SourmashSignature(e, filename=name) siglist.append(s) - notify(f'loaded signature: {name} {s.md5sum()[:8]}') + notify(f"loaded signature: {name} {s.md5sum()[:8]}") else: for filename in args.filenames: with open(filename) as fp: x = json.loads(fp.read()) - ksize = x['kmer'] - num = x['sketchSize'] + ksize = x["kmer"] + num = x["sketchSize"] - assert x['hashType'] == "MurmurHash3_x64_128" - assert x['hashBits'] == 64 - assert x['hashSeed'] == 42 + assert x["hashType"] == "MurmurHash3_x64_128" + assert x["hashBits"] == 64 + assert x["hashSeed"] == 42 - xx = x['sketches'][0] - hashes = xx['hashes'] + xx = x["sketches"][0] + hashes = xx["hashes"] mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False) mh.add_many(hashes) @@ -987,7 +1050,7 @@ def ingest(args): s = sourmash.SourmashSignature(mh, filename=filename) siglist.append(s) - notify(f'saving {len(siglist)} signatures to JSON') + notify(f"saving {len(siglist)} signatures to JSON") with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: save_sigs.add_many(siglist) @@ -999,24 +1062,23 @@ def export(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) - query = sourmash_args.load_query_signature(args.filename, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) + query = sourmash_args.load_query_signature( + args.filename, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) mh = query.minhash x = {} - x['kmer'] = mh.ksize - x['sketchSize'] = len(mh) + x["kmer"] = mh.ksize + x["sketchSize"] = len(mh) - x['hashType'] = "MurmurHash3_x64_128" - x['hashBits'] = 64 - x['hashSeed'] = mh.seed + x["hashType"] = "MurmurHash3_x64_128" + x["hashBits"] = 64 + x["hashSeed"] = mh.seed ll = list(mh.hashes) - x['sketches'] = [{ 'hashes': ll }] + x["sketches"] = [{"hashes": ll}] - with FileOutput(args.output, 'wt') as fp: + with FileOutput(args.output, "wt") as fp: print(json.dumps(x), file=fp) notify(f"exported signature {query} ({query.md5sum()[:8]})") @@ -1035,16 +1097,17 @@ def kmers(args): first_sig = None query_mh = None - # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for sigobj, sigloc in loader: # first signature? 
initialize a bunch of stuff @@ -1061,8 +1124,12 @@ def kmers(args): query_mh.merge(sigobj_mh) except (TypeError, ValueError) as exc: - error("ERROR when merging signature '{}' ({}) from file {}", - sigobj, sigobj.md5sum()[:8], sigloc) + error( + "ERROR when merging signature '{}' ({}) from file {}", + sigobj, + sigobj.md5sum()[:8], + sigloc, + ) error(str(exc)) sys.exit(-1) @@ -1075,13 +1142,13 @@ def kmers(args): sourmash_args.report_picklist(args, picklist) is_protein = False - if query_mh.moltype == 'DNA': + if query_mh.moltype == "DNA": if args.translate: error("ERROR: cannot use --translate with DNA sketches.") sys.exit(-1) else: is_protein = True - if args.translate: # input sequence is DNA + if args.translate: # input sequence is DNA is_protein = False if not query_mh: @@ -1089,8 +1156,10 @@ def kmers(args): sys.exit(-1) notify("") - notify(f"merged signature has the following properties:") - notify(f"k={query_mh.ksize} molecule={query_mh.moltype} num={query_mh.num} scaled={query_mh.scaled} seed={query_mh.seed}") + notify("merged signature has the following properties:") + notify( + f"k={query_mh.ksize} molecule={query_mh.moltype} num={query_mh.num} scaled={query_mh.scaled} seed={query_mh.seed}" + ) notify(f"total hashes in merged signature: {len(query_mh)}") notify("") notify("now processing sequence files for matches!") @@ -1103,11 +1172,10 @@ def kmers(args): if args.save_kmers: save_kmers = sourmash_args.FileOutputCSV(args.save_kmers) save_kmers.open() - kmer_w = csv.DictWriter(save_kmers.fp, - fieldnames=['sequence_file', - 'sequence_name', - 'kmer', - 'hashval']) + kmer_w = csv.DictWriter( + save_kmers.fp, + fieldnames=["sequence_file", "sequence_name", "kmer", "hashval"], + ) kmer_w.writeheader() save_seqs = None @@ -1117,7 +1185,7 @@ def kmers(args): # figure out protein vs dna is_protein = False - if query_mh.moltype != 'DNA': + if query_mh.moltype != "DNA": if not args.translate: is_protein = True @@ -1143,12 +1211,11 @@ def kmers(args): seq_mh.add_protein(record.sequence) else: try: - seq_mh.add_sequence(record.sequence, - not args.check_sequence) + seq_mh.add_sequence(record.sequence, not args.check_sequence) except ValueError as exc: seqname = record.name if len(seqname) > 40: - seqname = seqname[:37] + '...' + seqname = seqname[:37] + "..." notify(f"ERROR in sequence '{seqname}', file '{filename}'") notify(str(exc)) if args.force: @@ -1169,15 +1236,19 @@ def kmers(args): # output matching k-mers: if kmer_w: seq = record.sequence - kh_iter = seq_mh.kmers_and_hashes(seq, force=False, - is_protein=is_protein) + kh_iter = seq_mh.kmers_and_hashes( + seq, force=False, is_protein=is_protein + ) for kmer, hashval in kh_iter: if hashval in query_mh.hashes: found_mh.add_hash(hashval) n_kmers_found += 1 - d = dict(sequence_file=filename, - sequence_name=record.name, - kmer=kmer, hashval=hashval) + d = dict( + sequence_file=filename, + sequence_name=record.name, + kmer=kmer, + hashval=hashval, + ) kmer_w.writerow(d) # add seq_mh to found_mh @@ -1188,7 +1259,9 @@ def kmers(args): n_bp_searched += len(record.sequence) if n_bp_searched >= progress_threshold: - notify(f"... searched {n_bp_searched} from {n_files_searched} files so far") + notify( + f"... searched {n_bp_searched} from {n_files_searched} files so far" + ) while n_bp_searched >= progress_threshold: progress_threshold += progress_interval @@ -1205,10 +1278,14 @@ def kmers(args): # ...and report! 
notify("DONE.") - notify(f"searched {n_sequences_searched} sequences from {n_files_searched} files, containing a total of {format_bp(n_bp_searched)}.") + notify( + f"searched {n_sequences_searched} sequences from {n_files_searched} files, containing a total of {format_bp(n_bp_searched)}." + ) if save_seqs: - notify(f"matched and saved a total of {n_sequences_found} sequences with {format_bp(n_bp_saved)}.") + notify( + f"matched and saved a total of {n_sequences_found} sequences with {format_bp(n_bp_saved)}." + ) if kmer_w: notify(f"matched and saved a total of {n_kmers_found} k-mers.") @@ -1226,7 +1303,7 @@ def kmers(args): notify("NOTE: see --save-kmers or --save-sequences for output options.") -_SketchInfo = namedtuple('_SketchInfo', 'ksize, moltype, scaled, num, abund') +_SketchInfo = namedtuple("_SketchInfo", "ksize, moltype, scaled, num, abund") def _summarize_manifest(manifest): @@ -1237,22 +1314,26 @@ def _summarize_manifest(manifest): counter = Counter() hashcounts = Counter() for row in manifest.rows: - ski = _SketchInfo(ksize=row['ksize'], moltype=row['moltype'], - scaled=row['scaled'], num=row['num'], - abund=row['with_abundance']) + ski = _SketchInfo( + ksize=row["ksize"], + moltype=row["moltype"], + scaled=row["scaled"], + num=row["num"], + abund=row["with_abundance"], + ) counter[ski] += 1 - hashcounts[ski] += row['n_hashes'] - total_size += row['n_hashes'] + hashcounts[ski] += row["n_hashes"] + total_size += row["n_hashes"] # store in info_d - info_d['total_hashes'] = total_size + info_d["total_hashes"] = total_size sketch_info = [] for ski, count in counter.items(): sketch_d = dict(ski._asdict()) - sketch_d['count'] = count - sketch_d['n_hashes'] = hashcounts[ski] + sketch_d["count"] = count + sketch_d["n_hashes"] = hashcounts[ski] sketch_info.append(sketch_d) - info_d['sketch_info'] = sketch_info + info_d["sketch_info"] = sketch_info return info_d @@ -1271,22 +1352,21 @@ def fileinfo(args): # load as index! 
try: notify(f"** loading from '{args.path}'") - idx = sourmash_args.load_file_as_index(args.path, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(args.path, yield_all_files=args.force) except ValueError: error(f"Cannot open '{args.path}' as a sourmash signature collection.") error("Use -d/--debug for details.") sys.exit(-1) - print_bool = lambda x: "yes" if x else "no" - print_none = lambda x: "n/a" if x is None else x + def print_bool(x): + return "yes" if x else "no" info_d = {} - info_d['path_filetype'] = type(idx).__name__ - info_d['location'] = "" if not idx.location else idx.location - info_d['is_database'] = bool(idx.is_database) - info_d['has_manifest'] = bool(idx.manifest) - info_d['num_sketches'] = len(idx) + info_d["path_filetype"] = type(idx).__name__ + info_d["location"] = "" if not idx.location else idx.location + info_d["is_database"] = bool(idx.is_database) + info_d["has_manifest"] = bool(idx.manifest) + info_d["num_sketches"] = len(idx) if text_out: print_results(f"path filetype: {info_d['path_filetype']}") @@ -1298,8 +1378,9 @@ def fileinfo(args): # also have arg to fileinfo to force recalculation notify("** examining manifest...") - manifest = sourmash_args.get_manifest(idx, rebuild=args.rebuild_manifest, - require=False) + manifest = sourmash_args.get_manifest( + idx, rebuild=args.rebuild_manifest, require=False + ) if manifest is None: # actually can't find any file type to trigger this, but leaving it @@ -1313,9 +1394,9 @@ def fileinfo(args): print_results(f"total hashes: {info_d['total_hashes']}") print_results("summary of sketches:") - for ski in info_d['sketch_info']: - mh_type = f"num={ski['num']}" if ski['num'] else f"scaled={ski['scaled']}" - mh_abund = ", abund" if ski['abund'] else "" + for ski in info_d["sketch_info"]: + mh_type = f"num={ski['num']}" if ski["num"] else f"scaled={ski['scaled']}" + mh_abund = ", abund" if ski["abund"] else "" sketch_str = f"{ski['count']} sketches with {ski['moltype']}, k={ski['ksize']}, {mh_type}{mh_abund}" @@ -1331,10 +1412,11 @@ def check(args): check signature db(s) against a picklist. """ from sourmash.picklist import PickStyle + set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) picklist = sourmash_args.load_picklist(args) - pattern_search = sourmash_args.load_include_exclude_db_patterns(args) + sourmash_args.load_include_exclude_db_patterns(args) _extend_signatures_with_from_file(args) if not picklist: @@ -1358,8 +1440,7 @@ def check(args): # start loading! total_rows_examined = 0 for filename in args.signatures: - idx = sourmash_args.load_file_as_index(filename, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) idx = idx.select(ksize=args.ksize, moltype=moltype) @@ -1376,14 +1457,15 @@ def check(args): # rewrite locations so that each signature can be found by filename # of its container; this follows `sig collect` logic. 
- rows = [] for row in sub_manifest.rows: - row['internal_location'] = filename + row["internal_location"] = filename total_manifest_rows.add_row(row) # the len(sub_manifest) here should only be run when needed :) if _debug: - debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows") + debug_literal( + f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows" + ) notify(f"loaded {total_rows_examined} signatures.") @@ -1399,7 +1481,7 @@ def check(args): n_output = 0 with sourmash_args.FileInputCSV(pickfile) as r: - with open(args.output_missing, "w", newline='') as outfp: + with open(args.output_missing, "w", newline="") as outfp: w = csv.DictWriter(outfp, fieldnames=r.fieldnames) w.writeheader() @@ -1408,18 +1490,27 @@ def check(args): if not picklist.matched_csv_row(row): n_output += 1 w.writerow(row) - notify(f"saved {n_output} non-matching rows of {n_input} picklist rows to '{args.output_missing}'") + notify( + f"saved {n_output} non-matching rows of {n_input} picklist rows to '{args.output_missing}'" + ) elif args.output_missing: - notify(f"(no remaining picklist entries; not saving to '{args.output_missing}')") + notify( + f"(no remaining picklist entries; not saving to '{args.output_missing}')" + ) # save manifest of matching! if args.save_manifest_matching and total_manifest_rows: mf = total_manifest_rows - mf.write_to_filename(args.save_manifest_matching, - database_format=args.manifest_format) - notify(f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'") + mf.write_to_filename( + args.save_manifest_matching, database_format=args.manifest_format + ) + notify( + f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'" + ) elif args.save_manifest_matching: - notify(f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)") + notify( + f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)" + ) if args.fail_if_missing and n_missing: error("** ERROR: missing values, and --fail-if-missing requested. Exiting.") @@ -1437,15 +1528,17 @@ def collect(args): pass else: error(f"ERROR: '{args.output}' already exists!") - error(f"ERROR: please remove it, or use --merge-previous to merge") + error("ERROR: please remove it, or use --merge-previous to merge") sys.exit(-1) elif args.merge_previous: - notify(f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?") + notify( + f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?" + ) # load previous manifest for --merge-previous. This gets tricky with # mismatched manifest types, which we forbid. try: - if args.manifest_format == 'sql': + if args.manifest_format == "sql": # create on-disk manifest from sourmash.index.sqlite_index import SqliteCollectionManifest @@ -1455,7 +1548,7 @@ def collect(args): collected_mf = SqliteCollectionManifest.create(args.output) else: # create in-memory manifest that will be saved as CSV - assert args.manifest_format == 'csv' + assert args.manifest_format == "csv" if args.merge_previous and os.path.exists(args.output): collected_mf = CollectionManifest.load_from_filename(args.output) @@ -1465,7 +1558,9 @@ def collect(args): if not isinstance(collected_mf, CollectionManifest): raise Exception except: - error(f"ERROR loading '{args.output}' with --merge-previous. Is it of type {args.manifest_format}?") + error( + f"ERROR loading '{args.output}' with --merge-previous. 
Is it of type {args.manifest_format}?" + ) sys.exit(-1) if args.merge_previous: @@ -1482,35 +1577,37 @@ def collect(args): n_files = 0 # load from_file - _extend_signatures_with_from_file(args, target_attr='locations') + _extend_signatures_with_from_file(args, target_attr="locations") # convert to abspath if args.abspath: - args.locations = [ os.path.abspath(iloc) for iloc in args.locations ] + args.locations = [os.path.abspath(iloc) for iloc in args.locations] # iterate through, loading all the manifests from all the locations. for n_files, loc in enumerate(args.locations): notify(f"Loading signature information from {loc}.") if n_files % 100 == 0: - notify(f'... loaded {len(collected_mf)} sigs from {n_files} files') + notify(f"... loaded {len(collected_mf)} sigs from {n_files} files") idx = sourmash.load_file_as_index(loc) if idx.manifest is None and require_manifest: error(f"ERROR on location '{loc}'") - error(f"sig collect requires a manifest by default, but no manifest present.") + error( + "sig collect requires a manifest by default, but no manifest present." + ) error("specify --no-require-manifest to dynamically generate one.") sys.exit(-1) mf = sourmash_args.get_manifest(idx) - rows = [] for row in mf.rows: - row['internal_location'] = loc + row["internal_location"] = loc collected_mf.add_row(row) - if args.manifest_format == 'csv': - collected_mf.write_to_filename(args.output, database_format='csv', - ok_if_exists=args.merge_previous) + if args.manifest_format == "csv": + collected_mf.write_to_filename( + args.output, database_format="csv", ok_if_exists=args.merge_previous + ) else: collected_mf.close() @@ -1522,9 +1619,9 @@ def collect(args): def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") return mainmethod(args) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/src/sourmash/sig/grep.py b/src/sourmash/sig/grep.py index e131ca501e..cfdc857779 100644 --- a/src/sourmash/sig/grep.py +++ b/src/sourmash/sig/grep.py @@ -28,9 +28,13 @@ def main(args): pattern = re.compile(pattern) if args.invert_match: - search_pattern = lambda vals: all(not pattern.search(val) for val in vals) + + def search_pattern(vals): + return all(not pattern.search(val) for val in vals) else: - search_pattern = lambda vals: any(pattern.search(val) for val in vals) + + def search_pattern(vals): + return any(pattern.search(val) for val in vals) # require manifests? require_manifest = True @@ -63,28 +67,27 @@ def main(args): # start loading! total_rows_examined = 0 for filename in args.signatures: - idx = sourmash_args.load_file_as_index(filename, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) - idx = idx.select(ksize=args.ksize, - moltype=moltype, - picklist=picklist) + idx = idx.select(ksize=args.ksize, moltype=moltype, picklist=picklist) # get (and maybe generate) the manifest. manifest = idx.manifest if manifest is None: if require_manifest: error(f"ERROR on filename '{filename}'.") - error("sig grep requires a manifest by default, but no manifest present.") + error( + "sig grep requires a manifest by default, but no manifest present." 
+ ) error("specify --no-require-manifest to dynamically generate one.") sys.exit(-1) else: - manifest = sourmash_args.get_manifest(idx, - require=False) + manifest = sourmash_args.get_manifest(idx, require=False) # find all matching rows. - sub_manifest = manifest.filter_on_columns(search_pattern, - ["name", "filename", "md5"]) + sub_manifest = manifest.filter_on_columns( + search_pattern, ["name", "filename", "md5"] + ) total_rows_examined += len(manifest) # write out to CSV, if desired. @@ -119,7 +122,9 @@ def main(args): notify(f"loaded {total_rows_examined} total that matched ksize & molecule type") if save_sigs: - notify(f"extracted {len(save_sigs)} signatures from {len(args.signatures)} file(s)") + notify( + f"extracted {len(save_sigs)} signatures from {len(args.signatures)} file(s)" + ) save_sigs.close() else: error("no matching signatures found!") diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 1fd34d35e6..3faa5e856b 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -40,12 +40,9 @@ def __init__(self, minhash, name="", filename=""): self.minhash = minhash - @property def minhash(self): - return FrozenMinHash._from_objptr( - self._methodcall(lib.signature_first_mh) - ) + return FrozenMinHash._from_objptr(self._methodcall(lib.signature_first_mh)) @minhash.setter def minhash(self, value): @@ -62,11 +59,11 @@ def __repr__(self): name = self.name md5pref = self.md5sum()[:8] if name == md5pref: - return "SourmashSignature({})".format(md5pref) - else: # name != md5pref: - return "SourmashSignature('{}', {})".format(name, md5pref) + return f"SourmashSignature({md5pref})" + else: # name != md5pref: + return f"SourmashSignature('{name}', {md5pref})" - #def minhashes(self): + # def minhashes(self): # size = ffi.new("uintptr_t *") # mhs_ptr = self._methodcall(lib.signature_get_mhs, size) # size = ffi.unpack(size, 1)[0] @@ -134,40 +131,77 @@ def _display_name(self, max_length=0): def similarity(self, other, ignore_abundance=False, downsample=False): "Compute similarity with the other signature." - return self.minhash.similarity(other.minhash, - ignore_abundance=ignore_abundance, - downsample=downsample) + return self.minhash.similarity( + other.minhash, ignore_abundance=ignore_abundance, downsample=downsample + ) def jaccard(self, other): "Compute Jaccard similarity with the other MinHash signature." - return self.minhash.similarity(other.minhash, ignore_abundance=True, - downsample=False) + return self.minhash.similarity( + other.minhash, ignore_abundance=True, downsample=False + ) - def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1e-3, err_threshold=1e-4): + def jaccard_ani( + self, + other, + *, + downsample=False, + jaccard=None, + prob_threshold=1e-3, + err_threshold=1e-4, + ): "Use jaccard to estimate ANI between two FracMinHash signatures." - return self.minhash.jaccard_ani(other.minhash, downsample=downsample, - jaccard=jaccard, prob_threshold=prob_threshold, - err_threshold=err_threshold) + return self.minhash.jaccard_ani( + other.minhash, + downsample=downsample, + jaccard=jaccard, + prob_threshold=prob_threshold, + err_threshold=err_threshold, + ) def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." 
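Aside: in the `sig grep` hunks above, `--invert-match` flips the selection logic by swapping in a different `search_pattern` function: keep rows where *any* field matches, versus keep only rows where *no* field matches. The closure pattern in isolation (row contents are illustrative, not real signature metadata):

```python
import re

def make_search_pattern(pattern_str, invert=False):
    """Build a row filter in the spirit of sig grep's search_pattern."""
    pattern = re.compile(pattern_str)
    if invert:
        def search_pattern(vals):   # keep rows where no field matches
            return all(not pattern.search(v) for v in vals)
    else:
        def search_pattern(vals):   # keep rows where any field matches
            return any(pattern.search(v) for v in vals)
    return search_pattern

row = {"name": "GCF_000005845 Escherichia coli", "filename": "ecoli.sig", "md5": "deadbeef"}
print(make_search_pattern("coli")(row.values()))               # True
print(make_search_pattern("coli", invert=True)(row.values()))  # False
```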
return self.minhash.contained_by(other.minhash, downsample=downsample) - def containment_ani(self, other, *, downsample=False, containment=None, confidence=0.95, estimate_ci=False): + def containment_ani( + self, + other, + *, + downsample=False, + containment=None, + confidence=0.95, + estimate_ci=False, + ): "Use containment to estimate ANI between two FracMinHash signatures." - return self.minhash.containment_ani(other.minhash, downsample=downsample, - containment=containment, confidence=confidence, - estimate_ci=estimate_ci) + return self.minhash.containment_ani( + other.minhash, + downsample=downsample, + containment=containment, + confidence=confidence, + estimate_ci=estimate_ci, + ) def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample=downsample) - def max_containment_ani(self, other, *, downsample=False, max_containment=None, confidence=0.95, estimate_ci=False): + def max_containment_ani( + self, + other, + *, + downsample=False, + max_containment=None, + confidence=0.95, + estimate_ci=False, + ): "Use max containment to estimate ANI between two FracMinHash signatures." - return self.minhash.max_containment_ani(other.minhash, downsample=downsample, - max_containment=max_containment, confidence=confidence, - estimate_ci=estimate_ci) + return self.minhash.max_containment_ani( + other.minhash, + downsample=downsample, + max_containment=max_containment, + confidence=confidence, + estimate_ci=estimate_ci, + ) def avg_containment(self, other, downsample=False): """ @@ -218,11 +252,7 @@ def __setstate__(self, tup): def __reduce__(self): return ( SourmashSignature, - ( - self.minhash, - self.name, - self.filename - ), + (self.minhash, self.name, self.filename), ) def __copy__(self): @@ -279,6 +309,7 @@ def add_protein(self, sequence): def __copy__(self): return self + copy = __copy__ def to_frozen(self): @@ -325,7 +356,9 @@ def _detect_input_type(data): - Compressed memory buffers - filename """ - if hasattr(data, 'read') or hasattr(data, "fileno") or hasattr(data, "mode"): # file-like object + if ( + hasattr(data, "read") or hasattr(data, "fileno") or hasattr(data, "mode") + ): # file-like object return SigInput.FILE_LIKE elif hasattr(data, "find"): # check if it is uncompressed sig try: @@ -334,7 +367,7 @@ def _detect_input_type(data): except TypeError: if data.find(b"sourmash_signature") > 0: return SigInput.BUFFER - elif data.startswith(b'\x1F\x8B'): # gzip compressed + elif data.startswith(b"\x1F\x8B"): # gzip compressed return SigInput.BUFFER try: @@ -347,7 +380,11 @@ def _detect_input_type(data): def load_signatures( - data, ksize=None, select_moltype=None, ignore_md5sum=False, do_raise=False, + data, + ksize=None, + select_moltype=None, + ignore_md5sum=False, + do_raise=False, ): """Load a JSON string with signatures into classes. @@ -374,14 +411,18 @@ def load_signatures( input_type = _detect_input_type(data) if input_type == SigInput.UNKNOWN: if do_raise: - raise ValueError("Error in parsing signature; quitting. Cannot open file or invalid signature") + raise ValueError( + "Error in parsing signature; quitting. 
Cannot open file or invalid signature" + ) return size = ffi.new("uintptr_t *") try: if input_type == SigInput.FILE_LIKE: - if hasattr(data, "mode") and "t" in data.mode: # need to reopen handler as binary + if ( + hasattr(data, "mode") and "t" in data.mode + ): # need to reopen handler as binary data = data.buffer buf = data.read() @@ -423,7 +464,7 @@ def load_signatures( for sig in sigs: yield sig.to_frozen() - except Exception as e: + except Exception: if do_raise: raise @@ -461,8 +502,9 @@ def save_signatures(siglist, fp=None, compression=0): size = ffi.new("uintptr_t *") # save signature into a string (potentially compressed) - rawbuf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected), - compression, size) + rawbuf = rustcall( + lib.signatures_save_buffer, siglist_c, len(collected), compression, size + ) size = size[0] # associate a finalizer with rawbuf so that it gets freed @@ -472,11 +514,11 @@ def save_signatures(siglist, fp=None, compression=0): else: result = ffi.string(buf, size) - if fp is None: # return string + if fp is None: # return string return result else: - try: # write to file + try: # write to file fp.write(result) except TypeError: - fp.write(result.decode('utf-8')) + fp.write(result.decode("utf-8")) return None diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py index db36d20ac3..e80013edaa 100644 --- a/src/sourmash/sketchcomparison.py +++ b/src/sourmash/sketchcomparison.py @@ -4,14 +4,16 @@ import numpy as np from dataclasses import dataclass -from .signature import MinHash +from .minhash import MinHash + @dataclass class BaseMinHashComparison: """Class for standard comparison between two MinHashes""" + mh1: MinHash mh2: MinHash - ignore_abundance: bool = False # optionally ignore abundances + ignore_abundance: bool = False # optionally ignore abundances jaccard_ani_untrustworthy: bool = False def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None): @@ -34,11 +36,15 @@ def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None): raise ValueError("Error: must pass in a comparison scaled or num value.") def check_compatibility_and_downsample(self, cmp_num=None, cmp_scaled=None): - if not any([(self.mh1.num and self.mh2.num), (self.mh1.scaled and self.mh2.scaled)]): + if not any( + [(self.mh1.num and self.mh2.num), (self.mh1.scaled and self.mh2.scaled)] + ): raise TypeError("Error: Both sketches must be 'num' or 'scaled'.") - #need to downsample first because is_compatible checks scaled (though does not check num) - self.downsample_and_handle_ignore_abundance(cmp_num=cmp_num, cmp_scaled=cmp_scaled) + # need to downsample first because is_compatible checks scaled (though does not check num) + self.downsample_and_handle_ignore_abundance( + cmp_num=cmp_num, cmp_scaled=cmp_scaled + ) if not self.mh1_cmp.is_compatible(self.mh2_cmp): raise TypeError("Error: Cannot compare incompatible sketches.") self.ksize = self.mh1.ksize @@ -69,30 +75,34 @@ def angular_similarity(self): @property def cosine_similarity(self): return self.angular_similarity - + + @dataclass class NumMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two num minhashes""" + cmp_num: int = None def __post_init__(self): "Initialize NumMinHashComparison using values from provided MinHashes" - if self.cmp_num is None: # record the num we're doing this comparison on + if self.cmp_num is None: # record the num we're doing this comparison on self.cmp_num = min(self.mh1.num, self.mh2.num) 
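Aside: `NumMinHashComparison.__post_init__` above downsamples both sketches to the smaller `num` before comparing, since num MinHashes are only comparable at a common size. A sketch of that idea, assuming the documented `sourmash.MinHash(n, ksize)` constructor and `downsample`/`jaccard` methods behave as in current sourmash (the hash values added here are arbitrary):

```python
import sourmash

mh1 = sourmash.MinHash(n=100, ksize=31)
mh2 = sourmash.MinHash(n=50, ksize=31)
for h in range(1, 201):
    mh1.add_hash(h)
    mh2.add_hash(h * 2)

# comparable only at the smaller num, as in NumMinHashComparison.__post_init__
cmp_num = min(mh1.num, mh2.num)
mh1_cmp = mh1.downsample(num=cmp_num)
mh2_cmp = mh2.downsample(num=cmp_num)
print(mh1_cmp.jaccard(mh2_cmp))
```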
self.check_compatibility_and_downsample(cmp_num=self.cmp_num) @property def size_may_be_inaccurate(self): - return False # not using size estimation, can ignore + return False # not using size estimation, can ignore + @dataclass class FracMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two scaled minhashes""" - cmp_scaled: int = None # optionally force scaled value for this comparison + + cmp_scaled: int = None # optionally force scaled value for this comparison threshold_bp: int = 0 estimate_ani_ci: bool = False ani_confidence: float = 0.95 -# pfn_threshold: float = 1e-3 + # pfn_threshold: float = 1e-3 def __post_init__(self): "Initialize ScaledComparison using values from provided FracMinHashes" @@ -121,19 +131,23 @@ def total_unique_intersect_hashes(self): To get true bp estimates, we would need to add `(k-1)`. However, this complicates the iterative gather algorithm, so let's stick with hashes. """ - return len(self.intersect_mh) * self.cmp_scaled # + (ksize-1) #for bp estimation + return ( + len(self.intersect_mh) * self.cmp_scaled + ) # + (ksize-1) #for bp estimation @property def mh1_containment_in_mh2(self): return self.mh1_cmp.contained_by(self.mh2_cmp) - def estimate_ani_from_mh1_containment_in_mh2(self, containment = None): + def estimate_ani_from_mh1_containment_in_mh2(self, containment=None): # build result once - m1_cani = self.mh1_cmp.containment_ani(self.mh2_cmp, - containment=containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + m1_cani = self.mh1_cmp.containment_ani( + self.mh2_cmp, + containment=containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) # propagate params self.ani_from_mh1_containment_in_mh2 = m1_cani.ani if m1_cani.p_exceeds_threshold: @@ -148,28 +162,32 @@ def mh2_containment_in_mh1(self): return self.mh2_cmp.contained_by(self.mh1_cmp) def estimate_ani_from_mh2_containment_in_mh1(self, containment=None): - m2_cani = self.mh2_cmp.containment_ani(self.mh1_cmp, - containment=containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + m2_cani = self.mh2_cmp.containment_ani( + self.mh1_cmp, + containment=containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) self.ani_from_mh2_containment_in_mh1 = m2_cani.ani if m2_cani.p_exceeds_threshold: self.potential_false_negative = True if self.estimate_ani_ci: self.ani_from_mh2_containment_in_mh1_low = m2_cani.ani_low self.ani_from_mh2_containment_in_mh1_high = m2_cani.ani_high - + @property def max_containment(self): return self.mh1_cmp.max_containment(self.mh2_cmp) def estimate_max_containment_ani(self, max_containment=None): - mc_ani_info = self.mh1_cmp.max_containment_ani(self.mh2_cmp, - max_containment=max_containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + mc_ani_info = self.mh1_cmp.max_containment_ani( + self.mh2_cmp, + max_containment=max_containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) # propagate params self.max_containment_ani = mc_ani_info.ani if mc_ani_info.p_exceeds_threshold: @@ -187,23 +205,41 @@ def avg_containment_ani(self): "Returns single average_containment_ani value. Sets self.potential_false_negative internally." 
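Aside: the directional ANI values averaged by `avg_containment_ani` come from FracMinHash containment. As a rough guide (per the FracMinHash ANI work cited in the paper), the point estimate is ANI ≈ C^(1/k) for containment C at k-mer size k. This sketch shows only that back-of-the-envelope form, not sourmash's full estimator with confidence intervals and false-negative probability checks (the containment values are made up):

```python
def containment_to_ani(containment, ksize):
    """Point-estimate ANI from containment: ANI ~= C**(1/k)."""
    if containment == 0:
        return 0.0
    return containment ** (1.0 / ksize)

c1, c2 = 0.85, 0.78              # directional containments (made-up numbers)
ani1 = containment_to_ani(c1, 31)
ani2 = containment_to_ani(c2, 31)
print((ani1 + ani2) / 2)         # avg_containment_ani-style average
```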
self.estimate_ani_from_mh1_containment_in_mh2() self.estimate_ani_from_mh2_containment_in_mh1() - if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]): + if any( + [ + self.ani_from_mh1_containment_in_mh2 is None, + self.ani_from_mh2_containment_in_mh1 is None, + ] + ): return None else: - return (self.ani_from_mh1_containment_in_mh2 + self.ani_from_mh2_containment_in_mh1)/2 + return ( + self.ani_from_mh1_containment_in_mh2 + + self.ani_from_mh2_containment_in_mh1 + ) / 2 def estimate_all_containment_ani(self): "Estimate all containment ANI values." self.estimate_ani_from_mh1_containment_in_mh2() self.estimate_ani_from_mh2_containment_in_mh1() - if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]): -# self.estimate_max_containment_ani() + if any( + [ + self.ani_from_mh1_containment_in_mh2 is None, + self.ani_from_mh2_containment_in_mh1 is None, + ] + ): + # self.estimate_max_containment_ani() self.max_containment_ani = None else: - self.max_containment_ani = max([self.ani_from_mh1_containment_in_mh2, self.ani_from_mh2_containment_in_mh1]) + self.max_containment_ani = max( + [ + self.ani_from_mh1_containment_in_mh2, + self.ani_from_mh2_containment_in_mh1, + ] + ) def weighted_intersection(self, from_mh=None, from_abundD={}): - # map abundances to all intersection hashes. + # map abundances to all intersection hashes. abund_mh = self.intersect_mh.copy_and_clear() abund_mh.track_abundance = True # if from_mh is provided, it takes precedence over from_abund dict @@ -211,7 +247,7 @@ def weighted_intersection(self, from_mh=None, from_abundD={}): from_abundD = from_mh.hashes if from_abundD: # this sets any hash not present in abundD to 1. Is that desired? Or should we return 0? - abunds = {k: from_abundD.get(k, 1) for k in self.intersect_mh.hashes } + abunds = {k: from_abundD.get(k, 1) for k in self.intersect_mh.hashes} abund_mh.set_abundances(abunds) return abund_mh # if no abundances are passed in, return intersect_mh diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 8b149d7d1d..fdbc0e4cf6 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -51,8 +51,7 @@ from .index import LinearIndex from .picklist import SignaturePicklist, PickStyle from .manifest import CollectionManifest -from .save_load import (SaveSignaturesToLocation, load_file_as_index, - _load_database) +from .save_load import SaveSignaturesToLocation, load_file_as_index, _load_database DEFAULT_LOAD_K = 31 @@ -64,9 +63,9 @@ def check_scaled_bounds(arg): if f < 0: raise argparse.ArgumentTypeError("ERROR: scaled value must be positive") if f < 100: - notify('WARNING: scaled value should be >= 100. Continuing anyway.') + notify("WARNING: scaled value should be >= 100. Continuing anyway.") if f > 1e6: - notify('WARNING: scaled value should be <= 1e6. Continuing anyway.') + notify("WARNING: scaled value should be <= 1e6. Continuing anyway.") return f @@ -76,18 +75,18 @@ def check_num_bounds(arg): if f < 0: raise argparse.ArgumentTypeError("ERROR: num value must be positive") if f < 50: - notify('WARNING: num value should be >= 50. Continuing anyway.') + notify("WARNING: num value should be >= 50. Continuing anyway.") if f > 50000: - notify('WARNING: num value should be <= 50000. Continuing anyway.') + notify("WARNING: num value should be <= 50000. 
Continuing anyway.") return f def get_moltype(sig, require=False): mh = sig.minhash - if mh.moltype in ('DNA', 'dayhoff', 'hp', 'protein'): + if mh.moltype in ("DNA", "dayhoff", "hp", "protein"): moltype = mh.moltype else: - raise ValueError('unknown molecule type for sig {}'.format(sig)) + raise ValueError(f"unknown molecule type for sig {sig}") return moltype @@ -97,20 +96,22 @@ def calculate_moltype(args, default=None): n = 0 if args.dna: - moltype = 'DNA' + moltype = "DNA" n += 1 if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" n += 1 if args.hp: - moltype = 'hp' + moltype = "hp" n += 1 if args.protein: - moltype = 'protein' + moltype = "protein" n += 1 if n > 1: - error("cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff") + error( + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + ) sys.exit(-1) return moltype @@ -123,7 +124,9 @@ def load_picklist(args): try: picklist = SignaturePicklist.from_picklist_args(args.picklist) - notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") + notify( + f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'" + ) n_empty_val, dup_vals = picklist.load() except ValueError as exc: @@ -133,19 +136,27 @@ def load_picklist(args): notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: - notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") + notify( + f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file" + ) if dup_vals: - notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") + notify( + f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct" + ) return picklist def report_picklist(args, picklist): if picklist.pickstyle == PickStyle.INCLUDE: - notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") + notify( + f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values" + ) n_missing = len(picklist.pickset - picklist.found) elif picklist.pickstyle == PickStyle.EXCLUDE: - notify(f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values") + notify( + f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values" + ) n_missing = 0 if n_missing: notify(f"WARNING: {n_missing} missing picklist values.") @@ -157,19 +168,27 @@ def report_picklist(args, picklist): def load_include_exclude_db_patterns(args): if args.picklist and (args.include_db_pattern or args.exclude_db_pattern): - error("ERROR: --picklist and --include-db-pattern/--exclude cannot be used together.") + error( + "ERROR: --picklist and --include-db-pattern/--exclude cannot be used together." + ) sys.exit(-1) if args.include_db_pattern and args.exclude_db_pattern: - error("ERROR: --include-db-pattern and --exclude-db-pattern cannot be used together.") + error( + "ERROR: --include-db-pattern and --exclude-db-pattern cannot be used together." 
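Aside: `check_scaled_bounds`/`check_num_bounds` above act as argparse `type=` callables that hard-fail on impossible values but only warn on merely unusual ones. A generic version of that pattern (the function name, bounds, and messages here are illustrative, not sourmash's):

```python
import argparse

def bounded_int(lo_fail, lo_warn, hi_warn):
    """Build an argparse type= callable: error below lo_fail, warn outside [lo_warn, hi_warn]."""
    def check(arg):
        val = int(arg)
        if val < lo_fail:
            raise argparse.ArgumentTypeError(f"value must be >= {lo_fail}")
        if val < lo_warn or val > hi_warn:
            print(f"WARNING: value {val} outside recommended range [{lo_warn}, {hi_warn}]; continuing.")
        return val
    return check

parser = argparse.ArgumentParser()
parser.add_argument("--scaled", type=bounded_int(0, 100, 1_000_000), default=1000)
print(parser.parse_args(["--scaled", "50"]).scaled)   # warns, then prints 50
```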
+ ) sys.exit(-1) if args.include_db_pattern: pattern = re.compile(args.include_db_pattern, re.IGNORECASE) - search_pattern = lambda vals: any(pattern.search(val) for val in vals) + + def search_pattern(vals): + return any(pattern.search(val) for val in vals) elif args.exclude_db_pattern: pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE) - search_pattern = lambda vals: all(not pattern.search(val) for val in vals) + + def search_pattern(vals): + return all(not pattern.search(val) for val in vals) else: search_pattern = None @@ -187,8 +206,7 @@ def apply_picklist_and_pattern(db, picklist, pattern): error("--include-db-pattern/--exclude-db-pattern require a manifest.") sys.exit(-1) - manifest = manifest.filter_on_columns(pattern, - ["name", "filename", "md5"]) + manifest = manifest.filter_on_columns(pattern, ["name", "filename", "md5"]) pattern_picklist = manifest.to_picklist() db = db.select(picklist=pattern_picklist) @@ -202,8 +220,9 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): and indexed databases. """ try: - sl = load_file_as_signatures(filename, ksize=ksize, - select_moltype=select_moltype) + sl = load_file_as_signatures( + filename, ksize=ksize, select_moltype=select_moltype + ) sl = list(sl) except (OSError, ValueError): error(f"Cannot open query file '{filename}'") @@ -225,21 +244,21 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): sl = [found_sig] if len(sl) and ksize is None: - ksizes = set([ ss.minhash.ksize for ss in sl ]) + ksizes = set([ss.minhash.ksize for ss in sl]) if len(ksizes) == 1: ksize = ksizes.pop() - sl = [ ss for ss in sl if ss.minhash.ksize == ksize ] - notify(f'select query k={ksize} automatically.') + sl = [ss for ss in sl if ss.minhash.ksize == ksize] + notify(f"select query k={ksize} automatically.") elif DEFAULT_LOAD_K in ksizes: - sl = [ ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K ] - notify(f'selecting default query k={DEFAULT_LOAD_K}.') + sl = [ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K] + notify(f"selecting default query k={DEFAULT_LOAD_K}.") elif ksize: - notify(f'selecting specified query k={ksize}') + notify(f"selecting specified query k={ksize}") if len(sl) != 1: error(f"When loading query from '{filename}'", filename) - error(f'{len(sl)} signatures matching ksize and molecule type;') - error('need exactly one. Specify --ksize or --dna, --rna, or --protein.') + error(f"{len(sl)} signatures matching ksize and molecule type;") + error("need exactly one. Specify --ksize or --dna, --rna, or --protein.") sys.exit(-1) return sl[0] @@ -259,7 +278,7 @@ def traverse_find_sigs(filenames, yield_all_files=False): If 'yield_all_files' is True, this will return _all_ files (but not directories). """ - endings = ('.sig', '.sig.gz') + endings = (".sig", ".sig.gz") for filename in filenames: # check for files in filenames: if os.path.isfile(filename): @@ -275,9 +294,16 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def load_dbs_and_sigs(filenames, query, is_similarity_query, *, - cache_size=None, picklist=None, pattern=None, - fail_on_empty_database=False): +def load_dbs_and_sigs( + filenames, + query, + is_similarity_query, + *, + cache_size=None, + picklist=None, + pattern=None, + fail_on_empty_database=False, +): """ Load one or more Index objects to search - databases, etc. 
@@ -294,7 +320,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, total_signatures_loaded = 0 sum_signatures_after_select = 0 for filename in filenames: - notify(f"loading from '{filename}'...", end='\r') + notify(f"loading from '{filename}'...", end="\r") try: db = _load_database(filename, False, cache_size=cache_size) @@ -308,11 +334,13 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, # get compatible signatures - moltype/ksize/num/scaled try: - db = db.select(moltype=query_mh.moltype, - ksize=query_mh.ksize, - num=query_mh.num, - scaled=query_mh.scaled, - containment=containment) + db = db.select( + moltype=query_mh.moltype, + ksize=query_mh.ksize, + num=query_mh.num, + scaled=query_mh.scaled, + containment=containment, + ) except ValueError as exc: # incompatible collection specified! notify(f"ERROR: cannot use '{filename}' for this query.") @@ -337,9 +365,13 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, # display num loaded/num selected notify("--") - notify(f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations.") - notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.") - print('') + notify( + f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations." + ) + notify( + f"after selecting signatures compatible with search, {sum_signatures_after_select} remain." + ) + print("") return databases @@ -347,15 +379,17 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, def load_pathlist_from_file(filename): "Load a list-of-files text file." try: - with open(filename, 'rt') as fp: - file_list = [ x.rstrip('\r\n') for x in fp ] + with open(filename) as fp: + file_list = [x.rstrip("\r\n") for x in fp] file_list = set(file_list) if not file_list: raise ValueError("pathlist is empty") for checkfile in file_list: if not os.path.exists(checkfile): - raise ValueError(f"file '{checkfile}' inside the pathlist does not exist") - except IOError: + raise ValueError( + f"file '{checkfile}' inside the pathlist does not exist" + ) + except OSError: raise ValueError(f"pathlist file '{filename}' does not exist") except OSError: raise ValueError(f"cannot open file '{filename}'") @@ -385,7 +419,8 @@ class FileOutput: will properly handle no argument or '-' as sys.stdout. """ - def __init__(self, filename, mode='wt', *, newline=None, encoding='utf-8'): + + def __init__(self, filename, mode="wt", *, newline=None, encoding="utf-8"): self.filename = filename self.mode = mode self.fp = None @@ -393,14 +428,15 @@ def __init__(self, filename, mode='wt', *, newline=None, encoding='utf-8'): self.encoding = encoding def open(self): - if self.filename == '-' or self.filename is None: + if self.filename == "-" or self.filename is None: return sys.stdout - self.fp = open(self.filename, self.mode, newline=self.newline, - encoding=self.encoding) + self.fp = open( + self.filename, self.mode, newline=self.newline, encoding=self.encoding + ) return self.fp def close(self): - if self.fp is not None: # in case of stdout + if self.fp is not None: # in case of stdout self.fp.close() def __enter__(self): @@ -435,17 +471,18 @@ class FileOutputCSV(FileOutput): will properly handle no argument or '-' as sys.stdout. 
""" + def __init__(self, filename): self.filename = filename self.fp = None def open(self): - if self.filename == '-' or self.filename is None: + if self.filename == "-" or self.filename is None: return sys.stdout - if self.filename.endswith('.gz'): - self.fp = gzip.open(self.filename, 'wt', newline='') + if self.filename.endswith(".gz"): + self.fp = gzip.open(self.filename, "wt", newline="") else: - self.fp = open(self.filename, 'w', newline='') + self.fp = open(self.filename, "w", newline="") return self.fp @@ -457,38 +494,44 @@ class _DictReader_with_version: The version is stored as a 2-tuple in the 'version_info' attribute. """ - def __init__(self, textfp, *, delimiter=','): + + def __init__(self, textfp, *, delimiter=","): self.version_info = [] # is there a '#' in the raw buffer pos 0? ch = textfp.buffer.peek(1) try: - ch = ch.decode('utf-8') + ch = ch.decode("utf-8") except UnicodeDecodeError: raise csv.Error("unable to read CSV file") # yes - read a line from the text buffer => parse - if ch.startswith('#'): + if ch.startswith("#"): line = textfp.readline() - assert line.startswith('# '), line + assert line.startswith("# "), line # note, this can set version_info to lots of different things. # revisit later, I guess. CTB. - self.version_info = line[2:].strip().split(': ', 2) + self.version_info = line[2:].strip().split(": ", 2) # build a DictReader from the remaining stream self.reader = csv.DictReader(textfp, delimiter=delimiter) self.fieldnames = self.reader.fieldnames def __iter__(self): - for row in self.reader: - yield row + yield from self.reader @contextlib.contextmanager -def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, - zipfile_obj=None, delimiter=','): +def FileInputCSV( + filename, + *, + encoding="utf-8", + default_csv_name=None, + zipfile_obj=None, + delimiter=",", +): """A context manager for reading in CSV files in gzip, zip or text format. Assumes comma delimiter, and uses csv.DictReader. @@ -513,24 +556,20 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, try: zi = zipfile_obj.getinfo(default_csv_name) with zipfile_obj.open(zi) as fp: - textfp = TextIOWrapper(fp, - encoding=encoding, - newline="") + textfp = TextIOWrapper(fp, encoding=encoding, newline="") r = _DictReader_with_version(textfp, delimiter=delimiter) yield r except (zipfile.BadZipFile, KeyError): - pass # uh oh, we were given a zipfile_obj and it FAILED. + pass # uh oh, we were given a zipfile_obj and it FAILED. # no matter what, if given zipfile_obj don't try .gz or regular csv return else: try: - with zipfile.ZipFile(filename, 'r') as zip_fp: + with zipfile.ZipFile(filename, "r") as zip_fp: zi = zip_fp.getinfo(default_csv_name) with zip_fp.open(zi) as fp: - textfp = TextIOWrapper(fp, - encoding=encoding, - newline="") + textfp = TextIOWrapper(fp, encoding=encoding, newline="") r = _DictReader_with_version(textfp, delimiter=delimiter) yield r @@ -545,7 +584,7 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, # ok, not a zip file - try .gz: try: with gzip.open(filename, "rt", newline="", encoding=encoding) as fp: - fp.buffer.peek(1) # force exception if not a gzip file + fp.buffer.peek(1) # force exception if not a gzip file r = _DictReader_with_version(fp, delimiter=delimiter) yield r return @@ -553,7 +592,7 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, pass # neither zip nor gz; regular file! 
- with open(filename, 'rt', newline="", encoding=encoding) as fp: + with open(filename, newline="", encoding=encoding) as fp: r = _DictReader_with_version(fp, delimiter=delimiter) yield r @@ -569,6 +608,7 @@ class SignatureLoadingProgress: You can optionally notify of reading a file with `.notify(location)`. """ + def __init__(self, reporting_interval=10): self.n_sig = 0 self.interval = reporting_interval @@ -584,17 +624,19 @@ def short_notify(self, msg_template, *args, **kwargs): """ msg = msg_template.format(*args, **kwargs) - end = kwargs.get('end', '\n') + end = kwargs.get("end", "\n") w = self.screen_width if len(msg) > w: truncate_len = len(msg) - w + 3 - msg = '<<<' + msg[truncate_len:] + msg = "<<<" + msg[truncate_len:] notify(msg, end=end) def notify(self, location): - self.short_notify(f"...{self.n_sig} sigs so far. Now reading from file '{location}'", end='\r') + self.short_notify( + f"...{self.n_sig} sigs so far. Now reading from file '{location}'", end="\r" + ) def start_file(self, location, loader): n_this = 0 @@ -606,24 +648,35 @@ def start_file(self, location, loader): n_this += 1 n_total = n_before + n_this if n_this and n_total % self.interval == 0: - self.short_notify("...loading from '{}' / {} sigs total", - location, n_total, end='\r') + self.short_notify( + "...loading from '{}' / {} sigs total", + location, + n_total, + end="\r", + ) yield result except KeyboardInterrupt: # might as well nicely handle CTRL-C while we're at it! - notify('\n(CTRL-C received! quitting.)') + notify("\n(CTRL-C received! quitting.)") sys.exit(-1) finally: self.n_sig += n_this - self.short_notify(f"Loaded {n_this} sigs from '{location}'", - end='\r') + self.short_notify(f"Loaded {n_this} sigs from '{location}'", end="\r") -def load_many_signatures(locations, progress, *, yield_all_files=False, - ksize=None, moltype=None, picklist=None, force=False, - pattern=None): +def load_many_signatures( + locations, + progress, + *, + yield_all_files=False, + ksize=None, + moltype=None, + picklist=None, + force=False, + pattern=None, +): """ Load many signatures from multiple files, with progress indicators. @@ -648,11 +701,11 @@ def load_many_signatures(locations, progress, *, yield_all_files=False, loader = idx.signatures_with_location() # go! - n = 0 # count signatures loaded + n = 0 # count signatures loaded for sig, sigloc in progress.start_file(loc, loader): yield sig, sigloc n += 1 - notify(f"loaded {n} signatures from '{loc}'", end='\r') + notify(f"loaded {n} signatures from '{loc}'", end="\r") except ValueError as exc: # trap expected errors, and either power through or display + exit. if force: @@ -693,8 +746,9 @@ def get_manifest(idx, *, require=True, rebuild=False): # need to build one... 
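Aside: `SignatureLoadingProgress.start_file` above wraps a signature iterator in a generator that counts items, emits periodic status lines, and reports a final total even on early exit. The wrapping pattern reduced to its core (interval and messages are illustrative):

```python
def with_progress(items, interval=10):
    """Yield items unchanged, printing a status line every `interval` items."""
    n = 0
    try:
        for item in items:
            n += 1
            if n % interval == 0:
                print(f"...{n} items so far", end="\r")
            yield item
    finally:
        print(f"loaded {n} items total")

for sig in with_progress(range(25), interval=10):
    pass
```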
try: notify("Generating a manifest...") - m = CollectionManifest.create_manifest(idx._signatures_with_internal(), - include_signature=False) + m = CollectionManifest.create_manifest( + idx._signatures_with_internal(), include_signature=False + ) debug_literal("get_manifest: rebuilt manifest.") except NotImplementedError: if require: @@ -707,12 +761,17 @@ def get_manifest(idx, *, require=True, rebuild=False): return m -def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, - picklist=None, - yield_all_files=False, - progress=None, - pattern=None, - _use_manifest=True): +def load_file_as_signatures( + filename, + *, + select_moltype=None, + ksize=None, + picklist=None, + yield_all_files=False, + progress=None, + pattern=None, + _use_manifest=True, +): """Load 'filename' as a collection of signatures. Return an iterable. If 'filename' contains an SBT or LCA indexed database, or a regular diff --git a/src/sourmash/sqlite_utils.py b/src/sourmash/sqlite_utils.py index 2b7503a2d8..8efb754a23 100644 --- a/src/sourmash/sqlite_utils.py +++ b/src/sourmash/sqlite_utils.py @@ -31,13 +31,13 @@ def open_sqlite_db(filename): # check for the 'sourmash_internal' table. cursor = conn.cursor() try: - cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + cursor.execute("SELECT DISTINCT key, value FROM sourmash_internal") except (sqlite3.OperationalError, sqlite3.DatabaseError): debug_literal("open_sqlite_db: cannot read sourmash_internal.") # is this a taxonomy DB? try: - cursor.execute('SELECT * FROM taxonomy LIMIT 1') + cursor.execute("SELECT * FROM taxonomy LIMIT 1") except (sqlite3.OperationalError, sqlite3.DatabaseError): debug_literal("open_sqlite_db: cannot read 'taxonomy', either.") return None @@ -49,12 +49,14 @@ def add_sourmash_internal(cursor, use_type, version): """ Add use_type/version to sourmash_internal table. """ - cursor.execute(""" + cursor.execute( + """ CREATE TABLE IF NOT EXISTS sourmash_internal ( key TEXT UNIQUE, value TEXT ) - """) + """ + ) d = get_sourmash_internal(cursor) @@ -62,18 +64,23 @@ def add_sourmash_internal(cursor, use_type, version): if val is not None: # do version compatibility foo here? if version != val: - raise Exception(f"sqlite problem: for {use_type}, want version {version}, got version {val}") + raise Exception( + f"sqlite problem: for {use_type}, want version {version}, got version {val}" + ) else: - cursor.execute(""" + cursor.execute( + """ INSERT INTO sourmash_internal (key, value) VALUES (?, ?) - """, (use_type, version)) + """, + (use_type, version), + ) def get_sourmash_internal(cursor): """ Retrieve a key/value dictionary from sourmash_internal. """ - cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + cursor.execute("SELECT DISTINCT key, value FROM sourmash_internal") d = dict(cursor) return d diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index b6ff3d9dd2..8e490ae545 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -13,9 +13,14 @@ from sourmash.logging import set_quiet, error, notify, print_results from . import tax_utils -from .tax_utils import MultiLineageDB, RankLineageInfo, LINLineageInfo, AnnotateTaxResult - -usage=''' +from .tax_utils import ( + MultiLineageDB, + RankLineageInfo, + LINLineageInfo, + AnnotateTaxResult, +) + +usage = """ sourmash taxonomy [] - manipulate/work with taxonomy information. or sourmash tax [] @@ -30,31 +35,32 @@ ** Use '-h' to get subcommand-specific help, e.g. 
sourmash taxonomy metagenome -h -''' +""" # outfile utils _output_type_to_ext = { - 'csv_summary': '.summarized.csv', - 'classification': '.classifications.csv', - 'krona': '.krona.tsv', - 'lineage_summary': '.lineage_summary.tsv', - 'annotate': '.with-lineages.csv', - 'human': '.human.txt', - 'lineage_csv': '.lineage.csv', - 'kreport': ".kreport.txt", - 'lingroup': ".lingroup.tsv", - 'bioboxes': '.bioboxes.profile' - } - -def make_outfile(base, output_type, *, output_dir = ""): - limit_float_decimals=False + "csv_summary": ".summarized.csv", + "classification": ".classifications.csv", + "krona": ".krona.tsv", + "lineage_summary": ".lineage_summary.tsv", + "annotate": ".with-lineages.csv", + "human": ".human.txt", + "lineage_csv": ".lineage.csv", + "kreport": ".kreport.txt", + "lingroup": ".lingroup.tsv", + "bioboxes": ".bioboxes.profile", +} + + +def make_outfile(base, output_type, *, output_dir=""): + limit_float_decimals = False if base == "-": - limit_float_decimals=True + limit_float_decimals = True return base, limit_float_decimals ext = _output_type_to_ext[output_type] - fname = base+ext + fname = base + ext if output_dir: fname = os.path.join(output_dir, fname) notify(f"saving '{output_type}' output to '{fname}'.") @@ -70,50 +76,70 @@ def metagenome(args): # first, load taxonomic_assignments try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' + ) sys.exit(-1) if args.rank and args.rank not in available_ranks: - error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot summarize at this rank") + error( + f"ERROR: No taxonomic information provided for rank {args.rank}: cannot summarize at this rank" + ) sys.exit(-1) # next, collect and load gather results - gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file= args.from_file) + gather_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) try: - query_gather_results = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, - fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - lins=args.lins, - ) + query_gather_results = tax_utils.check_and_load_gather_csvs( + gather_csvs, + tax_assign, + force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not query_gather_results: - notify('No gather results loaded. Exiting.') + notify("No gather results loaded. 
Exiting.") sys.exit(-1) - single_query_output_formats = ['csv_summary', 'kreport'] + single_query_output_formats = ["csv_summary", "kreport"] desired_single_outputs = [] - if len(query_gather_results) > 1: # working with multiple queries - desired_single_outputs = [x for x in args.output_format if x in single_query_output_formats] + if len(query_gather_results) > 1: # working with multiple queries + desired_single_outputs = [ + x for x in args.output_format if x in single_query_output_formats + ] if desired_single_outputs: - notify(f"WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping {', '.join(desired_single_outputs)}") + notify( + f"WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping {', '.join(desired_single_outputs)}" + ) # remove single query outputs from output format - args.output_format = [x for x in args.output_format if x not in single_query_output_formats] - if not args.output_format: # or do we want to insert `human` here so we always report something? - error(f"ERROR: No output formats remaining.") + args.output_format = [ + x for x in args.output_format if x not in single_query_output_formats + ] + if ( + not args.output_format + ): # or do we want to insert `human` here so we always report something? + error("ERROR: No output formats remaining.") sys.exit(-1) # for each queryResult, actually summarize at rank, reporting any errors that occur. @@ -126,47 +152,66 @@ def metagenome(args): # write summarized output in human-readable format if "lineage_summary" in args.output_format: - lineage_outfile, limit_float = make_outfile(args.output_base, "lineage_summary", output_dir=args.output_dir) + lineage_outfile, limit_float = make_outfile( + args.output_base, "lineage_summary", output_dir=args.output_dir + ) ## aggregate by lineage by query - lineageD, query_names= tax_utils.aggregate_by_lineage_at_rank(query_gather_results=query_gather_results, - rank=args.rank, by_query=True) + lineageD, query_names = tax_utils.aggregate_by_lineage_at_rank( + query_gather_results=query_gather_results, rank=args.rank, by_query=True + ) with FileOutputCSV(lineage_outfile) as out_fp: - tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep='\t') + tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep="\t") # write summarized --> krona output tsv if "krona" in args.output_format: - krona_results, header = tax_utils.format_for_krona(query_gather_results, rank=args.rank) + krona_results, header = tax_utils.format_for_krona( + query_gather_results, rank=args.rank + ) - krona_outfile, limit_float = make_outfile(args.output_base, "krona", output_dir=args.output_dir) + krona_outfile, limit_float = make_outfile( + args.output_base, "krona", output_dir=args.output_dir + ) with FileOutputCSV(krona_outfile) as out_fp: tax_utils.write_krona(header, krona_results, out_fp) if "human" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "human", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "human", output_dir=args.output_dir + ) with FileOutput(summary_outfile) as out_fp: human_display_rank = args.rank or "species" if args.lins and not args.rank: - human_display_rank = query_gather_results[0].ranks[-1] # lowest rank + human_display_rank = query_gather_results[0].ranks[-1] # lowest rank - tax_utils.write_human_summary(query_gather_results, out_fp, human_display_rank) + 
tax_utils.write_human_summary( + query_gather_results, out_fp, human_display_rank + ) # write summarized output csv single_query_results = query_gather_results[0] if "csv_summary" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "csv_summary", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "csv_summary", output_dir=args.output_dir + ) with FileOutputCSV(summary_outfile) as out_fp: - tax_utils.write_summary(query_gather_results, out_fp, limit_float_decimals=limit_float) + tax_utils.write_summary( + query_gather_results, out_fp, limit_float_decimals=limit_float + ) # write summarized --> kreport output tsv if "kreport" in args.output_format: - kreport_outfile, limit_float = make_outfile(args.output_base, "kreport", output_dir=args.output_dir) + kreport_outfile, limit_float = make_outfile( + args.output_base, "kreport", output_dir=args.output_dir + ) with FileOutputCSV(kreport_outfile) as out_fp: header, kreport_results = single_query_results.make_kreport_results() - tax_utils.write_output(header, kreport_results, out_fp, sep="\t", write_header=False) + tax_utils.write_output( + header, kreport_results, out_fp, sep="\t", write_header=False + ) # write summarized --> LINgroup output tsv if "lingroup" in args.output_format: @@ -176,15 +221,23 @@ def metagenome(args): error(f"ERROR: {str(exc)}") sys.exit(-1) - lingroupfile, limit_float = make_outfile(args.output_base, "lingroup", output_dir=args.output_dir) + lingroupfile, limit_float = make_outfile( + args.output_base, "lingroup", output_dir=args.output_dir + ) with FileOutputCSV(lingroupfile) as out_fp: - header, lgreport_results = single_query_results.make_lingroup_results(LINgroupsD = lingroups) - tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) + header, lgreport_results = single_query_results.make_lingroup_results( + LINgroupsD=lingroups + ) + tax_utils.write_output( + header, lgreport_results, out_fp, sep="\t", write_header=True + ) # write cami bioboxes format if "bioboxes" in args.output_format: - bbfile, limit_float = make_outfile(args.output_base, "bioboxes", output_dir=args.output_dir) + bbfile, limit_float = make_outfile( + args.output_base, "bioboxes", output_dir=args.output_dir + ) with FileOutputCSV(bbfile) as out_fp: header_lines, bb_results = single_query_results.make_cami_bioboxes() @@ -199,14 +252,17 @@ def genome(args): # first, load taxonomic_assignments try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) available_ranks = tax_assign.available_ranks - lg_ranks=None - all_lgs=None + lg_ranks = None + all_lgs = None if args.lingroup: lingroups = tax_utils.read_lingroups(args.lingroup) lg_ranks, all_lgs = tax_utils.parse_lingroups(lingroups) @@ -216,38 +272,51 @@ def genome(args): sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' 
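Aside: `make_outfile` above maps each logical output type to a filename via the `_output_type_to_ext` table, special-casing `-` for stdout. The lookup pattern on its own (table entries abbreviated from the hunk; error handling omitted):

```python
import os

_EXT = {"csv_summary": ".summarized.csv", "krona": ".krona.tsv", "human": ".human.txt"}

def make_outfile(base, output_type, output_dir=""):
    if base == "-":
        return base   # write to stdout; no extension added
    fname = base + _EXT[output_type]
    return os.path.join(output_dir, fname) if output_dir else fname

print(make_outfile("myquery", "krona", output_dir="out"))   # out/myquery.krona.tsv
```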
+ ) sys.exit(-1) if args.rank and args.rank not in available_ranks: - error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot classify at this rank") + error( + f"ERROR: No taxonomic information provided for rank {args.rank}: cannot classify at this rank" + ) sys.exit(-1) # get gather_csvs from args - gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) + gather_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) try: - query_gather_results = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, - fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - lins=args.lins) + query_gather_results = tax_utils.check_and_load_gather_csvs( + gather_csvs, + tax_assign, + force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not query_gather_results: - notify('No results for classification. Exiting.') + notify("No results for classification. Exiting.") sys.exit(-1) # for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur. for queryResult in query_gather_results: try: - queryResult.build_classification_result(rank=args.rank, - ani_threshold=args.ani_threshold, - containment_threshold=args.containment_threshold, - lingroup_ranks=lg_ranks, lingroups=all_lgs) + queryResult.build_classification_result( + rank=args.rank, + ani_threshold=args.ani_threshold, + containment_threshold=args.containment_threshold, + lingroup_ranks=lg_ranks, + lingroups=all_lgs, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -255,42 +324,65 @@ def genome(args): # write outputs if "csv_summary" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "classification", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "classification", output_dir=args.output_dir + ) with FileOutputCSV(summary_outfile) as out_fp: - tax_utils.write_summary(query_gather_results, out_fp, limit_float_decimals=limit_float, classification=True) + tax_utils.write_summary( + query_gather_results, + out_fp, + limit_float_decimals=limit_float, + classification=True, + ) # write summarized output in human-readable format if "human" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "human", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "human", output_dir=args.output_dir + ) with FileOutput(summary_outfile) as out_fp: - tax_utils.write_human_summary(query_gather_results, out_fp, args.rank or "species", classification=True) + tax_utils.write_human_summary( + query_gather_results, + out_fp, + args.rank or "species", + classification=True, + ) # The following require a single rank: # note: interactive krona can handle mult ranks, do we want to enable? 
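Aside: conceptually, the `build_classification_result` call above classifies a genome at the most specific rank whose aggregated score clears a threshold. A toy version of that rank-walking idea (rank names, values, and threshold are made up; the real method also handles ANI thresholds and LINgroups):

```python
RANKS = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]

def classify(containment_by_rank, threshold=0.1):
    """Return the most specific rank whose containment meets the threshold."""
    for rank in reversed(RANKS):   # species first
        c = containment_by_rank.get(rank, 0.0)
        if c >= threshold:
            return rank, c
    return None, 0.0

obs = {"species": 0.04, "genus": 0.12, "family": 0.35}
print(classify(obs))   # ('genus', 0.12)
```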
if "krona" in args.output_format: - krona_results, header = tax_utils.format_for_krona(query_gather_results=query_gather_results, rank=args.rank, classification=True) - krona_outfile, limit_float = make_outfile(args.output_base, "krona", output_dir=args.output_dir) + krona_results, header = tax_utils.format_for_krona( + query_gather_results=query_gather_results, + rank=args.rank, + classification=True, + ) + krona_outfile, limit_float = make_outfile( + args.output_base, "krona", output_dir=args.output_dir + ) with FileOutputCSV(krona_outfile) as out_fp: tax_utils.write_krona(header, krona_results, out_fp) if "lineage_csv" in args.output_format: - lineage_outfile, _ = make_outfile(args.output_base, "lineage_csv", - output_dir=args.output_dir) + lineage_outfile, _ = make_outfile( + args.output_base, "lineage_csv", output_dir=args.output_dir + ) lineage_results = [] header = None for q_res in query_gather_results: if not header: ranks = list(q_res.ranks) - if 'strain' in ranks: # maintains prior functionality.. but we could keep strain now, i think? - ranks.remove('strain') + if ( + "strain" in ranks + ): # maintains prior functionality.. but we could keep strain now, i think? + ranks.remove("strain") header = ["ident", *ranks] - lineageD = q_res.classification_result.as_lineage_dict(q_res.query_info, ranks) + lineageD = q_res.classification_result.as_lineage_dict( + q_res.query_info, ranks + ) lineage_results.append(lineageD) with FileOutputCSV(lineage_outfile) as out_fp: tax_utils.write_output(header, lineage_results, out_fp) - - def annotate(args): @@ -304,21 +396,28 @@ def annotate(args): try: # first, load taxonomic_assignments - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' + ) sys.exit(-1) # get csv from args - input_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) + input_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) # handle each gather csv separately for n, in_csv in enumerate(input_csvs): @@ -332,22 +431,28 @@ def annotate(args): # look for the column to match with taxonomic identifier id_col = None - col_options = ['name', 'match_name', 'ident', 'accession'] + col_options = ["name", "match_name", "ident", "accession"] for colname in col_options: if colname in header: id_col = colname break if not id_col: - raise ValueError(f"Cannot find taxonomic identifier column in '{in_csv}'. Tried: {', '.join(col_options)}") + raise ValueError( + f"Cannot find taxonomic identifier column in '{in_csv}'. Tried: {', '.join(col_options)}" + ) - notify(f"Starting annotation on '{in_csv}'. Using ID column: '{id_col}'") + notify( + f"Starting annotation on '{in_csv}'. 
Using ID column: '{id_col}'" + ) # make output file for this input - out_base = os.path.basename(in_csv.rsplit('.csv')[0]) - this_outfile, _ = make_outfile(out_base, "annotate", output_dir=args.output_dir) + out_base = os.path.basename(in_csv.rsplit(".csv")[0]) + this_outfile, _ = make_outfile( + out_base, "annotate", output_dir=args.output_dir + ) - out_header = header + ['lineage'] + out_header = header + ["lineage"] with FileOutputCSV(this_outfile) as out_fp: w = csv.DictWriter(out_fp, out_header) @@ -357,25 +462,36 @@ def annotate(args): n_missed = 0 for n, row in enumerate(r): # find lineage and write annotated row - taxres = AnnotateTaxResult(raw=row, id_col=id_col, lins=args.lins, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions) - taxres.get_match_lineage(tax_assignments=tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) - - if taxres.missed_ident: # could not assign taxonomy - n_missed+=1 + taxres = AnnotateTaxResult( + raw=row, + id_col=id_col, + lins=args.lins, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + ) + taxres.get_match_lineage( + tax_assignments=tax_assign, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + ) + + if taxres.missed_ident: # could not assign taxonomy + n_missed += 1 w.writerow(taxres.row_with_lineages()) - rows_annotated = (n+1) - n_missed + rows_annotated = (n + 1) - n_missed if not rows_annotated: - raise ValueError(f"Could not annotate any rows from '{in_csv}'.") + raise ValueError( + f"Could not annotate any rows from '{in_csv}'." + ) else: - notify(f"Annotated {rows_annotated} of {n+1} total rows from '{in_csv}'.") + notify( + f"Annotated {rows_annotated} of {n+1} total rows from '{in_csv}'." + ) except ValueError as exc: if args.force: notify(str(exc)) - notify('--force is set. Attempting to continue to next file.') + notify("--force is set. Attempting to continue to next file.") else: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -385,10 +501,12 @@ def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." notify("loading taxonomies...") try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - force=args.force, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + force=args.force, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + ) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) @@ -409,14 +527,15 @@ def prepare(args): def grep(args): term = args.pattern - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - force=args.force) + tax_assign = MultiLineageDB.load(args.taxonomy_csv, force=args.force) silent = args.silent or args.count notify(f"searching {len(args.taxonomy_csv)} taxonomy files for '{term}'") if args.invert_match: - notify("-v/--invert-match specified; returning only lineages that do not match.") + notify( + "-v/--invert-match specified; returning only lineages that do not match." 
+    )

     if args.rank:
         notify(f"limiting matches to {args.rank} level")

@@ -436,6 +555,7 @@ def find_pattern(lineage, select_rank):
             return False

     if args.invert_match:
+
         def search_pattern(l, r):
             return not find_pattern(l, r)
     else:
@@ -452,22 +572,26 @@ def search_pattern(l, r):
     else:
         with FileOutputCSV(args.output) as fp:
             w = csv.writer(fp)
-            w.writerow(['ident'] + list(RankLineageInfo().taxlist[:-1]))
+            w.writerow(["ident"] + list(RankLineageInfo().taxlist[:-1]))

             for ident, lineage in sorted(match_ident):
-                w.writerow([ident] + [ x.name for x in lineage ])
+                w.writerow([ident] + [x.name for x in lineage])

-    notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'")
+    notify(
+        f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'"
+    )


 def summarize(args):
     "Summarize multiple taxonomy databases."
     notify("loading taxonomies...")
     try:
-        tax_assign = MultiLineageDB.load(args.taxonomy_files,
-                                         force=args.force,
-                                         keep_full_identifiers=args.keep_full_identifiers,
-                                         keep_identifier_versions=args.keep_identifier_versions,
-                                         lins=args.lins)
+        tax_assign = MultiLineageDB.load(
+            args.taxonomy_files,
+            force=args.force,
+            keep_full_identifiers=args.keep_full_identifiers,
+            keep_identifier_versions=args.keep_identifier_versions,
+            lins=args.lins,
+        )
     except ValueError as exc:
         error("ERROR while loading taxonomies!")
         error(str(exc))
@@ -481,7 +605,6 @@ def summarize(args):
     rank_counts = defaultdict(int)
     name_seen = set()
     for v in tax_assign.values():
-        sofar = []
         for vv in v:
             name = vv.name
             rank = vv.rank
@@ -507,7 +630,7 @@ def summarize(args):

         with FileOutputCSV(args.output_lineage_information) as fp:
             w = csv.writer(fp)
-            w.writerow(['rank', 'lineage_count', 'lineage'])
+            w.writerow(["rank", "lineage_count", "lineage"])

             # output in order of most common
             for lineage, count in lineage_counts.most_common():
@@ -526,9 +649,9 @@ def summarize(args):
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)
-    mainmethod = getattr(submod, 'main')
+    mainmethod = getattr(submod, "main")
     return mainmethod(args)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main(sys.argv)
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index df69f0ee6a..55b30a540e 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -16,24 +16,45 @@
 import sqlite3

-__all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
-           'load_gather_results', 'check_and_load_gather_csvs'
-           'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank'
-           'format_for_krona', 'write_output', 'write_bioboxes', 'parse_lingroups',
-           'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
-           'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo']
+__all__ = [
+    "get_ident",
+    "ascending_taxlist",
+    "collect_gather_csvs",
+    "load_gather_results",
+    "check_and_load_gather_csvs",
+    "report_missing_and_skipped_identities",
+    "aggregate_by_lineage_at_rank",
+    "format_for_krona",
+    "write_output",
+    "write_bioboxes",
+    "parse_lingroups",
+    "combine_sumgather_csvs_by_lineage",
+    "write_lineage_sample_frac",
+    "MultiLineageDB",
+    "RankLineageInfo",
+    "LINLineageInfo",
+]

 from sourmash.logging import notify
 from sourmash.sourmash_args import load_pathlist_from_file

-RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
-            "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"}
+RANKCODE = {
+    "superkingdom": 
"D", + "kingdom": "K", + "phylum": "P", + "class": "C", + "order": "O", + "family": "F", + "genus": "G", + "species": "S", + "unclassified": "U", +} + class LineagePair(NamedTuple): rank: str name: str = None taxid: int = None + @dataclass(frozen=True, order=True) class BaseLineageInfo: """ @@ -53,10 +74,13 @@ class BaseLineageInfo: Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. """ + # need to set compare=False for any mutable type to keep this class hashable - ranks: tuple() # require ranks - lineage: tuple = None # tuple of LineagePairs - lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names + ranks: () # require ranks + lineage: tuple = None # tuple of LineagePairs + lineage_str: str = field( + default=None, compare=False + ) # ';'- or ','-separated str of lineage names def __post_init__(self): "Initialize according to passed values" @@ -71,9 +95,11 @@ def __post_init__(self): self._init_empty() def __eq__(self, other): - if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False + if ( + other == () + ): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False return False - return all([self.ranks == other.ranks and self.lineage==other.lineage]) + return all([self.ranks == other.ranks and self.lineage == other.lineage]) @property def taxlist(self): @@ -108,7 +134,7 @@ def filled_lineage(self): if not self.filled_ranks: return () lowest_filled_rank_idx = self.rank_index(self.filled_ranks[-1]) - return self.lineage[:lowest_filled_rank_idx+1] + return self.lineage[: lowest_filled_rank_idx + 1] @property def lowest_lineage_name(self): @@ -125,7 +151,7 @@ def lowest_lineage_taxid(self): return self.filled_lineage[-1].taxid def _init_empty(self): - 'initialize empty genome lineage' + "initialize empty genome lineage" new_lineage = [] for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) @@ -134,7 +160,7 @@ def _init_empty(self): object.__setattr__(self, "filled_ranks", ()) def _init_from_lineage_tuples(self): - 'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary' + "initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary" new_lineage = [] # check this is a list or tuple of lineage tuples: for rank in self.ranks: @@ -143,12 +169,14 @@ def _init_from_lineage_tuples(self): # now add input tuples in correct spots. This corrects for order and allows empty values. if not isinstance(lin_tup, LineagePair): raise ValueError(f"{lin_tup} is not tax_utils LineagePair.") - if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) + if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) try: # find index for this rank rank_idx = self.rank_index(lin_tup.rank) except ValueError as e: - raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e + raise ValueError( + f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}" + ) from e new_lineage[rank_idx] = lin_tup # build list of filled ranks @@ -161,10 +189,13 @@ def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. 
""" - new_lineage = self.lineage_str.split(';') + new_lineage = self.lineage_str.split(";") if len(new_lineage) == 1: - new_lineage = self.lineage_str.split(',') - new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] + new_lineage = self.lineage_str.split(",") + new_lineage = [ + LineagePair(rank=rank, name=n) + for (rank, n) in zip_longest(self.ranks, new_lineage) + ] # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) @@ -180,7 +211,7 @@ def zip_lineage(self, truncate_empty=False): zipped = [a.name for a in self.lineage] # replace None with empty string ("") if None in zipped: - zipped = ['' if x is None else x for x in zipped] + zipped = ["" if x is None else x for x in zipped] return zipped @@ -193,11 +224,11 @@ def zip_taxid(self, truncate_empty=False): else: zipped = [a.taxid for a in self.lineage] # replace None with empty string (""); cast taxids to str - zipped = ['' if x is None else str(x) for x in zipped] + zipped = ["" if x is None else str(x) for x in zipped] return zipped - def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep = ';'): + def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep=";"): "Return lineage names as ';'-separated list" lin = sep.join(self.zip_lineage(truncate_empty=truncate_empty)) if null_as_unclassified and lin == "" or lin is None: @@ -205,12 +236,12 @@ def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep = else: return lin - def display_taxid(self, truncate_empty=True, sep = ";"): + def display_taxid(self, truncate_empty=True, sep=";"): "Return lineage taxids as ';'-separated list" return sep.join(self.zip_taxid(truncate_empty=truncate_empty)) def check_rank_availability(self, rank): - if rank in self.ranks: # rank is available + if rank in self.ranks: # rank is available return True raise ValueError(f"Desired Rank '{rank}' not available for this lineage.") @@ -234,12 +265,14 @@ def is_lineage_match(self, other, rank): """ self.check_rank_availability(rank) if not self.is_compatible(other): - raise ValueError("Cannot compare lineages from taxonomies with different ranks.") + raise ValueError( + "Cannot compare lineages from taxonomies with different ranks." + ) # always return false if rank is not filled in either of the two lineages if self.rank_is_filled(rank, other=other): rank_idx = self.rank_index(rank) - a_lin = self.lineage[:rank_idx+1] - b_lin = other.lineage[:rank_idx+1] + a_lin = self.lineage[: rank_idx + 1] + b_lin = other.lineage[: rank_idx + 1] if a_lin == b_lin: return 1 return 0 @@ -252,7 +285,7 @@ def pop_to_rank(self, rank): return replace(self) # if not, make filled_lineage at this rank + use to generate new LineageInfo new_lineage = self.lineage_at_rank(rank) - new = replace(self, lineage = new_lineage) + new = replace(self, lineage=new_lineage) # replace doesn't run the __post_init__ properly. reinitialize. 
new._init_from_lineage_tuples() return new @@ -265,7 +298,7 @@ def lineage_at_rank(self, rank): return self.filled_lineage # if not, return lineage tuples down to desired rank rank_idx = self.rank_index(rank) - return self.filled_lineage[:rank_idx+1] + return self.filled_lineage[: rank_idx + 1] def find_lca(self, other): """ @@ -298,8 +331,18 @@ class RankLineageInfo(BaseLineageInfo): Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. """ - ranks: tuple = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') - lineage_dict: dict = field(default=None, compare=False) # dict of rank: name + + ranks: tuple = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ) + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -321,21 +364,23 @@ def _init_from_lineage_dict(self): Use NCBI taxids if available as '|'-separated 'taxpath' column. Allows empty ranks/extra columns and reordering if necessary """ - null_names = set(['[Blank]', 'na', 'null', 'NA', '']) + null_names = set(["[Blank]", "na", "null", "NA", ""]) if not isinstance(self.lineage_dict, (dict)): raise ValueError(f"{self.lineage_dict} is not dictionary") new_lineage = [] - taxpath=[] + taxpath = [] # build empty lineage and taxpath for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) # check for NCBI taxpath information - taxpath_str = self.lineage_dict.get('taxpath', []) + taxpath_str = self.lineage_dict.get("taxpath", []) if taxpath_str: - taxpath = taxpath_str.split('|') + taxpath = taxpath_str.split("|") if len(taxpath) > len(self.ranks): - raise ValueError(f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})") + raise ValueError( + f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})" + ) # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys for key, val in self.lineage_dict.items(): @@ -344,7 +389,7 @@ def _init_from_lineage_dict(self): rank, name = key, val rank_idx = self.rank_index(rank) except ValueError: - continue # ignore dictionary entries (columns) that don't match a rank + continue # ignore dictionary entries (columns) that don't match a rank if taxpath: try: @@ -353,8 +398,8 @@ def _init_from_lineage_dict(self): taxid = None # filter null if name is not None and name.strip() in null_names: - name = None - new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + name = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] @@ -382,7 +427,10 @@ class LINLineageInfo(BaseLineageInfo): Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. 
""" - ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead + + ranks: tuple = field( + default=None, init=False, compare=False + ) # we will set this within class instead lineage: tuple = None # init with n_positions if you want to set a specific number of positions n_lin_positions: int = field(default=None, compare=False) @@ -403,9 +451,11 @@ def __eq__(self, other): total ranks, with full LINs, we only check for the filled_lineage to match and don't check that the number of lin_positions match. """ - if other == (): # if comparing to a null tuple, don't try to find its lineage before returning False + if ( + other == () + ): # if comparing to a null tuple, don't try to find its lineage before returning False return False - return self.filled_lineage==other.filled_lineage + return self.filled_lineage == other.filled_lineage def _init_ranks_from_n_lin_positions(self): new_ranks = [str(x) for x in range(0, self.n_lin_positions)] @@ -418,7 +468,7 @@ def _init_empty(self): # set n_lin_positions to 0 for completely empty LINLineageInfo object.__setattr__(self, "n_lin_positions", 0) self._init_ranks_from_n_lin_positions() - new_lineage=[] + new_lineage = [] for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) # set lineage and filled_ranks (because frozen, need to do it this way) @@ -430,12 +480,16 @@ def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. """ - new_lineage = self.lineage_str.split(';') + new_lineage = self.lineage_str.split(";") if len(new_lineage) == 1: - new_lineage = self.lineage_str.split(',') + new_lineage = self.lineage_str.split(",") if self.n_lin_positions is not None: if self.n_lin_positions < len(new_lineage): - raise(ValueError("Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'.")) + raise ( + ValueError( + "Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'." + ) + ) self._init_ranks_from_n_lin_positions() else: n_lin_positions = len(new_lineage) @@ -443,14 +497,17 @@ def _init_from_lineage_str(self): self._init_ranks_from_n_lin_positions() # build lineage and n_filled_pos, filled_ranks - new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] + new_lineage = [ + LineagePair(rank=rank, name=n) + for (rank, n) in zip_longest(self.ranks, new_lineage) + ] filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "n_filled_pos", len(filled_ranks)) - def _init_from_lineage_tuples(self): - 'initialize from tuple/list of LineagePairs, building ranks as you go' + def _init_from_lineage_tuples(self): + "initialize from tuple/list of LineagePairs, building ranks as you go" new_lineage = [] ranks = [] # check this is a list or tuple of lineage tuples: @@ -469,7 +526,6 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "n_filled_pos", len(filled_ranks)) - def is_compatible(self, other): """ Since we sometimes want to match LINprefixes with full LINs, @@ -486,7 +542,6 @@ def is_compatible(self, other): return False - @dataclass class LineageTree: """ @@ -494,6 +549,7 @@ class LineageTree: LineageInfo objects in 'assignments'. This tree can then be used to find lowest common ancestor agreements/confusion. 
""" + assignments: list = field(compare=False) def __post_init__(self): @@ -501,7 +557,7 @@ def __post_init__(self): self.add_lineages(self.assignments) def add_lineage(self, lineage): - if isinstance(lineage, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): + if isinstance(lineage, BaseLineageInfo | RankLineageInfo | LINLineageInfo): lineage = lineage.filled_lineage node = self.tree for lineage_tup in lineage: @@ -515,7 +571,9 @@ def add_lineages(self, lineages): if not lineages: raise ValueError("empty assignment passed to build_tree") if not isinstance(lineages, abc.Iterable): - raise ValueError("Must pass in an iterable containing LineagePair or LineageInfo objects.") + raise ValueError( + "Must pass in an iterable containing LineagePair or LineageInfo objects." + ) for lineageInf in lineages: self.add_lineage(lineageInf) @@ -529,13 +587,13 @@ def find_lca(self): node = self.tree lca = [] while 1: - if len(node) == 1: # descend to only child; track path + if len(node) == 1: # descend to only child; track path lineage_tup = next(iter(node.keys())) lca.append(lineage_tup) node = node[lineage_tup] - elif len(node) == 0: # at leaf; end + elif len(node) == 0: # at leaf; end return tuple(lca), 0 - else: # len(node) > 1 => confusion!! + else: # len(node) > 1 => confusion!! return tuple(lca), len(node) def ordered_paths(self, include_internal=False): @@ -550,7 +608,7 @@ def ordered_paths(self, include_internal=False): while stack: path, node = stack.pop() for key, val in node.items(): - if len(val) == 0: # leaf node + if len(val) == 0: # leaf node # if want internal paths, build up from leaf if include_internal: internal_path = path @@ -561,20 +619,19 @@ def ordered_paths(self, include_internal=False): internal_path = internal_path[:-1] # now add leaf path paths.append(path + (key,)) - else: # not leaf, add to stack + else: # not leaf, add to stack stack.append((path + (key,), val)) return paths -def get_ident(ident, *, - keep_full_identifiers=False, keep_identifier_versions=False): +def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): # split identifiers = split on whitespace # keep identifiers = don't split .[12] from assembly accessions "Hack and slash identifiers." 
if not keep_full_identifiers: - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not keep_identifier_versions: - ident = ident.split('.')[0] + ident = ident.split(".")[0] return ident @@ -582,12 +639,18 @@ def ascending_taxlist(include_strain=True): """ Provide an ordered list of taxonomic ranks: strain --> superkingdom """ - ascending_taxlist = ['species', 'genus', 'family', 'order', - 'class', 'phylum', 'superkingdom'] + ascending_taxlist = [ + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] if include_strain: - ascending_taxlist = ['strain'] + ascending_taxlist - for k in ascending_taxlist: - yield k + ascending_taxlist = ["strain"] + ascending_taxlist + yield from ascending_taxlist def collect_gather_csvs(cmdline_gather_input, *, from_file=None): @@ -600,7 +663,7 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): if gf not in gather_csvs: gather_csvs.append(gf) else: - notify(f'ignoring duplicated reference to file: {gf}') + notify(f"ignoring duplicated reference to file: {gf}") # ignore pathlist duplicates if from_file: more_files = load_pathlist_from_file(from_file) @@ -608,25 +671,29 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): if gf not in gather_csvs: gather_csvs.append(gf) else: - notify(f'ignoring duplicated reference to file: {gf}') + notify(f"ignoring duplicated reference to file: {gf}") return gather_csvs def read_lingroups(lingroup_csv): lingroupD = {} - n=None + n = None with sourmash_args.FileInputCSV(lingroup_csv) as r: header = r.fieldnames # check for empty file if not header: - raise ValueError(f"Cannot read lingroups from '{lingroup_csv}'. Is file empty?") + raise ValueError( + f"Cannot read lingroups from '{lingroup_csv}'. Is file empty?" + ) if "lin" not in header or "name" not in header: - raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'name', 'lin'.") + raise ValueError( + f"'{lingroup_csv}' must contain the following columns: 'name', 'lin'." + ) for n, row in enumerate(r): - lingroupD[row['lin']] = row['name'] + lingroupD[row["lin"]] = row["name"] if n is None: - raise ValueError(f'No lingroups loaded from {lingroup_csv}.') + raise ValueError(f"No lingroups loaded from {lingroup_csv}.") n_lg = len(lingroupD.keys()) notify(f"Read {n+1} lingroup rows and found {n_lg} distinct lingroup prefixes.") return lingroupD @@ -646,20 +713,30 @@ def parse_lingroups(lingroupD): return lg_ranks, all_lgs -def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, - skip_idents = None, fail_on_missing_taxonomy=False, - keep_full_identifiers=False, keep_identifier_versions=False, - lins=False): +def load_gather_results( + gather_csv, + tax_assignments, + *, + seen_queries=None, + force=False, + skip_idents=None, + fail_on_missing_taxonomy=False, + keep_full_identifiers=False, + keep_identifier_versions=False, + lins=False, +): "Load a single gather csv" if not seen_queries: - seen_queries=set() + seen_queries = set() header = [] gather_results = {} with sourmash_args.FileInputCSV(gather_csv) as r: header = r.fieldnames # check for empty file if not header: - raise ValueError(f"Cannot read gather results from '{gather_csv}'. Is file empty?") + raise ValueError( + f"Cannot read gather results from '{gather_csv}'. Is file empty?" 
+ ) this_querytaxres = None for n, row in enumerate(r): @@ -667,72 +744,101 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force try: gatherRow = GatherRow(**row) except TypeError as exc: - raise ValueError(f"'{gather_csv}' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4.") from exc + raise ValueError( + f"'{gather_csv}' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." + ) from exc # check if we've seen this query already in a different gather CSV if gatherRow.query_name in seen_queries: # do not allow loading of same query from a second CSV. - raise ValueError(f"Gather query {gatherRow.query_name} was found in more than one CSV. Cannot load from '{gather_csv}'.") - taxres = TaxResult(raw=gatherRow, keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions, - lins=lins) - taxres.get_match_lineage(tax_assignments=tax_assignments, skip_idents=skip_idents, - fail_on_missing_taxonomy=fail_on_missing_taxonomy) + raise ValueError( + f"Gather query {gatherRow.query_name} was found in more than one CSV. Cannot load from '{gather_csv}'." + ) + taxres = TaxResult( + raw=gatherRow, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + lins=lins, + ) + taxres.get_match_lineage( + tax_assignments=tax_assignments, + skip_idents=skip_idents, + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + ) # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins)) + this_querytaxres = gather_results.get( + gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins) + ) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres if not gather_results: - raise ValueError(f'No gather results loaded from {gather_csv}.') + raise ValueError(f"No gather results loaded from {gather_csv}.") else: notify(f"loaded {len(gather_results)} gather results from '{gather_csv}'.") - return gather_results, header #, gather_queries # can use the gather_results keys instead - - -def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False, - keep_full_identifiers=False,keep_identifier_versions=False, lins=False): - ''' + return ( + gather_results, + header, + ) # , gather_queries # can use the gather_results keys instead + + +def check_and_load_gather_csvs( + gather_csvs, + tax_assign, + *, + fail_on_missing_taxonomy=False, + force=False, + keep_full_identifiers=False, + keep_identifier_versions=False, + lins=False, +): + """ Load gather csvs, checking for empties and ids missing from taxonomic assignments. 
- ''' + """ if not isinstance(gather_csvs, list): gather_csvs = [gather_csvs] gather_results = {} - total_missed = 0 - all_ident_missed = set() header = [] n_ignored = 0 for n, gather_csv in enumerate(gather_csvs): these_results = {} try: - these_results, header = load_gather_results(gather_csv, tax_assign, - seen_queries=gather_results.keys(), - force=force, keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions = keep_identifier_versions, - fail_on_missing_taxonomy=fail_on_missing_taxonomy, - lins=lins) + these_results, header = load_gather_results( + gather_csv, + tax_assign, + seen_queries=gather_results.keys(), + force=force, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + lins=lins, + ) except ValueError as exc: if force: if "found in more than one CSV" in str(exc): - notify('Cannot force past duplicated gather query. Exiting.') + notify("Cannot force past duplicated gather query. Exiting.") raise if "Failing, as requested via --fail-on-missing-taxonomy" in str(exc): raise notify(str(exc)) - notify('--force is set. Attempting to continue to next set of gather results.') - n_ignored+=1 + notify( + "--force is set. Attempting to continue to next set of gather results." + ) + n_ignored += 1 continue else: - notify('Exiting.') + notify("Exiting.") raise # add these results to gather_results gather_results.update(these_results) - + # some reporting - num_gather_csvs_loaded = n+1 - n_ignored - notify(f'loaded results for {len(gather_results)} queries from {str(num_gather_csvs_loaded)} gather CSVs') + num_gather_csvs_loaded = n + 1 - n_ignored + notify( + f"loaded results for {len(gather_results)} queries from {str(num_gather_csvs_loaded)} gather CSVs" + ) # count and report missing and skipped idents report_missing_and_skipped_identities(gather_results) @@ -748,8 +854,8 @@ def report_missing_and_skipped_identities(gather_results): that are not present in taxonomic assignments, either by accident (missed) or request (skipped). """ - ident_missed= set() - ident_skipped= set() + ident_missed = set() + ident_skipped = set() total_n_missed = 0 total_n_skipped = 0 total_taxresults = 0 @@ -757,20 +863,24 @@ def report_missing_and_skipped_identities(gather_results): ident_missed.update(querytaxres.missed_idents) ident_skipped.update(querytaxres.skipped_idents) # totals are total rows in gather that were missed - do we want to report these at all? - total_n_missed+= querytaxres.n_missed - total_n_skipped+= querytaxres.n_skipped + total_n_missed += querytaxres.n_missed + total_n_skipped += querytaxres.n_skipped total_taxresults += len(querytaxres.raw_taxresults) if ident_missed: - notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed.') - notify(f'The following are missing from the taxonomy information: {", ".join(ident_missed)}') + notify( + f"of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed." + ) + notify( + f'The following are missing from the taxonomy information: {", ".join(ident_missed)}' + ) def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): - ''' - Aggregate list of summarized_lineage_results at rank, keeping + """ + Aggregate list of summarized_lineage_results at rank, keeping query names or not (but this aggregates across queries if multiple). 
- ''' + """ lineage_summary = defaultdict(float) if by_query: lineage_summary = defaultdict(dict) @@ -784,9 +894,11 @@ def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): raise ValueError(f"Error: rank '{rank}' not available for aggregation.") for res in queryResult.summarized_lineage_results[rank]: - lineage = res.lineage.display_lineage(null_as_unclassified = True) + lineage = res.lineage.display_lineage(null_as_unclassified=True) if by_query: - lineage_summary[lineage][query_name] = res.fraction # v5?: res.f_weighted_at_rank + lineage_summary[lineage][ + query_name + ] = res.fraction # v5?: res.f_weighted_at_rank else: lineage_summary[lineage] += res.fraction @@ -794,21 +906,23 @@ def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): if not by_query: n_queries = len(all_queries) for lin, fraction in lineage_summary.items(): - lineage_summary[lin] = fraction/n_queries + lineage_summary[lin] = fraction / n_queries return lineage_summary, all_queries def format_for_krona(query_gather_results, rank, *, classification=False): - ''' + """ Aggregate and format for krona output. Single query recommended, but we don't want query headers. - ''' + """ # make header header = query_gather_results[0].make_krona_header(min_rank=rank) krona_results = [] # do we want to block more than one query for summarization? if len(query_gather_results) > 1: - notify('WARNING: results from more than one query found. Krona summarization not recommended.\n' \ - 'Percentage assignment will be normalized by the number of queries to maintain range 0-100%.') + notify( + "WARNING: results from more than one query found. Krona summarization not recommended.\n" + "Percentage assignment will be normalized by the number of queries to maintain range 0-100%." + ) if classification: # for classification, just write the results @@ -820,13 +934,17 @@ def format_for_krona(query_gather_results, rank, *, classification=False): # but also misleading, since we're using best_only and there may # be more matches that are not included here, making % unclassified seem higher than it would # be with summarization. We previously excluded it -- is that the behavior we want to keep? 
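
# --- editorial aside, not part of the patch ----------------------------------
# A small worked example of the summarization branch further below, under
# assumed inputs: if two queries both contain the lineage
# "d__Bacteria;p__Bacteroidota" with fractions 0.6 and 0.4,
# aggregate_by_lineage_at_rank(..., by_query=False) sums them to 1.0 and then
# divides by n_queries = 2, so format_for_krona emits the row
# (0.5, "d__Bacteria", "p__Bacteroidota"). Rows are sorted by fraction,
# descending, and an empty lineage is written last as "unclassified".
# ------------------------------------------------------------------------------
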
- krona_results.extend([q_res.krona_classified])#, q_res.krona_unclassified]) + krona_results.extend( + [q_res.krona_classified] + ) # , q_res.krona_unclassified]) else: - lineage_summary, _ = aggregate_by_lineage_at_rank(query_gather_results, rank, by_query=False) + lineage_summary, _ = aggregate_by_lineage_at_rank( + query_gather_results, rank, by_query=False + ) # sort by fraction lin_items = list(lineage_summary.items()) - lin_items.sort(key = lambda x: -x[1]) + lin_items.sort(key=lambda x: -x[1]) # reformat lineage for krona_results printing unclassified_fraction = 0 @@ -836,20 +954,20 @@ def format_for_krona(query_gather_results, rank, *, classification=False): unclassified_fraction = fraction continue else: - lin_list = lin.split(';') + lin_list = lin.split(";") krona_results.append((fraction, *lin_list)) # handle unclassified if unclassified_fraction: - len_unclassified_lin = len(header) -1 - unclassifed_lin = ["unclassified"]*len_unclassified_lin + len_unclassified_lin = len(header) - 1 + unclassifed_lin = ["unclassified"] * len_unclassified_lin krona_results.append((unclassified_fraction, *unclassifed_lin)) return krona_results, header -def write_krona(header, krona_results, out_fp, *, sep='\t'): - 'write krona output' +def write_krona(header, krona_results, out_fp, *, sep="\t"): + "write krona output" # CTB: do we want to optionally allow restriction to a specific rank # & above? NTP: think we originally kept krona to a specific rank, but # that may have been how we were plotting, since krona plots can be @@ -861,7 +979,7 @@ def write_krona(header, krona_results, out_fp, *, sep='\t'): tsv_output.writerow(res) -def write_output(header, results, out_fp, *, sep=',', write_header=True): +def write_output(header, results, out_fp, *, sep=",", write_header=True): """ write pre-generated results list of rows, with each row being a dictionary @@ -873,25 +991,34 @@ def write_output(header, results, out_fp, *, sep=',', write_header=True): output.writerow(res) -def write_bioboxes(header_lines, results, out_fp, *, sep='\t'): +def write_bioboxes(header_lines, results, out_fp, *, sep="\t"): """ write pre-generated results list of rows, with each row being list. """ for inf in header_lines: - out_fp.write(inf + '\n') + out_fp.write(inf + "\n") for res in results: - res = sep.join(res) + '\n' + res = sep.join(res) + "\n" out_fp.write(res) -def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals=False, classification=False): - ''' +def write_summary( + query_gather_results, + csv_fp, + *, + sep=",", + limit_float_decimals=False, + classification=False, +): + """ Write taxonomy-summarized gather results for each rank. - ''' - w= None + """ + w = None for q_res in query_gather_results: - header, summary = q_res.make_full_summary(limit_float=limit_float_decimals, classification=classification) + header, summary = q_res.make_full_summary( + limit_float=limit_float_decimals, classification=classification + ) if w is None: w = csv.DictWriter(csv_fp, header, delimiter=sep) w.writeheader() @@ -899,29 +1026,41 @@ def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals w.writerow(res) -def write_human_summary(query_gather_results, out_fp, display_rank, classification=False): - ''' +def write_human_summary( + query_gather_results, out_fp, display_rank, classification=False +): + """ Write human-readable taxonomy-summarized gather results for a specific rank. 
- ''' + """ for queryResult in query_gather_results: - results = queryResult.make_human_summary(display_rank=display_rank, classification=classification) + results = queryResult.make_human_summary( + display_rank=display_rank, classification=classification + ) if classification: out_fp.write("sample name status proportion cANI lineage\n") out_fp.write("----------- ------ ---------- ---- -------\n") for rD in results: - out_fp.write("{query_name:<15s} {status} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) + out_fp.write( + "{query_name:<15s} {status} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format( + **rD + ) + ) else: out_fp.write("sample name proportion cANI lineage\n") out_fp.write("----------- ---------- ---- -------\n") for rD in results: - out_fp.write("{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) + out_fp.write( + "{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format( + **rD + ) + ) -def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): - ''' +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep="\t"): + """ takes in a lineage dictionary with sample counts (output of aggregate_by_lineage_at_rank) and produces a tab-separated file with fractions for each sample. @@ -935,7 +1074,7 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): lin_a 0.4 0.17 0.6 lin_b 0.0 0.0 0.1 lin_c 0.3 0.4 0.2 - ''' + """ header = ["lineage"] + sample_names w = csv.DictWriter(out_fp, header, delimiter=sep) @@ -943,14 +1082,14 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): blank_row = {query_name: 0 for query_name in sample_names} unclassified_row = None for lin, sampleinfo in sorted(lineage_dict.items()): - #add lineage and 0 placeholders - row = {'lineage': lin} + # add lineage and 0 placeholders + row = {"lineage": lin} row.update(blank_row) # add info for query_names that exist for this lineage row.update(sampleinfo) # if unclassified, save this row for the end - if lin== "unclassified": - row.update({'lineage': 'unclassified'}) + if lin == "unclassified": + row.update({"lineage": "unclassified"}) unclassified_row = row continue # write row @@ -961,6 +1100,7 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): class LineageDB(abc.Mapping): "Base LineageDB class built around an assignments dictionary." + def __init__(self, assign_d, avail_ranks): self.assignments = assign_d self.available_ranks = set(avail_ranks) @@ -982,8 +1122,16 @@ def __bool__(self): return bool(self.assignments) @classmethod - def load(cls, filename, *, delimiter=',', force=False, - keep_full_identifiers=False, keep_identifier_versions=True, lins=False): + def load( + cls, + filename, + *, + delimiter=",", + force=False, + keep_full_identifiers=False, + keep_identifier_versions=True, + lins=False, + ): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -993,9 +1141,11 @@ def load(cls, filename, *, delimiter=',', force=False, 'keep_identifier_versions=False' will remove trailing versions, e.g. 'IDENT.1' => 'IDENT'. 
""" - include_strain=False + include_strain = False if not keep_identifier_versions and keep_full_identifiers: - raise ValueError("keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True") + raise ValueError( + "keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True" + ) if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") @@ -1006,42 +1156,46 @@ def load(cls, filename, *, delimiter=',', force=False, with sourmash_args.FileInputCSV(filename) as r: header = r.fieldnames if not header: - raise ValueError(f'cannot read taxonomy assignments from {filename}') + raise ValueError(f"cannot read taxonomy assignments from {filename}") identifier = "ident" # check for ident/identifier, handle some common alternatives if "ident" not in header: # check for ident/identifier, handle some common alternatives - if 'identifiers' in header: - identifier = 'identifiers' + if "identifiers" in header: + identifier = "identifiers" header = ["ident" if "identifiers" == x else x for x in header] - elif 'accession' in header: - identifier = 'accession' + elif "accession" in header: + identifier = "accession" header = ["ident" if "accession" == x else x for x in header] - elif 'name' in header and 'lineage' in header: - return cls.load_from_gather_with_lineages(filename, - force=force, - lins=lins) + elif "name" in header and "lineage" in header: + return cls.load_from_gather_with_lineages( + filename, force=force, lins=lins + ) else: header_str = ",".join([repr(x) for x in header]) - raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') + raise ValueError( + f"No taxonomic identifiers found; headers are {header_str}" + ) if lins and "lin" not in header: - raise ValueError(f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}.") + raise ValueError( + f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." + ) if not lins: # is "strain" an available rank? if "strain" in header: - include_strain=True + include_strain = True # check that all ranks are in header ranks = list(RankLineageInfo().taxlist) if not include_strain: - ranks.remove('strain') + ranks.remove("strain") if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. # in future, we can define `ranks` differently if desired # return them from this function so we can check the `available` ranks - raise ValueError('Not all taxonomy ranks present') + raise ValueError("Not all taxonomy ranks present") assignments = {} num_rows = 0 @@ -1053,13 +1207,17 @@ def load(cls, filename, *, delimiter=',', force=False, for n, row in enumerate(r): num_rows += 1 if lins: - lineageInfo = LINLineageInfo(lineage_str=row['lin']) + lineageInfo = LINLineageInfo(lineage_str=row["lin"]) if n_pos is not None: if lineageInfo.n_lin_positions != n_pos: - raise ValueError(f"For taxonomic summarization, all LIN assignments must use the same number of LIN positions.") + raise ValueError( + "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." 
+ ) else: - n_pos = lineageInfo.n_lin_positions # set n_pos with first entry - ranks=lineageInfo.ranks + n_pos = ( + lineageInfo.n_lin_positions + ) # set n_pos with first entry + ranks = lineageInfo.ranks else: # read lineage from row dictionary lineageInfo = RankLineageInfo(lineage_dict=row) @@ -1067,9 +1225,11 @@ def load(cls, filename, *, delimiter=',', force=False, ident = row[identifier] # fold, spindle, and mutilate ident? - ident = get_ident(ident, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + ident = get_ident( + ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + ) # store lineage tuple lineage = lineageInfo.filled_lineage @@ -1078,27 +1238,27 @@ def load(cls, filename, *, delimiter=',', force=False, if ident in assignments: if assignments[ident] != lineage: if not force: - raise ValueError(f"multiple lineages for identifier {ident}") + raise ValueError( + f"multiple lineages for identifier {ident}" + ) else: assignments[ident] = lineage if not lins: - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 return LineageDB(assignments, ranks) - @classmethod def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. """ - include_strain = False if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") @@ -1109,12 +1269,14 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): with sourmash_args.FileInputCSV(filename) as r: header = r.fieldnames if not header: - raise ValueError(f'cannot read taxonomy assignments from {filename}') + raise ValueError(f"cannot read taxonomy assignments from {filename}") if "name" not in header or "lineage" not in header: - raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?") + raise ValueError( + "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + ) - ranks=None + ranks = None assignments = {} num_rows = 0 n_species = 0 @@ -1124,13 +1286,13 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): for n, row in enumerate(r): num_rows += 1 - name = row['name'] + name = row["name"] ident = get_ident(name) if lins: - lineageInfo = LINLineageInfo(lineage_str=row['lineage']) + lineageInfo = LINLineageInfo(lineage_str=row["lineage"]) else: - lineageInfo = RankLineageInfo(lineage_str= row['lineage']) + lineageInfo = RankLineageInfo(lineage_str=row["lineage"]) if ranks is None: ranks = lineageInfo.taxlist @@ -1142,14 +1304,16 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): # this should not happen with valid # sourmash tax annotate output, but check anyway. if not force: - raise ValueError(f"multiple lineages for identifier {ident}") + raise ValueError( + f"multiple lineages for identifier {ident}" + ) else: assignments[ident] = lineage if isinstance(lineageInfo, RankLineageInfo): - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 @@ -1160,10 +1324,19 @@ class LineageDB_Sqlite(abc.Mapping): """ A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. 
""" + # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. - columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', - 'genus', 'species', 'strain') - table_name = 'sourmash_taxonomy' + columns = ( + "superkingdom", + "phylum", + "order_", + "class", + "family", + "genus", + "species", + "strain", + ) + table_name = "sourmash_taxonomy" def __init__(self, conn, *, table_name=None): self.conn = conn @@ -1175,10 +1348,10 @@ def __init__(self, conn, *, table_name=None): # check that the right table is there. c = conn.cursor() try: - c.execute(f'SELECT * FROM {self.table_name} LIMIT 1') + c.execute(f"SELECT * FROM {self.table_name} LIMIT 1") except (sqlite3.DatabaseError, sqlite3.OperationalError): raise ValueError("not a taxonomy database") - + # check: can we do a 'select' on the right table? self.__len__() c = conn.cursor() @@ -1188,7 +1361,7 @@ def __init__(self, conn, *, table_name=None): for column, rank in zip(self.columns, RankLineageInfo().taxlist): query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) - cnt, = c.fetchone() + (cnt,) = c.fetchone() if cnt: ranks.add(rank) @@ -1209,16 +1382,16 @@ def load(cls, location): except sqlite3.OperationalError: info = {} - if 'SqliteLineage' in info: - if info['SqliteLineage'] != '1.0': + if "SqliteLineage" in info: + if info["SqliteLineage"] != "1.0": raise IndexNotSupported - table_name = 'sourmash_taxonomy' + table_name = "sourmash_taxonomy" else: # legacy support for old taxonomy DB, pre sourmash_internal. try: - c.execute('SELECT * FROM taxonomy LIMIT 1') - table_name = 'taxonomy' + c.execute("SELECT * FROM taxonomy LIMIT 1") + table_name = "taxonomy" except sqlite3.OperationalError: pass @@ -1229,13 +1402,16 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ LineagePair(n, r) for (n, r) in zip(RankLineageInfo().taxlist, row) ] + tup = [LineagePair(n, r) for (n, r) in zip(RankLineageInfo().taxlist, row)] return tuple(tup) def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,)) + c.execute( + f"SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?", + (ident,), + ) # retrieve names list... 
names = c.fetchone() @@ -1256,24 +1432,26 @@ def __bool__(self): def __len__(self): "Return number of rows" c = self.conn.cursor() - c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}') - nrows, = c.fetchone() + c.execute(f"SELECT COUNT(DISTINCT ident) FROM {self.table_name}") + (nrows,) = c.fetchone() return nrows def __iter__(self): "Return all identifiers" # create new cursor so as to allow other operations c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident FROM {self.table_name}') + c.execute(f"SELECT DISTINCT ident FROM {self.table_name}") - for ident, in c: + for (ident,) in c: yield ident def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}') + c.execute( + f"SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}" + ) for ident, *names in c: yield ident, self._make_tup(names) @@ -1347,10 +1525,10 @@ def __len__(self): def __bool__(self): "True if any contained database has content." - return any( bool(db) for db in self.lineage_dbs ) + return any(bool(db) for db in self.lineage_dbs) def save(self, filename_or_fp, file_format): - assert file_format in ('sql', 'csv') + assert file_format in ("sql", "csv") is_filename = False try: @@ -1358,18 +1536,20 @@ def save(self, filename_or_fp, file_format): except AttributeError: is_filename = True - if file_format == 'sql': + if file_format == "sql": if not is_filename: - raise ValueError("file format '{file_format}' requires a filename, not a file handle") + raise ValueError( + "file format '{file_format}' requires a filename, not a file handle" + ) self._save_sqlite(filename_or_fp) - elif file_format == 'csv': + elif file_format == "csv": # we need a file handle; open file. fp = filename_or_fp if is_filename: - if filename_or_fp.endswith('.gz'): - fp = gzip.open(filename_or_fp, 'wt', newline="") + if filename_or_fp.endswith(".gz"): + fp = gzip.open(filename_or_fp, "wt", newline="") else: - fp = open(filename_or_fp, 'w', newline="") + fp = open(filename_or_fp, "w", newline="") try: self._save_csv(fp) @@ -1389,13 +1569,14 @@ def _save_sqlite(self, filename, *, conn=None): cursor = db.cursor() try: - sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0') + sqlite_utils.add_sourmash_internal(cursor, "SqliteLineage", "1.0") except sqlite3.OperationalError: raise ValueError("attempt to write a readonly database") try: # CTB: could add 'IF NOT EXIST' here; would need tests, too. - cursor.execute(""" + cursor.execute( + """ CREATE TABLE sourmash_taxonomy ( ident TEXT NOT NULL, @@ -1408,49 +1589,54 @@ class TEXT, species TEXT, strain TEXT ) - """) - did_create = True + """ + ) except sqlite3.OperationalError: # already exists? 
raise ValueError(f"taxonomy table already exists in '{filename}'") # follow up and create index - cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);") + cursor.execute( + "CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);" + ) for ident, tax in self.items(): - x = [ident, *[ t.name for t in tax ]] + x = [ident, *[t.name for t in tax]] # fill the taxonomy tuple with empty values until it's the # right length for the SQL statement - while len(x) < 9: - x.append('') + x.append("") - cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + cursor.execute( + "INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + x, + ) db.commit() def _save_csv(self, fp): - headers = ['identifiers'] + list(RankLineageInfo().taxlist) + headers = ["identifiers"] + list(RankLineageInfo().taxlist) w = csv.DictWriter(fp, fieldnames=headers) w.writeheader() for n, (ident, tax) in enumerate(self.items()): row = {} - row['identifiers'] = ident + row["identifiers"] = ident # convert tax LineagePairs into dictionary for t in tax: row[t.rank] = t.name # add strain if needed - if 'strain' not in row: - row['strain'] = '' + if "strain" not in row: + row["strain"] = "" w.writerow(row) @classmethod def load(cls, locations, **kwargs): "Load one or more taxonomies from the given location(s)" - force = kwargs.get('force', False) + force = kwargs.get("force", False) if isinstance(locations, str): raise TypeError("'locations' should be a list, not a string") @@ -1475,7 +1661,9 @@ def load(cls, locations, **kwargs): except (ValueError, csv.Error) as exc: # for the last loader, just pass along ValueError... if not force: - raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}") + raise ValueError( + f"cannot read taxonomy assignments from '{location}': {str(exc)}" + ) # nothing loaded, goodbye! if not loaded and not force: @@ -1506,7 +1694,7 @@ class GatherRow: # essential columns query_name: str - name: str # match_name + name: str # match_name f_unique_weighted: float f_unique_to_query: float unique_intersect_bp: int @@ -1549,6 +1737,7 @@ class GatherRow: @dataclass class QueryInfo: "Class for storing query information" + query_name: str query_md5: str query_filename: str @@ -1564,7 +1753,9 @@ def __post_init__(self): self.ksize = int(self.ksize) self.scaled = int(self.scaled) self.query_n_hashes = int(self.query_n_hashes) if self.query_n_hashes else 0 - self.total_weighted_hashes = int(self.total_weighted_hashes) if self.total_weighted_hashes else 0 + self.total_weighted_hashes = ( + int(self.total_weighted_hashes) if self.total_weighted_hashes else 0 + ) @property def total_weighted_bp(self): @@ -1576,7 +1767,8 @@ class BaseTaxResult: """ Base class for sourmash taxonomic annotation. 
""" - raw: dict # csv row + + raw: dict # csv row keep_full_identifiers: bool = False keep_identifier_versions: bool = False match_ident: str = field(init=False) @@ -1594,29 +1786,32 @@ def get_ident(self, id_col=None): else: self.match_ident = self.raw.name if not self.keep_full_identifiers: - self.match_ident = self.match_ident.split(' ')[0] + self.match_ident = self.match_ident.split(" ")[0] else: - #overrides version bc can't keep full without keeping version + # overrides version bc can't keep full without keeping version self.keep_identifier_versions = True if not self.keep_identifier_versions: - self.match_ident = self.match_ident.split('.')[0] + self.match_ident = self.match_ident.split(".")[0] - - def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): + def get_match_lineage( + self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False + ): if skip_idents and self.match_ident in skip_idents: self.skipped_ident = True else: lin = tax_assignments.get(self.match_ident) if lin: if self.lins: - self.lineageInfo = LINLineageInfo(lineage = lin) + self.lineageInfo = LINLineageInfo(lineage=lin) else: - self.lineageInfo = RankLineageInfo(lineage = lin) + self.lineageInfo = RankLineageInfo(lineage=lin) else: - self.missed_ident=True + self.missed_ident = True self.match_lineage_attempted = True if self.missed_ident and fail_on_missing_taxonomy: - raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") + raise ValueError( + f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy" + ) @dataclass @@ -1624,7 +1819,8 @@ class AnnotateTaxResult(BaseTaxResult): """ Class to enable taxonomic annotation of any sourmash CSV. """ - id_col: str = 'name' + + id_col: str = "name" def __post_init__(self): if self.id_col not in self.raw.keys(): @@ -1667,22 +1863,24 @@ class TaxResult(BaseTaxResult): Use RankLineageInfo or LINLineageInfo to store lineage information. """ + raw: GatherRow query_name: str = field(init=False) query_info: QueryInfo = field(init=False) def __post_init__(self): self.get_ident() - self.query_name = self.raw.query_name # convenience - self.query_info = QueryInfo(query_name = self.raw.query_name, - query_md5=self.raw.query_md5, - query_filename = self.raw.query_filename, - query_bp = self.raw.query_bp, - query_n_hashes = self.raw.query_n_hashes, - total_weighted_hashes = self.raw.total_weighted_hashes, - ksize = self.raw.ksize, - scaled = self.raw.scaled - ) + self.query_name = self.raw.query_name # convenience + self.query_info = QueryInfo( + query_name=self.raw.query_name, + query_md5=self.raw.query_md5, + query_filename=self.raw.query_filename, + query_bp=self.raw.query_bp, + query_n_hashes=self.raw.query_n_hashes, + total_weighted_hashes=self.raw.total_weighted_hashes, + ksize=self.raw.ksize, + scaled=self.raw.scaled, + ) # cast and store the imp bits self.f_unique_to_query = float(self.raw.f_unique_to_query) self.f_unique_weighted = float(self.raw.f_unique_weighted) @@ -1701,6 +1899,7 @@ class SummarizedGatherResult: Methods included for returning formatted results for different outputs. """ + rank: str fraction: float lineage: RankLineageInfo @@ -1713,23 +1912,32 @@ def __post_init__(self): def check_values(self): if any([self.fraction > 1, self.f_weighted_at_rank > 1]): - raise ValueError(f"Summarized fraction is > 100% of the query! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") + raise ValueError( + "Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." + ) # is this true for weighted too, or is that set to 0 when --ignore-abundance is used? - if any([self.fraction <=0, self.f_weighted_at_rank <= 0]): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it. - raise ValueError(f"Summarized fraction is <=0% of the query! This should not occur.") + if any( + [self.fraction <= 0, self.f_weighted_at_rank <= 0] + ): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it. + raise ValueError( + "Summarized fraction is <=0% of the query! This should not occur." + ) def set_query_ani(self, query_info): - self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled, - n_unique_kmers=query_info.query_n_hashes, - sequence_len_bp=query_info.query_bp).ani - + self.query_ani_at_rank = containment_to_distance( + self.fraction, + query_info.ksize, + query_info.scaled, + n_unique_kmers=query_info.query_n_hashes, + sequence_len_bp=query_info.query_bp, + ).ani def as_lineage_dict(self, query_info, ranks): - ''' + """ Format to dict for writing lineage-CSV file suitable for use with sourmash tax ... -t. - ''' + """ lD = {} - lD['ident'] = query_info.query_name + lD["ident"] = query_info.query_name for rank in ranks: lin_name = self.lineage.name_at_rank(rank) if lin_name is None: @@ -1739,52 +1947,54 @@ def as_lineage_dict(self, query_info, ranks): def as_summary_dict(self, query_info, limit_float=False): sD = asdict(self) - sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) - sD['query_name'] = query_info.query_name - sD['query_md5'] = query_info.query_md5 - sD['query_filename'] = query_info.query_filename - sD['total_weighted_hashes'] = str(query_info.total_weighted_hashes) - sD['bp_match_at_rank'] = str(self.bp_match_at_rank) + sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True) + sD["query_name"] = query_info.query_name + sD["query_md5"] = query_info.query_md5 + sD["query_filename"] = query_info.query_filename + sD["total_weighted_hashes"] = str(query_info.total_weighted_hashes) + sD["bp_match_at_rank"] = str(self.bp_match_at_rank) if limit_float: - sD['fraction'] = f'{self.fraction:.3f}' - sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}' + sD["fraction"] = f"{self.fraction:.3f}" + sD["f_weighted_at_rank"] = f"{self.f_weighted_at_rank:.3f}" if self.query_ani_at_rank: - sD['query_ani_at_rank'] = f'{self.query_ani_at_rank:.3f}' + sD["query_ani_at_rank"] = f"{self.query_ani_at_rank:.3f}" else: - sD['fraction'] = str(self.fraction) - sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank) + sD["fraction"] = str(self.fraction) + sD["f_weighted_at_rank"] = str(self.f_weighted_at_rank) - return(sD) + return sD def as_human_friendly_dict(self, query_info): sD = self.as_summary_dict(query_info=query_info, limit_float=True) - sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%" + sD["f_weighted_at_rank"] = f"{self.f_weighted_at_rank*100:>4.1f}%" if self.query_ani_at_rank is not None: - sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%" + sD["query_ani_at_rank"] = f"{self.query_ani_at_rank*100:>3.1f}%" else: - sD['query_ani_at_rank'] = '- ' + sD["query_ani_at_rank"] = "- " return sD def as_kreport_dict(self, query_info): """ 
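The ANI value set in `set_query_ani` above comes from sourmash's containment-to-distance conversion. A usage sketch with made-up numbers (the import path below assumes sourmash 4.x, where this helper is provided in `sourmash.distance_utils`; the keyword arguments mirror the call above):

    from sourmash.distance_utils import containment_to_distance

    res = containment_to_distance(
        0.85,                       # containment fraction of the query
        31,                         # k-mer size
        1000,                       # scaled
        n_unique_kmers=5_000_000,
        sequence_len_bp=5_000_000,
    )
    print(res.ani)                  # point estimate of query ANI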
Produce kreport dict for named taxonomic groups. """ - lowest_assignment_rank = 'species' + lowest_assignment_rank = "species" sD = {} - sD['num_bp_assigned'] = str(0) - sD['ncbi_taxid'] = None + sD["num_bp_assigned"] = str(0) + sD["ncbi_taxid"] = None # total percent containment, weighted to include abundance info - sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + sD["percent_containment"] = f"{self.f_weighted_at_rank * 100:.2f}" + sD["num_bp_contained"] = str( + int(self.f_weighted_at_rank * query_info.total_weighted_bp) + ) if isinstance(self.lineage, LINLineageInfo): raise ValueError("Cannot produce 'kreport' with LIN taxonomy.") if self.lineage != RankLineageInfo(): this_rank = self.lineage.lowest_rank - sD['rank_code'] = RANKCODE[this_rank] - sD['sci_name'] = self.lineage.lowest_lineage_name + sD["rank_code"] = RANKCODE[this_rank] + sD["sci_name"] = self.lineage.lowest_lineage_name taxid = self.lineage.lowest_lineage_taxid if taxid: - sD['ncbi_taxid'] = str(taxid) + sD["ncbi_taxid"] = str(taxid) # the number of bp actually 'assigned' at this rank. Sourmash assigns everything # at genome level, but since kreport traditionally doesn't include 'strain' or genome, # it is reasonable to state that sourmash assigns at 'species' level for this. @@ -1792,19 +2002,21 @@ def as_kreport_dict(self, query_info): if this_rank == lowest_assignment_rank: sD["num_bp_assigned"] = sD["num_bp_contained"] else: - sD['sci_name'] = 'unclassified' - sD['rank_code'] = RANKCODE['unclassified'] + sD["sci_name"] = "unclassified" + sD["rank_code"] = RANKCODE["unclassified"] sD["num_bp_assigned"] = sD["num_bp_contained"] return sD - + def as_lingroup_dict(self, query_info, lg_name): """ Produce lingroup report dict for lingroups. """ sD = {} # total percent containment, weighted to include abundance info - sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + sD["percent_containment"] = f"{self.f_weighted_at_rank * 100:.2f}" + sD["num_bp_contained"] = str( + int(self.f_weighted_at_rank * query_info.total_weighted_bp) + ) sD["lin"] = self.lineage.display_lineage() sD["name"] = lg_name return sD @@ -1814,11 +2026,11 @@ def as_cami_bioboxes(self): Format taxonomy-summarized gather results as CAMI profiling Bioboxes format. - Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE + Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE """ if isinstance(self.lineage, LINLineageInfo): raise ValueError("Cannot produce 'bioboxes' with LIN taxonomy.") - if self.lineage != RankLineageInfo(): # if not unassigned + if self.lineage != RankLineageInfo(): # if not unassigned taxid = self.lineage.lowest_lineage_taxid if taxid: taxpath = self.lineage.display_taxid(sep="|") @@ -1826,7 +2038,9 @@ def as_cami_bioboxes(self): else: taxpath = None taxpathsn = self.lineage.display_lineage(sep="|") - percentage = f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + percentage = ( + f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + ) return [taxid, self.rank, taxpath, taxpathsn, percentage] return [] @@ -1842,39 +2056,47 @@ class ClassificationResult(SummarizedGatherResult): Methods included for returning formatted results for different outputs. 
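A quick worked example of the arithmetic in `as_kreport_dict` above, assuming `total_weighted_bp` is `total_weighted_hashes * scaled` (illustrative numbers only):

    f_weighted_at_rank = 0.25
    total_weighted_bp = 4_000_000

    percent_containment = f"{f_weighted_at_rank * 100:.2f}"
    # -> '25.00'
    num_bp_contained = str(int(f_weighted_at_rank * total_weighted_bp))
    # -> '1000000'
    # num_bp_assigned stays '0' unless this row is at species level
    # (or unclassified), per the logic above.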
""" + "Class for storing query classification information" status: str = field(init=False) def __post_init__(self): # check for out of bounds values, default "nomatch" if no match at all self.check_values() - self.status = 'nomatch' #None? + self.status = "nomatch" # None? def set_status(self, query_info, containment_threshold=None, ani_threshold=None): # if any matches, use 'below_threshold' as default; set 'match' if meets threshold if any([containment_threshold is not None, ani_threshold is not None]): - self.status="below_threshold" + self.status = "below_threshold" self.set_query_ani(query_info=query_info) - if ani_threshold is not None: # if provided, just use ani thresh, don't use containment threshold + if ( + ani_threshold is not None + ): # if provided, just use ani thresh, don't use containment threshold if self.query_ani_at_rank >= ani_threshold: - self.status = 'match' + self.status = "match" # v5?: switch to using self.f_weighted_at_rank here - elif containment_threshold is not None and self.fraction >= containment_threshold: - self.status = 'match' + elif ( + containment_threshold is not None and self.fraction >= containment_threshold + ): + self.status = "match" def build_krona_result(self, rank=None): krona_classified, krona_unclassified = None, None if rank is not None and rank == self.rank: - lin_as_list = self.lineage.display_lineage().split(';') - krona_classification = (self.fraction, *lin_as_list) # v5?: f_weighted_at_rank - krona_classified = (krona_classification) + lin_as_list = self.lineage.display_lineage().split(";") + krona_classification = ( + self.fraction, + *lin_as_list, + ) # v5?: f_weighted_at_rank + krona_classified = krona_classification # handle unclassified - do we want/need this? - unclassified_fraction= 1.0-self.fraction #v5?: f_weighted_at_rank + unclassified_fraction = 1.0 - self.fraction # v5?: f_weighted_at_rank len_unclassified_lin = len(lin_as_list) - unclassifed_lin = ["unclassified"]*(len_unclassified_lin) + unclassifed_lin = ["unclassified"] * (len_unclassified_lin) krona_unclassified = (unclassified_fraction, *unclassifed_lin) return krona_classified, krona_unclassified - + @dataclass class QueryTaxResult: @@ -1887,11 +2109,12 @@ class QueryTaxResult: Contains methods for formatting results for different outputs. 
""" - query_info: QueryInfo # initialize with QueryInfo dataclass + + query_info: QueryInfo # initialize with QueryInfo dataclass lins: bool = False def __post_init__(self): - self.query_name = self.query_info.query_name # for convenience + self.query_name = self.query_info.query_name # for convenience self._init_taxresult_vars() self._init_summarization_vars() self._init_classification_results() @@ -1899,7 +2122,7 @@ def __post_init__(self): def _init_taxresult_vars(self): self.ranks = [] self.raw_taxresults = [] - self.skipped_idents= set() + self.skipped_idents = set() self.missed_idents = set() self.n_missed = 0 self.n_skipped = 0 @@ -1913,13 +2136,13 @@ def _init_summarization_vars(self): self._init_summarization_results() def _init_summarization_results(self): - self.total_f_weighted = defaultdict(float) #0.0 - self.total_f_classified = defaultdict(float)#0.0 - self.total_bp_classified = defaultdict(int) #0 + self.total_f_weighted = defaultdict(float) # 0.0 + self.total_f_classified = defaultdict(float) # 0.0 + self.total_bp_classified = defaultdict(int) # 0 self.summarized_lineage_results = defaultdict(list) def _init_classification_results(self): - self.status = 'nomatch' + self.status = "nomatch" self.classified_ranks = [] self.classification_result = None self.krona_classified = None @@ -1940,76 +2163,114 @@ def add_taxresult(self, taxresult): # check that all query parameters match if self.is_compatible(taxresult=taxresult): if not taxresult.match_lineage_attempted: - raise ValueError("Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first.") + raise ValueError( + "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." + ) if not self.ranks: self.ranks = taxresult.lineageInfo.ranks if taxresult.skipped_ident: - self.n_skipped +=1 + self.n_skipped += 1 self.skipped_idents.add(taxresult.match_ident) elif taxresult.missed_ident: - self.n_missed +=1 + self.n_missed += 1 self.missed_idents.add(taxresult.match_ident) self.raw_taxresults.append(taxresult) else: - raise ValueError("Error: Cannot add TaxResult: query information does not match.") + raise ValueError( + "Error: Cannot add TaxResult: query information does not match." + ) def summarize_up_ranks(self, single_rank=None, force_resummarize=False): - if self.summarized_ranks: # has already been summarized + if self.summarized_ranks: # has already been summarized if force_resummarize: self._init_summarization_vars() else: - raise ValueError("Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. Use 'force_resummarize=True' to reset and resummarize") + raise ValueError( + "Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. 
Use 'force_resummarize=True' to reset and resummarize" + ) # set ranks levels to summarize self.summarized_ranks = self.ascending_ranks if single_rank: if single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})") + raise ValueError( + f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})" + ) self.summarized_ranks = [single_rank] - notify(f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} ") + notify( + f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} " + ) for taxres in self.raw_taxresults: lininfo = taxres.lineageInfo - if lininfo and lininfo.filled_lineage: # won't always have lineage to summarize (skipped idents, missed idents) + if ( + lininfo and lininfo.filled_lineage + ): # won't always have lineage to summarize (skipped idents, missed idents) # notify + track perfect matches if taxres.f_unique_to_query >= 1.0: if taxres.match_ident not in self.perfect_match: - notify(f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?") + notify( + f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?" + ) self.perfect_match.add(taxres.match_ident) # add this taxresult to summary for rank in self.summarized_ranks: - if rank in lininfo.filled_ranks: # only store if this rank is filled. + if ( + rank in lininfo.filled_ranks + ): # only store if this rank is filled. lin_at_rank = lininfo.pop_to_rank(rank) - self.sum_uniq_weighted[rank][lin_at_rank] += taxres.f_unique_weighted - self.sum_uniq_to_query[rank][lin_at_rank] += taxres.f_unique_to_query - self.sum_uniq_bp[rank][lin_at_rank] += taxres.unique_intersect_bp + self.sum_uniq_weighted[rank][ + lin_at_rank + ] += taxres.f_unique_weighted + self.sum_uniq_to_query[rank][ + lin_at_rank + ] += taxres.f_unique_to_query + self.sum_uniq_bp[rank][ + lin_at_rank + ] += taxres.unique_intersect_bp # reset ranks levels to the ones that were actually summarized + that we can access for summarized result - self.summarized_ranks = [x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys()] + self.summarized_ranks = [ + x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys() + ] if single_rank and single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' was not available for any matching lineages.") + raise ValueError( + f"Error: rank '{single_rank}' was not available for any matching lineages." + ) def build_summarized_result(self, single_rank=None, force_resummarize=False): # just reset if we've already built summarized result (avoid adding to existing)? Or write in an error/force option? 
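The loop in `summarize_up_ranks` above is an aggregate-by-(rank, lineage) pattern. A condensed standalone sketch of the same idea (the toy rows and the prefix truncation stand in for real TaxResults and `pop_to_rank`):

    from collections import defaultdict

    # toy gather rows: (lineage, f_unique_to_query)
    rows = [
        (("Bacteria", "Proteobacteria", "Escherichia"), 0.5),
        (("Bacteria", "Proteobacteria", "Salmonella"), 0.25),
    ]
    ranks = ["superkingdom", "phylum", "genus"]

    sum_uniq_to_query = defaultdict(lambda: defaultdict(float))
    for lineage, f_unique in rows:
        for i, rank in enumerate(ranks):
            lin_at_rank = lineage[: i + 1]   # analogue of pop_to_rank(rank)
            sum_uniq_to_query[rank][lin_at_rank] += f_unique

    assert sum_uniq_to_query["phylum"][("Bacteria", "Proteobacteria")] == 0.75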
self._init_summarization_results() # if taxresults haven't been summarized, do that first if not self.summarized_ranks or force_resummarize: - self.summarize_up_ranks(single_rank=single_rank, force_resummarize=force_resummarize) + self.summarize_up_ranks( + single_rank=single_rank, force_resummarize=force_resummarize + ) # catch potential error from running summarize_up_ranks separately and passing in different single_rank if single_rank and single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + raise ValueError( + f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}" + ) # rank loop is currently done in __main__ - for rank in self.summarized_ranks[::-1]: # reverse so that results are in descending order - sum_uniq_to_query = self.sum_uniq_to_query[rank] #should be lineage: value + for rank in self.summarized_ranks[ + ::-1 + ]: # reverse so that results are in descending order + sum_uniq_to_query = self.sum_uniq_to_query[rank] # should be lineage: value # first, sort sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) - sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + sorted_sum_uniq_to_query.sort(key=lambda x: -x[1]) for lineage, f_unique in sorted_sum_uniq_to_query: # does this ever happen? do we need it? - if f_unique == 0: #no annotated results for this query. do we need to handle this differently now? + if ( + f_unique == 0 + ): # no annotated results for this query. do we need to handle this differently now? continue f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage] bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage] - sres = SummarizedGatherResult(lineage=lineage, rank=rank, - f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, - bp_match_at_rank=bp_intersect_at_rank) + sres = SummarizedGatherResult( + lineage=lineage, + rank=rank, + f_weighted_at_rank=f_weighted_at_rank, + fraction=f_unique, + bp_match_at_rank=bp_intersect_at_rank, + ) sres.set_query_ani(query_info=self.query_info) self.summarized_lineage_results[rank].append(sres) @@ -2028,43 +2289,69 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): f_unique = 1.0 - self.total_f_classified[rank] if f_unique > 0: f_weighted_at_rank = 1.0 - self.total_f_weighted[rank] - bp_intersect_at_rank = self.query_info.query_bp - self.total_bp_classified[rank] - sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, - fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) + bp_intersect_at_rank = ( + self.query_info.query_bp - self.total_bp_classified[rank] + ) + sres = SummarizedGatherResult( + lineage=lineage, + rank=rank, + f_weighted_at_rank=f_weighted_at_rank, + fraction=f_unique, + bp_match_at_rank=bp_intersect_at_rank, + query_ani_at_rank=query_ani, + ) self.summarized_lineage_results[rank].append(sres) - def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False, lingroup_ranks=None, lingroups=None): + def build_classification_result( + self, + rank=None, + ani_threshold=None, + containment_threshold=0.1, + force_resummarize=False, + lingroup_ranks=None, + lingroups=None, + ): if containment_threshold is not None and not 0 <= containment_threshold <= 1: - raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).") + raise ValueError( + f"Containment threshold 
must be between 0 and 1 (input value: {containment_threshold})." + ) if ani_threshold is not None and not 0 <= ani_threshold <= 1: - raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).") - self._init_classification_results() # init some fields + raise ValueError( + f"ANI threshold must be between 0 and 1 (input value: {ani_threshold})." + ) + self._init_classification_results() # init some fields if not self.summarized_ranks or force_resummarize: - self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize) + self.summarize_up_ranks( + single_rank=rank, force_resummarize=force_resummarize + ) # catch potential error from running summarize_up_ranks separately and passing in different single_rank self.classified_ranks = self.summarized_ranks # if a rank is provided, we need to classify ONLY using that rank if rank: if rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + raise ValueError( + f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}" + ) else: self.classified_ranks = [rank] if lingroup_ranks: notify("Restricting classification to lingroups.") - self.classified_ranks = [x for x in self.classified_ranks if x in lingroup_ranks] + self.classified_ranks = [ + x for x in self.classified_ranks if x in lingroup_ranks + ] if not self.classified_ranks: - raise ValueError(f"Error: no ranks remain for classification.") + raise ValueError("Error: no ranks remain for classification.") # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold classif = None - for this_rank in self.classified_ranks: # ascending order or just single rank + for this_rank in self.classified_ranks: # ascending order or just single rank # reset for this rank - f_weighted=0.0 - f_unique_at_rank=0.0 - bp_intersect_at_rank=0 + f_weighted = 0.0 + f_unique_at_rank = 0.0 + bp_intersect_at_rank = 0 sum_uniq_to_query = self.sum_uniq_to_query[this_rank] # sort the results and grab best sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) - sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + sorted_sum_uniq_to_query.sort(key=lambda x: -x[1]) # select best-at-rank only this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0] # if in desired lineage groups, continue (or??) @@ -2074,19 +2361,33 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] f_weighted = self.sum_uniq_weighted[this_rank][this_lineage] - classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, - f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank) - - classif.set_status(self.query_info, containment_threshold=containment_threshold, ani_threshold=ani_threshold) + classif = ClassificationResult( + rank=this_rank, + fraction=f_unique_at_rank, + lineage=this_lineage, + f_weighted_at_rank=f_weighted, + bp_match_at_rank=bp_intersect_at_rank, + ) + + classif.set_status( + self.query_info, + containment_threshold=containment_threshold, + ani_threshold=ani_threshold, + ) # determine whether to move on to a higher tax rank (if avail) - if classif.status == 'match' or classif.status == "nomatch": # not sure we want/need the `nomatch` part... + if ( + classif.status == "match" or classif.status == "nomatch" + ): # not sure we want/need the `nomatch` part... 
break # store the final classification result self.classification_result = classif # could do this later, in __main__.py, for example - self.krona_classified, self.krona_unclassified = self.classification_result.build_krona_result(rank=rank) - self.krona_header = self.make_krona_header(min_rank = rank) + ( + self.krona_classified, + self.krona_unclassified, + ) = self.classification_result.build_krona_result(rank=rank) + self.krona_header = self.make_krona_header(min_rank=rank) def make_krona_header(self, min_rank): "make header for krona output" @@ -2096,7 +2397,7 @@ def make_krona_header(self, min_rank): raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.") else: rank_index = self.ranks.index(min_rank) - return ["fraction"] + list(self.ranks[:rank_index+1]) + return ["fraction"] + list(self.ranks[: rank_index + 1]) def check_classification(self): if not self.classification_result: @@ -2125,41 +2426,65 @@ def make_full_summary(self, classification=False, limit_float=False): rD = {} if classification: self.check_classification() - header= ["query_name", "status", "rank", "fraction", "lineage", - "query_md5", "query_filename", "f_weighted_at_rank", - "bp_match_at_rank", "query_ani_at_rank"] - rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float) - del rD['total_weighted_hashes'] + header = [ + "query_name", + "status", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + ] + rD = self.classification_result.as_summary_dict( + query_info=self.query_info, limit_float=limit_float + ) + del rD["total_weighted_hashes"] results.append(rD) else: self.check_summarization() - header= ["query_name", "rank", "fraction", "lineage", "query_md5", - "query_filename", "f_weighted_at_rank", "bp_match_at_rank", - "query_ani_at_rank", "total_weighted_hashes"] - - for rank in self.summarized_ranks[::-1]: #descending - unclassified=[] + header = [ + "query_name", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + "total_weighted_hashes", + ] + + for rank in self.summarized_ranks[::-1]: # descending + unclassified = [] rank_results = self.summarized_lineage_results[rank] - rank_results.sort(key=lambda res: -res.fraction) #v5?: f_weighted_at_rank) + rank_results.sort( + key=lambda res: -res.fraction + ) # v5?: f_weighted_at_rank) for res in rank_results: - rD = res.as_summary_dict(query_info=self.query_info, limit_float=limit_float) + rD = res.as_summary_dict( + query_info=self.query_info, limit_float=limit_float + ) # save unclassified for the end - if rD['lineage'] == "unclassified": + if rD["lineage"] == "unclassified": unclassified.append(rD) else: results.append(rD) - results +=unclassified + results += unclassified return header, results def make_kreport_results(self): - ''' + """ Format taxonomy-summarized gather results as kraken-style kreport. STANDARD KREPORT FORMAT: - `Percent Reads Contained in Taxon`: The cumulative percentage of reads for this taxon and all descendants. - `Number of Reads Contained in Taxon`: The cumulative number of reads for this taxon and all descendants. - `Number of Reads Assigned to Taxon`: The number of reads assigned directly to this taxon (not a cumulative count of all descendants). - - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. 
+ - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. - `NCBI Taxon ID`: Numerical ID from the NCBI taxonomy database. - `Scientific Name`: The scientific name of the taxon. @@ -2191,30 +2516,43 @@ def make_kreport_results(self): - `Percent Contained in Taxon`: Percent of all base pairs contained by this taxon (weighted by abundance if tracked) - `Estimated base pairs Contained in Taxon`: Number of base pairs contained by this taxon (weighted by abundance if tracked) - `Estimated base pairs Assigned to Taxon`: Number of base pairs at species-level (weighted by abundance if tracked) - - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. + - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. - `NCBI Taxon ID` will not be reported (blank entries). - `Scientific Name`: The scientific name of the taxon. In the future, we may wish to report the NCBI taxid when we can (NCBI taxonomy only). - ''' + """ self.check_summarization() - header = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"] + header = [ + "percent_containment", + "num_bp_contained", + "num_bp_assigned", + "rank_code", + "ncbi_taxid", + "sci_name", + ] if self.query_info.total_weighted_hashes == 0: - raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0") + raise ValueError( + "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0" + ) required_ranks = set(RANKCODE.keys()) - acceptable_ranks = list(self.ranks) + ['unclassified', 'kingdom'] + acceptable_ranks = list(self.ranks) + ["unclassified", "kingdom"] if not required_ranks.issubset(set(acceptable_ranks)): - raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}") + raise ValueError( + "ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}" + ) kreport_results = [] - unclassified_recorded=False + unclassified_recorded = False # want to order results descending by rank for rank in self.ranks: - if rank == 'strain': # no code for strain, can't include in this output afaik + if ( + rank == "strain" + ): # no code for strain, can't include in this output afaik continue rank_results = self.summarized_lineage_results[rank] for res in rank_results: kresD = res.as_kreport_dict(self.query_info) - if kresD['sci_name'] == "unclassified": + if kresD["sci_name"] == "unclassified": # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. # Here, we only need to report it once, since it will be the same fraction for all ranks if unclassified_recorded: @@ -2224,7 +2562,9 @@ def make_kreport_results(self): kreport_results.append(kresD) return header, kreport_results - def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_prefix: lg_name} + def make_lingroup_results( + self, LINgroupsD + ): # LingroupsD is dictionary {lg_prefix: lg_name} """ Report results for the specified LINGroups. Keep LCA paths in order as much as possible. 
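To illustrate the sourmash-flavored kreport rows produced via `as_kreport_dict` in `make_kreport_results` above: for a query with 10 Mbp of total weighted sequence, a phylum-level result dict would look like this (numbers invented; `rank_code` values follow the mapping in the docstring, and `ncbi_taxid` is blank for non-NCBI taxonomies as noted above):

    {'percent_containment': '13.08', 'num_bp_contained': '1308000',
     'num_bp_assigned': '0', 'rank_code': 'P',
     'ncbi_taxid': None, 'sci_name': 'Proteobacteria'}

The corresponding species-level row additionally sets `num_bp_assigned` equal to `num_bp_contained`, since sourmash "assigns" at species level as described in the docstring.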
@@ -2233,7 +2573,9 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref header = ["name", "lin", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: - raise ValueError("ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0") + raise ValueError( + "ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0" + ) # find the ranks we need to consider lg_ranks, all_lgs = parse_lingroups(LINgroupsD) @@ -2243,17 +2585,19 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref for rank in lg_ranks: rank_results = self.summarized_lineage_results[rank] for res in rank_results: - if res.lineage in all_lgs:# is this lineage in the list of LINgroups? - this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] + if res.lineage in all_lgs: # is this lineage in the list of LINgroups? + this_lingroup_name = LINgroupsD[ + res.lineage.display_lineage(truncate_empty=True) + ] lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name) lg_results[res.lineage] = lg_resD # We want to return in ~ depth order: descending each specific path in order # use LineageTree to find ordered paths lg_tree = LineageTree(all_lgs) - ordered_paths = lg_tree.ordered_paths(include_internal = True) + ordered_paths = lg_tree.ordered_paths(include_internal=True) # store results in order: - lingroup_results=[] + lingroup_results = [] for lg in ordered_paths: # get LINInfo object lg_LINInfo = LINLineageInfo(lineage=lg) @@ -2261,9 +2605,9 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref lg_res = lg_results.get(lg_LINInfo) if lg_res: lingroup_results.append(lg_res) - + return header, lingroup_results - + def make_cami_bioboxes(self): """ info: https://github.com/CAMI-challenge/contest_information/blob/master/file_formats/CAMI_TP_specification.mkd @@ -2271,17 +2615,17 @@ def make_cami_bioboxes(self): columns: TAXID - specifies a unique alphanumeric ID for a node in a reference tree such as the NCBI taxonomy RANK - superkingdom --> strain - TAXPATH - the path from the root of the reference taxonomy to the respective taxon + TAXPATH - the path from the root of the reference taxonomy to the respective taxon TAXPATHSN - scientific names of taxpath PERCENTAGE (0-100) - field specifies what percentage of the sample was assigned to the respective TAXID example: - + #CAMI Submission for Taxonomic Profiling @Version:0.9.1 @SampleID:SAMPLEID @Ranks:superkingdom|phylum|class|order|family|genus|species|strain - + @@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE 2 superkingdom 2 Bacteria 98.81211 2157 superkingdom 2157 Archaea 1.18789 @@ -2296,7 +2640,7 @@ def make_cami_bioboxes(self): 204455 order 2|1224|28211|204455 Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales 8.42263 2158 order 2157|28890|183925|2158 Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales 1.18789 """ - # build CAMI header info + # build CAMI header info header_title = "# Taxonomic Profiling Output" version_info = "@Version:0.10.0" program = "@__program__:sourmash" @@ -2308,9 +2652,9 @@ def make_cami_bioboxes(self): rank_info = f"@Ranks:{'|'.join(ranks)}" header_lines = [header_title, sample_info, version_info, rank_info, program] - colnames = ["@@TAXID","RANK","TAXPATH","TAXPATHSN","PERCENTAGE"] - header_lines.append('\t'.join(colnames)) - + colnames = ["@@TAXID", "RANK", "TAXPATH", "TAXPATHSN", "PERCENTAGE"] + header_lines.append("\t".join(colnames)) + # 
now build results in CAMI format bioboxes_results = [] # order results by rank (descending), then percentage @@ -2322,4 +2666,3 @@ def make_cami_bioboxes(self): bioboxes_results.append(bb_info) return header_lines, bioboxes_results - diff --git a/src/sourmash/utils.py b/src/sourmash/utils.py index 71afc20261..1910504e05 100644 --- a/src/sourmash/utils.py +++ b/src/sourmash/utils.py @@ -42,7 +42,7 @@ def decode_str(s): """Decodes a SourmashStr""" try: if s.len == 0: - return u"" + return "" return ffi.unpack(s.data, s.len).decode("utf-8", "replace") finally: if s.owned: diff --git a/tests/conftest.py b/tests/conftest.py index 3281133cd5..9cc035bb4a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,11 +5,14 @@ import pytest import matplotlib.pyplot as plt -plt.rcParams.update({'figure.max_open_warning': 0}) + +plt.rcParams.update({"figure.max_open_warning": 0}) from sourmash_tst_utils import TempDirectory, RunnerContext + sys.stdout = sys.stderr + @pytest.fixture def runtmp(): with TempDirectory() as location: @@ -66,16 +69,17 @@ def use_manifest(request): return request.param -@pytest.fixture(params=['json', 'sql']) +@pytest.fixture(params=["json", "sql"]) def lca_db_format(request): return request.param -@pytest.fixture(params=['csv', 'sql']) +@pytest.fixture(params=["csv", "sql"]) def manifest_db_format(request): return request.param -@pytest.fixture(params=['sig', 'sig.gz', 'zip', '.d/', '.sqldb']) + +@pytest.fixture(params=["sig", "sig.gz", "zip", ".d/", ".sqldb"]) def sig_save_extension(request): return request.param @@ -89,29 +93,37 @@ def pytest_collection_modifyitems(items, config): deselected_items = [] for item in items: - if fixture_name in getattr(item, 'fixturenames', ()): + if fixture_name in getattr(item, "fixturenames", ()): selected_items.append(item) else: deselected_items.append(item) config.hook.pytest_deselected(items=deselected_items) items[:] = selected_items + + # --- END - Only run tests using a particular fixture --- # + def pytest_addoption(parser): - parser.addoption("--usesfixture", - action="store", - default=None, - help="just run tests that use a particular fixture") + parser.addoption( + "--usesfixture", + action="store", + default=None, + help="just run tests that use a particular fixture", + ) + + parser.addoption( + "--run-hypothesis", action="store_true", help="run hypothesis tests" + ) - parser.addoption("--run-hypothesis", action="store_true", - help="run hypothesis tests") def pytest_runtest_setup(item): if item.config.getoption("--run-hypothesis"): if not any(mark for mark in item.iter_markers(name="hypothesis")): pytest.skip("--run-hypothesis option set, running only hypothesis tests") + settings.register_profile("ci", max_examples=1000) settings.register_profile("dev", max_examples=10) settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose) -settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'default')) +settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "default")) diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 7425934d2a..a2a35cb2e6 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -7,7 +7,6 @@ import collections import pprint import traceback -from io import open # pylint: disable=redefined-builtin from io import StringIO from pathlib import Path @@ -15,13 +14,21 @@ from importlib.metadata import entry_points -SIG_FILES = [os.path.join('demo', f) for f in ( - "SRR2060939_1.sig", "SRR2060939_2.sig", "SRR2241509_1.sig", - "SRR2255622_1.sig", 
"SRR453566_1.sig", "SRR453569_1.sig", "SRR453570_1.sig") +SIG_FILES = [ + os.path.join("demo", f) + for f in ( + "SRR2060939_1.sig", + "SRR2060939_2.sig", + "SRR2241509_1.sig", + "SRR2255622_1.sig", + "SRR453566_1.sig", + "SRR453569_1.sig", + "SRR453570_1.sig", + ) ] -def scriptpath(scriptname='sourmash'): +def scriptpath(scriptname="sourmash"): """Return the path to the scripts, in both dev and install situations.""" # note - it doesn't matter what the scriptname is here, as long as # it's some script present in this version of sourmash. @@ -34,7 +41,7 @@ def scriptpath(scriptname='sourmash'): if os.path.exists(os.path.join(path, scriptname)): return path - for path in os.environ['PATH'].split(':'): + for path in os.environ["PATH"].split(":"): if os.path.exists(os.path.join(path, scriptname)): return path @@ -42,7 +49,7 @@ def scriptpath(scriptname='sourmash'): def _runscript(scriptname): """Find & run a script with exec (i.e. not via os.system or subprocess).""" namespace = {"__name__": "__main__"} - namespace['sys'] = globals()['sys'] + namespace["sys"] = globals()["sys"] try: (script,) = entry_points(name=scriptname, group="console_scripts") @@ -57,15 +64,15 @@ def _runscript(scriptname): if os.path.isfile(scriptfile): if os.path.isfile(scriptfile): exec( # pylint: disable=exec-used - compile(Path(scriptfile).read_text(), scriptfile, 'exec'), - namespace) + compile(Path(scriptfile).read_text(), scriptfile, "exec"), namespace + ) return 0 return -1 -ScriptResults = collections.namedtuple('ScriptResults', - ['status', 'out', 'err']) +ScriptResults = collections.namedtuple("ScriptResults", ["status", "out", "err"]) + def runscript(scriptname, args, **kwargs): """Run a Python script using exec(). @@ -81,8 +88,8 @@ def runscript(scriptname, args, **kwargs): sysargs.extend(args) cwd = os.getcwd() - in_directory = kwargs.get('in_directory', cwd) - fail_ok = kwargs.get('fail_ok', False) + in_directory = kwargs.get("in_directory", cwd) + fail_ok = kwargs.get("fail_ok", False) try: status = -1 @@ -90,8 +97,8 @@ def runscript(scriptname, args, **kwargs): sys.argv = sysargs oldin = None - if 'stdin_data' in kwargs: - oldin, sys.stdin = sys.stdin, StringIO(kwargs['stdin_data']) + if "stdin_data" in kwargs: + oldin, sys.stdin = sys.stdin, StringIO(kwargs["stdin_data"]) oldout, olderr = sys.stdout, sys.stderr sys.stdout = StringIO() @@ -101,13 +108,13 @@ def runscript(scriptname, args, **kwargs): os.chdir(in_directory) try: - print('running:', scriptname, 'in:', in_directory, file=oldout) - print('arguments', sysargs, file=oldout) + print("running:", scriptname, "in:", in_directory, file=oldout) + print("arguments", sysargs, file=oldout) status = _runscript(scriptname) except SystemExit as err: status = err.code - if status == None: + if status is None: status = 0 except: # pylint: disable=bare-except traceback.print_exc(file=sys.stderr) @@ -133,14 +140,13 @@ def runscript(scriptname, args, **kwargs): def get_test_data(filename): filepath = resources.files("sourmash") / "tests" / "test-data" / filename if not filepath.exists() or not os.path.isfile(filepath): - filepath = os.path.join(os.path.dirname(__file__), 'test-data', - filename) + filepath = os.path.join(os.path.dirname(__file__), "test-data", filename) return filepath -class TempDirectory(object): +class TempDirectory: def __init__(self): - self.tempdir = tempfile.mkdtemp(prefix='sourmashtest_') + self.tempdir = tempfile.mkdtemp(prefix="sourmashtest_") def __enter__(self): return self.tempdir @@ -158,10 +164,10 @@ def __exit__(self, exc_type, 
exc_value, traceback): class SourmashCommandFailed(Exception): def __init__(self, msg): Exception.__init__(self, msg) - self.message = msg + self.message = msg -class RunnerContext(object): +class RunnerContext: """ I am a RunnerContext object from sourmash_tst_utils. @@ -171,6 +177,7 @@ class RunnerContext(object): You can use the 'output' method to build filenames in my temp directory. """ + def __init__(self, location): self.location = location self.last_command = None @@ -178,25 +185,26 @@ def __init__(self, location): def run_sourmash(self, *args, **kwargs): "Run the sourmash script with the given arguments." - kwargs['fail_ok'] = True - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + kwargs["fail_ok"] = True + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location - cmdlist = ['sourmash'] - cmdlist.extend(( str(x) for x in args)) + cmdlist = ["sourmash"] + cmdlist.extend(str(x) for x in args) self.last_command = " ".join(cmdlist) - self.last_result = runscript('sourmash', args, **kwargs) + self.last_result = runscript("sourmash", args, **kwargs) if self.last_result.status: raise SourmashCommandFailed(self.last_result.err) return self.last_result + sourmash = run_sourmash def run(self, scriptname, *args, **kwargs): "Run a script with the given arguments." - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location self.last_command = " ".join(args) self.last_result = runscript(scriptname, args, **kwargs) return self.last_result @@ -207,18 +215,18 @@ def output(self, path): def __str__(self): s = "" if self.last_command: - s += "Last command run:\n{}\n".format(repr(self.last_command)) + s += f"Last command run:\n{repr(self.last_command)}\n" if self.last_result: s += "\nLAST RESULT:\n" - s += "- exit code: {}\n\n".format(self.last_result.status) + s += f"- exit code: {self.last_result.status}\n\n" if self.last_result.out: - s += "- stdout:\n---\n{}---\n".format(self.last_result.out) + s += f"- stdout:\n---\n{self.last_result.out}---\n" else: - s += '(no stdout)\n\n' + s += "(no stdout)\n\n" if self.last_result.err: - s += "- stderr:\n---\n{}---\n".format(self.last_result.err) + s += f"- stderr:\n---\n{self.last_result.err}---\n" else: - s += '(no stderr)\n' + s += "(no stderr)\n" return s diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py index 7f1b421dbd..2778358caa 100644 --- a/tests/test__minhash_hypothesis.py +++ b/tests/test__minhash_hypothesis.py @@ -7,9 +7,11 @@ from sourmash.minhash import _get_max_hash_for_scaled -@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.integers(min_value=10, max_value=1000)) +@given( + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.integers(min_value=10, max_value=1000), +) @example([1, 2], [3, 4], 2) def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size): a = MinHash(sketch_size, 10, track_abundance=True) @@ -25,9 +27,11 @@ def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size): assert oracle[k] == v -@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - 
st.integers(min_value=1000, max_value=10000)) +@given( + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.integers(min_value=1000, max_value=10000), +) @example([0], [0], 1000) def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled): a = MinHash(0, 10, track_abundance=True, scaled=scaled) diff --git a/tests/test_api.py b/tests/test_api.py index ccaf321df6..a06a610c83 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -9,10 +9,10 @@ def test_sourmash_signature_api(c): e = sourmash.MinHash(n=1, ksize=20) sig = sourmash.SourmashSignature(e) - with open(c.output('xxx.sig'), 'wt') as fp: + with open(c.output("xxx.sig"), "w") as fp: sourmash.save_signatures([sig], fp) - sig_x1 = sourmash.load_one_signature(c.output('xxx.sig')) - sig_x2 = list(sourmash.load_file_as_signatures(c.output('xxx.sig')))[0] + sig_x1 = sourmash.load_one_signature(c.output("xxx.sig")) + sig_x2 = list(sourmash.load_file_as_signatures(c.output("xxx.sig")))[0] assert sig_x1 == sig assert sig_x2 == sig @@ -21,12 +21,12 @@ def test_sourmash_signature_api(c): @utils.in_tempdir def test_load_index_0_no_file(c): with pytest.raises(ValueError) as exc: - idx = sourmash.load_file_as_index(c.output('does-not-exist')) - assert 'Error while reading signatures from ' in str(exc.value) + sourmash.load_file_as_index(c.output("does-not-exist")) + assert "Error while reading signatures from " in str(exc.value) def test_load_index_1(): - testfile = utils.get_test_data('prot/protein.sbt.zip') + testfile = utils.get_test_data("prot/protein.sbt.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -34,7 +34,7 @@ def test_load_index_1(): def test_load_index_2(): - testfile = utils.get_test_data('prot/protein.lca.json.gz') + testfile = utils.get_test_data("prot/protein.lca.json.gz") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -42,7 +42,7 @@ def test_load_index_2(): def test_load_index_3(): - testfile = utils.get_test_data('prot/protein/') + testfile = utils.get_test_data("prot/protein/") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -50,7 +50,7 @@ def test_load_index_3(): def test_load_index_4(): - testfile = utils.get_test_data('prot/all.zip') + testfile = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -58,7 +58,7 @@ def test_load_index_4(): def test_load_index_4_b(): - testfile = utils.get_test_data('prot/protein.zip') + testfile = utils.get_test_data("prot/protein.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -67,19 +67,24 @@ def test_load_index_4_b(): def test_load_fasta_as_signature(): # try loading a fasta file - should fail with informative exception - testfile = utils.get_test_data('short.fa') + testfile = utils.get_test_data("short.fa") with pytest.raises(Exception) as exc: - idx = sourmash.load_file_as_index(testfile) + sourmash.load_file_as_index(testfile) print(exc.value) - assert f"Error while reading signatures from '{testfile}' - got sequences instead! Is this a FASTA/FASTQ file?" in str(exc.value) + assert ( + f"Error while reading signatures from '{testfile}' - got sequences instead! Is this a FASTA/FASTQ file?" 
+ in str(exc.value) + ) def test_load_and_search_sbt_api(): - treefile = utils.get_test_data('prot/protein.sbt.zip') - queryfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + treefile = utils.get_test_data("prot/protein.sbt.zip") + queryfile = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) tree = sourmash.load_sbt_index(treefile) query = sourmash.load_one_signature(queryfile) diff --git a/tests/test_bugs.py b/tests/test_bugs.py index e0f3c5daf0..2b8f677279 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -1,11 +1,12 @@ import sourmash_tst_utils as utils + @utils.in_tempdir def test_bug_803(c): # can we do a 'sourmash search' on an LCA database and a query with abundance? - query = utils.get_test_data('track_abund/47.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("track_abund/47.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") - c.run_sourmash('search', query, lca_db, '--ignore-abundance') + c.run_sourmash("search", query, lca_db, "--ignore-abundance") print(c) - assert 'NC_009665.1 Shewanella baltica OS185, complete genome' in str(c) + assert "NC_009665.1 Shewanella baltica OS185, complete genome" in str(c) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 232ba6a218..7f8365118f 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -26,31 +26,33 @@ def _write_file(runtmp, basename, lines, *, gz=False): else: xopen = open - with xopen(loc, 'wt') as fp: + with xopen(loc, "wt") as fp: fp.write("\n".join(lines)) return loc def test_run_sourmash_signature_cmd(): - status, out, err = utils.runscript('sourmash', ['signature'], fail_ok=True) - assert not 'sourmash: error: argument cmd: invalid choice:' in err - assert 'Manipulate signature files:' in out - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["signature"], fail_ok=True) + assert "sourmash: error: argument cmd: invalid choice:" not in err + assert "Manipulate signature files:" in out + assert status != 0 # no args provided, ok ;) def test_run_sourmash_sig_cmd(): - status, out, err = utils.runscript('sourmash', ['sig'], fail_ok=True) - assert not 'sourmash: error: argument cmd: invalid choice:' in err - assert 'Manipulate signature files:' in out - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["sig"], fail_ok=True) + assert "sourmash: error: argument cmd: invalid choice:" not in err + assert "Manipulate signature files:" in out + assert status != 0 # no args provided, ok ;) def test_run_cat_via_parse_args(): # run a command ('sourmash.sig.cat') with args constructed via parse_args - import sourmash.sig, sourmash.cli - sig47 = utils.get_test_data('47.fa.sig') + import sourmash.sig + import sourmash.cli - args = sourmash.cli.parse_args(['sig', 'cat', sig47]) + sig47 = utils.get_test_data("47.fa.sig") + + args = sourmash.cli.parse_args(["sig", "cat", sig47]) sourmash.sig.cat(args) @@ -58,10 +60,10 @@ def test_sig_merge_1_use_full_signature_in_cmd(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('signature', 'merge', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + 
c.run_sourmash("signature", "merge", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -80,16 +82,21 @@ def test_sig_merge_1_fromfile_picklist(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63']) - - c.run_sourmash('signature', 'merge', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"]) + + c.run_sourmash( + "signature", + "merge", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -109,17 +116,23 @@ def test_sig_merge_1_fromfile_picklist_gz(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63'], - gz=True) - - c.run_sourmash('signature', 'merge', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file( + runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"], gz=True + ) + + c.run_sourmash( + "signature", + "merge", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -137,10 +150,10 @@ def test_sig_merge_1_fromfile_picklist_gz(runtmp): @utils.in_tempdir def test_sig_merge_1(c): # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('sig', 'merge', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + c.run_sourmash("sig", "merge", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -158,9 +171,9 @@ def test_sig_merge_1(c): @utils.in_tempdir def test_sig_merge_1_multisig(c): # merge of 47 & 63 should be union of mins; here, sigs are in same file. 
- multisig = utils.get_test_data('47+63-multisig.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('sig', 'merge', multisig, '--flatten') + multisig = utils.get_test_data("47+63-multisig.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + c.run_sourmash("sig", "merge", multisig, "--flatten") # stdout should be new signature out = c.last_result.out @@ -178,13 +191,25 @@ def test_sig_merge_1_multisig(c): @utils.in_tempdir def test_sig_merge_1_name(c): # check name arg - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - assignedSigName = 'SIG_NAME' - outsig = c.output('merged2and63.sig') - - c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31', '-o', "merged2and63.sig", '--name', assignedSigName ) + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + assignedSigName = "SIG_NAME" + outsig = c.output("merged2and63.sig") + + c.run_sourmash( + "sig", + "merge", + sig2, + sig63, + "--dna", + "-k", + "31", + "-o", + "merged2and63.sig", + "--name", + assignedSigName, + ) test_merge_sig = sourmash.load_one_signature(outsig) @@ -197,10 +222,10 @@ def test_sig_merge_1_name(c): @utils.in_tempdir def test_sig_merge_1_ksize_moltype(c): # check ksize, moltype args - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig2and63 = utils.get_test_data('2+63.fa.sig') - c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31') + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig2and63 = utils.get_test_data("2+63.fa.sig") + c.run_sourmash("sig", "merge", sig2, sig63, "--dna", "-k", "31") # stdout should be new signature out = c.last_result.out @@ -218,12 +243,12 @@ def test_sig_merge_1_ksize_moltype(c): @utils.in_tempdir def test_sig_merge_1_ksize_moltype_fail(c): # check ksize, moltype args - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig2and63 = utils.get_test_data('2+63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + utils.get_test_data("2+63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sig', 'merge', sig2, sig63) + c.run_sourmash("sig", "merge", sig2, sig63) assert "ERROR when merging signature" in str(exc.value) @@ -231,8 +256,8 @@ def test_sig_merge_1_ksize_moltype_fail(c): @utils.in_tempdir def test_sig_merge_2(c): # merge of 47 with nothing should be 47 - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'merge', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "merge", sig47) # stdout should be new signature out = c.last_result.out @@ -248,46 +273,52 @@ def test_sig_merge_2(c): @utils.in_tempdir def test_sig_merge_3_abund_ab_ok(c): # merge of 47 and 63 with abund should work - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('sig', 'merge', sig47abund, sig63abund) - actual_merge_sig = sourmash.load_one_signature(c.last_result.out) + c.run_sourmash("sig", "merge", sig47abund, sig63abund) + sourmash.load_one_signature(c.last_result.out) # CTB: should check that this merge did what we think it should do! 
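The tests in this file all follow the same fixture pattern; a minimal sketch of a new test using the `runtmp` fixture from conftest.py (the test itself is hypothetical, but every call in it appears elsewhere in this file):

    def test_sig_merge_to_file_sketch(runtmp):
        # runtmp is a RunnerContext; run_sourmash() raises
        # SourmashCommandFailed on a nonzero exit status.
        sig47 = utils.get_test_data("47.fa.sig")
        sig63 = utils.get_test_data("63.fa.sig")

        runtmp.run_sourmash(
            "sig", "merge", sig47, sig63, "-o", runtmp.output("merged.sig")
        )

        merged = sourmash.load_one_signature(runtmp.output("merged.sig"))
        assert merged.minhash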
@utils.in_tempdir def test_sig_merge_3_abund_ab(c): # merge of 47 with abund, with 63 without, should fail; and vice versa - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'merge', sig47, sig63abund) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "merge", sig47, sig63abund) print(c.last_result) - assert 'incompatible signatures: track_abundance is False in first sig, True in second' in c.last_result.err + assert ( + "incompatible signatures: track_abundance is False in first sig, True in second" + in c.last_result.err + ) @utils.in_tempdir def test_sig_merge_3_abund_ba(c): # merge of 47 without abund, with 63 with, should fail - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'merge', sig63abund, sig47) + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "merge", sig63abund, sig47) print(c.last_result) - assert 'incompatible signatures: track_abundance is True in first sig, False in second' in c.last_result.err + assert ( + "incompatible signatures: track_abundance is True in first sig, False in second" + in c.last_result.err + ) @utils.in_tempdir def test_sig_filter_1(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'filter', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "filter", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -307,8 +338,8 @@ def test_sig_filter_1(c): @utils.in_tempdir def test_sig_filter_2(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'filter', '-m', '2', '-M', '5', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "filter", "-m", "2", "-M", "5", sig47) # stdout should be new signature out = c.last_result.out @@ -317,7 +348,7 @@ def test_sig_filter_2(c): test_sig = sourmash.load_one_signature(sig47) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 and v <= 5 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2 and v <= 5} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -326,8 +357,8 @@ def test_sig_filter_2(c): @utils.in_tempdir def test_sig_filter_3(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'filter', '-m', '2', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "filter", "-m", "2", sig47) # stdout should be new signature out = c.last_result.out @@ -336,7 +367,7 @@ def test_sig_filter_3(c): test_sig = sourmash.load_one_signature(sig47) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -345,8 +376,8 @@ def test_sig_filter_3(c): @utils.in_tempdir def test_sig_filter_3_ksize_select(c): # test filtering with ksize selection - psw_mag =
utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - c.run_sourmash('sig', 'filter', '-m', '2', psw_mag, '-k', '31') + psw_mag = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + c.run_sourmash("sig", "filter", "-m", "2", psw_mag, "-k", "31") # stdout should be new signature out = c.last_result.out @@ -355,7 +386,7 @@ def test_sig_filter_3_ksize_select(c): test_sig = sourmash.load_one_signature(psw_mag, ksize=31) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -364,11 +395,11 @@ def test_sig_filter_3_ksize_select(c): @utils.in_tempdir def test_sig_merge_flatten(c): # merge of 47 without abund, with 63 with, will succeed with --flatten - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('sig', 'merge', sig63abund, sig47, '--flatten') + c.run_sourmash("sig", "merge", sig63abund, sig47, "--flatten") print(c.last_result) out = c.last_result.out @@ -386,11 +417,11 @@ def test_sig_merge_flatten(c): @utils.in_tempdir def test_sig_merge_flatten_2(c): # merge of 47 with abund, with 63 without, will succeed with --flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('sig', 'merge', sig63, sig47abund, '--flatten') + c.run_sourmash("sig", "merge", sig63, sig47abund, "--flatten") print(c.last_result) out = c.last_result.out @@ -410,7 +441,7 @@ def test_sig_merge_flatten_2(c): @utils.in_tempdir def test_sig_intersect_0(runtmp): c = runtmp with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect') + c.run_sourmash("sig", "intersect") err = c.last_result.err assert "no signatures provided to intersect!?"
in err @@ -420,10 +451,10 @@ def test_sig_intersect_1(runtmp): c = runtmp # intersect of 47 and 63 should be intersection of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - c.run_sourmash('sig', 'intersect', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + c.run_sourmash("sig", "intersect", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -442,16 +473,21 @@ def test_sig_intersect_1_fromfile_picklist(runtmp): c = runtmp # intersect of 47 and 63 should be intersection of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63']) - - c.run_sourmash('signature', 'intersect', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"]) + + c.run_sourmash( + "signature", + "intersect", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -470,10 +506,10 @@ def test_sig_intersect_1_fromfile_picklist(runtmp): def test_sig_intersect_2(c): # intersect of 47 with abund and 63 with abund should be same # as without abund, i.e. 
intersect 'flattens' - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - c.run_sourmash('sig', 'intersect', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + c.run_sourmash("sig", "intersect", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -491,9 +527,9 @@ def test_sig_intersect_2(c): @utils.in_tempdir def test_sig_intersect_3(c): # use --abundances-from to preserve abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -510,7 +546,7 @@ def test_sig_intersect_3(c): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -523,9 +559,9 @@ def test_sig_intersect_3(c): @utils.in_tempdir def test_sig_intersect_4(c): # use --abundances-from to preserve abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -542,7 +578,7 @@ def test_sig_intersect_4(c): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -556,41 +592,41 @@ def test_sig_intersect_4(c): def test_sig_intersect_5(c): # use --abundances-from to preserve abundances from sig #47 # make sure that you can't specify a flat sig for --abundances-from - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) @utils.in_tempdir def test_sig_intersect_6_ksize_fail(c): # specify ksize to intersect 2.fa.sig with 47.fa.sig - 2.fa.sig contains # multiple ksizes. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect', sig2, sig47) + c.run_sourmash("sig", "intersect", sig2, sig47) @utils.in_tempdir def test_sig_intersect_6_ksize_succeed(c): # specify ksize to intersect 2.fa.sig with 47.fa.sig - 2.fa.sig contains # multiple ksizes. 
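# Because 2.fa.sig carries sketches at multiple k-mer sizes, `-k 31` selects
# one unambiguously. A rough sketch of the same selection via the Python API,
# assuming only the ksize=31 sketches should be compared:
#
#   ss2 = sourmash.load_one_signature(sig2, ksize=31)
#   ss47 = sourmash.load_one_signature(sig47, ksize=31)
#   common = set(ss2.minhash.hashes) & set(ss47.minhash.hashes)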
- sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") - c.run_sourmash('sig', 'intersect', '-k', '31', sig2, sig47) + c.run_sourmash("sig", "intersect", "-k", "31", sig2, sig47) - assert 'loaded and intersected 2 signatures' in c.last_result.err + assert "loaded and intersected 2 signatures" in c.last_result.err @utils.in_tempdir def test_sig_intersect_7(c): # intersect of 47 and nothing should be self - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'intersect', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "intersect", sig47) # stdout should be new signature out = c.last_result.out @@ -608,8 +644,8 @@ def test_sig_intersect_7(c): @utils.in_tempdir def test_sig_intersect_8_multisig(c): # intersect of all the multisig stuff should be nothing - sig47 = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'intersect', sig47) + sig47 = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "intersect", sig47) # stdout should be new signature out = c.last_result.out @@ -621,9 +657,9 @@ def test_sig_intersect_8_multisig(c): def test_sig_inflate_1(runtmp): # basic inflate test - inflate 47 flat with 47 abund - sig47_flat = utils.get_test_data('47.fa.sig') - sig47_abund = utils.get_test_data('track_abund/47.fa.sig') - runtmp.run_sourmash('sig', 'inflate', sig47_abund, sig47_flat) + sig47_flat = utils.get_test_data("47.fa.sig") + sig47_abund = utils.get_test_data("track_abund/47.fa.sig") + runtmp.run_sourmash("sig", "inflate", sig47_abund, sig47_flat) # stdout should be new signature out = runtmp.last_result.out @@ -641,9 +677,9 @@ def test_sig_inflate_1(runtmp): def test_sig_inflate_2(runtmp): # use abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - runtmp.run_sourmash('sig', 'inflate', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + runtmp.run_sourmash("sig", "inflate", sig47, sig63) # stdout should be new signature out = runtmp.last_result.out @@ -660,7 +696,7 @@ def test_sig_inflate_2(runtmp): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -672,34 +708,33 @@ def test_sig_inflate_2(runtmp): def test_sig_inflate_3(runtmp): # should fail on flat first sig - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'inflate', sig63, sig47) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "inflate", sig63, sig47) - assert 'has no abundances' in runtmp.last_result.err + assert "has no abundances" in runtmp.last_result.err def test_sig_inflate_4_picklist(runtmp): # try out picklists - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47_flat = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47_flat = utils.get_test_data("47.fa.sig") ss63 = sourmash.load_one_signature(sig63, ksize=31) - 
picklist = _write_file(runtmp, 'pl.csv', ['md5', ss63.md5sum()]) + _write_file(runtmp, "pl.csv", ["md5", ss63.md5sum()]) print(ss63.md5sum()) - - runtmp.run_sourmash('sig', 'inflate', sig47, sig63, sig47_flat, - '--picklist', f'pl.csv:md5:md5') + runtmp.run_sourmash( + "sig", "inflate", sig47, sig63, sig47_flat, "--picklist", "pl.csv:md5:md5" + ) # stdout should be new signature out = runtmp.last_result.out - err = runtmp.last_result.err actual_inflate_sig = sourmash.load_one_signature(out) @@ -713,7 +748,7 @@ def test_sig_inflate_4_picklist(runtmp): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -725,21 +760,21 @@ def test_sig_inflate_4_picklist(runtmp): def test_sig_inflate_5_bad_moltype(runtmp): # should fail when no signatures match moltype - sig47 = utils.get_test_data('track_abund/47.fa.sig') - prot = utils.get_test_data('prot/protein.zip') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + prot = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'inflate', sig47, prot) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "inflate", sig47, prot) - assert 'no signatures to inflate' in runtmp.last_result.err + assert "no signatures to inflate" in runtmp.last_result.err @utils.in_tempdir def test_sig_subtract_1(c): # subtract of 63 from 47 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -758,9 +793,9 @@ def test_sig_subtract_1(c): def test_sig_subtract_1_abund(runtmp): # subtract 63 from 47, with abundances borrowed from 47 c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63, '-A', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63, "-A", sig47) # stdout should be new signature out = c.last_result.out @@ -791,21 +826,21 @@ def test_sig_subtract_1_abund_is_flat(runtmp): # subtract 63 from 47; borrowing abundances from a flat sig should fail c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - sig47_flat = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + sig47_flat = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63, '-A', sig47_flat) + c.run_sourmash("sig", "subtract", sig47, sig63, "-A", sig47_flat) def test_sig_subtract_1_flatten(runtmp): # subtract 63 from 47, with abund signatures originally and --flatten c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63, '--flatten') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63, "--flatten") # stdout
should be new signature out = c.last_result.out @@ -824,9 +859,9 @@ def test_sig_subtract_1_flatten(runtmp): @utils.in_tempdir def test_sig_subtract_1_multisig(c): # subtract of everything from 47 - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'subtract', sig47, multisig, '--flatten') + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "subtract", sig47, multisig, "--flatten") # stdout should be new signature out = c.last_result.out @@ -839,60 +874,60 @@ def test_sig_subtract_1_multisig(c): @utils.in_tempdir def test_sig_subtract_2(c): # subtract of 63 from 47 should fail if 47 has abund - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63) + c.run_sourmash("sig", "subtract", sig47, sig63) @utils.in_tempdir def test_sig_subtract_3(c): # subtract of 63 from 47 should fail if 63 has abund - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63) + c.run_sourmash("sig", "subtract", sig47, sig63) @utils.in_tempdir def test_sig_subtract_4_ksize_fail(c): # subtract of 2 from 47 should fail without -k specified - sig47 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig2) + c.run_sourmash("sig", "subtract", sig47, sig2) @utils.in_tempdir def test_sig_subtract_4_ksize_succeed(c): # subtract of 2 from 47 should succeed with -k specified - sig47 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") - c.run_sourmash('sig', 'subtract', sig47, sig2, '-k', '31') - assert 'loaded and subtracted 1 signatures' in c.last_result.err + c.run_sourmash("sig", "subtract", sig47, sig2, "-k", "31") + assert "loaded and subtracted 1 signatures" in c.last_result.err def test_sig_subtract_5_bad_moltype(runtmp): # should fail when no matching sigs - sig47 = utils.get_test_data('47.fa.sig') - prot = utils.get_test_data('prot/protein.zip') + sig47 = utils.get_test_data("47.fa.sig") + prot = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'subtract', '-k', '31', sig47, prot) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "subtract", "-k", "31", sig47, prot) - assert 'no signatures to subtract' in runtmp.last_result.err + assert "no signatures to subtract" in runtmp.last_result.err def test_sig_rename_1(runtmp): c = runtmp # set new name for 47 - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'rename', sig47, 'fiz bar') + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "rename", sig47, "fiz bar") # stdout should be new signature out = c.last_result.out @@ -905,20 +940,27 @@ def test_sig_rename_1(runtmp): assert actual_rename_sig.minhash == test_rename_sig.minhash assert test_rename_sig.name
!= actual_rename_sig.name - assert actual_rename_sig.name == 'fiz bar' + assert actual_rename_sig.name == "fiz bar" def test_sig_rename_1_fromfile_picklist(runtmp): c = runtmp # set new name for 47 - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - from_file = _write_file(runtmp, 'list.txt', [sig47]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + from_file = _write_file(runtmp, "list.txt", [sig47]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'rename', '--from-file', from_file, 'fiz bar', - '--picklist', f'{picklist}:md5short:md5short') + c.run_sourmash( + "sig", + "rename", + "--from-file", + from_file, + "fiz bar", + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -931,22 +973,22 @@ def test_sig_rename_1_fromfile_picklist(runtmp): assert actual_rename_sig.minhash == test_rename_sig.minhash assert test_rename_sig.name != actual_rename_sig.name - assert actual_rename_sig.name == 'fiz bar' + assert actual_rename_sig.name == "fiz bar" @utils.in_tempdir def test_sig_rename_1_multisig(c): # set new name for multiple signatures/files - multisig = utils.get_test_data('47+63-multisig.sig') - other_sig = utils.get_test_data('2.fa.sig') - c.run_sourmash('sig', 'rename', multisig, other_sig, 'fiz bar') + multisig = utils.get_test_data("47+63-multisig.sig") + other_sig = utils.get_test_data("2.fa.sig") + c.run_sourmash("sig", "rename", multisig, other_sig, "fiz bar") # stdout should be new signature out = c.last_result.out n = 0 for sig in load_signatures(out): - assert sig.name == 'fiz bar' + assert sig.name == "fiz bar" n += 1 assert n == 9, n @@ -955,16 +997,16 @@ def test_sig_rename_1_multisig(c): @utils.in_tempdir def test_sig_rename_1_multisig_ksize(c): # set new name for multiple signatures/files; select k=31 - multisig = utils.get_test_data('47+63-multisig.sig') - other_sig = utils.get_test_data('2.fa.sig') - c.run_sourmash('sig', 'rename', multisig, other_sig, 'fiz bar', '-k', '31') + multisig = utils.get_test_data("47+63-multisig.sig") + other_sig = utils.get_test_data("2.fa.sig") + c.run_sourmash("sig", "rename", multisig, other_sig, "fiz bar", "-k", "31") # stdout should be new signature out = c.last_result.out n = 0 for sig in load_signatures(out): - assert sig.name == 'fiz bar' + assert sig.name == "fiz bar" n += 1 assert n == 7, n @@ -973,23 +1015,23 @@ def test_sig_rename_1_multisig_ksize(c): @utils.in_tempdir def test_sig_rename_2_output_to_same(c): # change name of signature "in place", same output file - sig47 = utils.get_test_data('47.fa.sig') - inplace = c.output('inplace.sig') + sig47 = utils.get_test_data("47.fa.sig") + inplace = c.output("inplace.sig") shutil.copyfile(sig47, inplace) print(inplace) - c.run_sourmash('sig', 'rename', '-d', inplace, 'fiz bar', '-o', inplace) + c.run_sourmash("sig", "rename", "-d", inplace, "fiz bar", "-o", inplace) actual_rename_sig = sourmash.load_one_signature(inplace) - assert actual_rename_sig.name == 'fiz bar' + assert actual_rename_sig.name == "fiz bar" @utils.in_tempdir def test_sig_rename_3_file_dne(c): # rename on a file that does not exist should fail! 
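# Without -f, a missing input file is a hard error (SourmashCommandFailed);
# the _force variant below shows that -f lets the command finish while still
# printing the same "Error while reading signatures" message.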
- with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "rename", "no-such-sig", "fiz bar") assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err @@ -997,7 +1039,7 @@ def test_sig_rename_3_file_dne(c): @utils.in_tempdir def test_sig_rename_3_file_dne_force(c): # rename on a file that does not exist should succeed with -f, but still complain - c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar', '-f') + c.run_sourmash("sig", "rename", "no-such-sig", "fiz bar", "-f") print(c.last_result.err) assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err @@ -1005,35 +1047,37 @@ def test_sig_rename_3_file_dne_force(c): def test_sig_rename_4_pattern_include(runtmp): # test sig rename --include-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) - runtmp.sourmash('sig', 'rename', '--include', 'shewanella', - *sigfiles, 'SHEWME', '-o', 'out.zip') + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) + runtmp.sourmash( + "sig", "rename", "--include", "shewanella", *sigfiles, "SHEWME", "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) - names = [ ss.name for ss in idx.signatures() ] + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) + names = [ss.name for ss in idx.signatures()] for n in names: - assert n == 'SHEWME' + assert n == "SHEWME" assert len(names) == 2 def test_sig_rename_4_pattern_exclude(runtmp): # test sig rename --exclude-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) - runtmp.sourmash('sig', 'rename', '--exclude', 'shewanella', - *sigfiles, 'NOSHEW', '-o', 'out.zip') + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) + runtmp.sourmash( + "sig", "rename", "--exclude", "shewanella", *sigfiles, "NOSHEW", "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) - names = [ ss.name for ss in idx.signatures() ] + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) + names = [ss.name for ss in idx.signatures()] for n in names: - assert n == 'NOSHEW' + assert n == "NOSHEW" assert len(names) == 6 @utils.in_thisdir def test_sig_cat_1(c): # cat 47 to 47... - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'cat', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "cat", sig47) # stdout should be same signature out = c.last_result.out @@ -1047,8 +1091,8 @@ def test_sig_cat_1(c): @utils.in_thisdir def test_sig_cat_1_no_unique(c): # cat 47 to 47... twice - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'cat', sig47, sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "cat", sig47, sig47) # stdout should be same signature out = c.last_result.out @@ -1059,15 +1103,15 @@ def test_sig_cat_1_no_unique(c): for n, sig in enumerate(actual_cat_sigs): assert sig == test_cat_sig - assert n == 1 # two signatures, but enumerate stops at 1. - assert 'encountered 1 MinHashes multiple times' in c.last_result.err + assert n == 1 # two signatures, but enumerate stops at 1. + assert "encountered 1 MinHashes multiple times" in c.last_result.err @utils.in_thisdir def test_sig_cat_1_unique(c): # cat 47 to 47... twice...
and get unique - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'cat', sig47, sig47, '--unique') + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "cat", sig47, sig47, "--unique") # stdout should be same signature out = c.last_result.out @@ -1079,18 +1123,18 @@ def test_sig_cat_1_unique(c): for n, sig in enumerate(actual_cat_sigs): assert sig == test_cat_sig - assert n == 0 # enumerate stops at 0, first sig. - assert 'encountered 1 MinHashes multiple times' in err - assert '...and removed the duplicates, because --unique was specified.' in err + assert n == 0 # enumerate stops at 0, first sig. + assert "encountered 1 MinHashes multiple times" in err + assert "...and removed the duplicates, because --unique was specified." in err @utils.in_thisdir def test_sig_cat_2(c): # cat several - sig47 = utils.get_test_data('47.fa.sig') - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'cat', sig47, sig47abund, multisig) + sig47 = utils.get_test_data("47.fa.sig") + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "cat", sig47, sig47abund, multisig) # stdout should be same signatures out = c.last_result.out @@ -1098,40 +1142,44 @@ def test_sig_cat_2(c): siglist = list(load_signatures(out)) print(len(siglist)) - assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + assert ( + repr(siglist) + == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + ) @utils.in_tempdir def test_sig_cat_2_out(c): # cat several - sig47 = utils.get_test_data('47.fa.sig') - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'cat', sig47, sig47abund, multisig, - '-o', 'out.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "cat", sig47, sig47abund, multisig, "-o", "out.sig") 
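# With -o, the concatenated signatures go to the named file instead of
# stdout, so the checks below read c.output("out.sig") rather than
# c.last_result.out.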
# stdout should be same signatures - out = c.output('out.sig') + out = c.output("out.sig") siglist = list(load_signatures(out)) print(len(siglist)) - assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + assert ( + repr(siglist) + == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + ) @utils.in_tempdir def test_sig_cat_2_out_inplace(c): # cat several; check that we can overwrite one of the input files. - sig47 = utils.get_test_data('47.fa.sig') - input_sig = c.output('inp.sig') + sig47 = utils.get_test_data("47.fa.sig") + input_sig = c.output("inp.sig") shutil.copyfile(sig47, input_sig) - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") # write out to input. 
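# If `sig cat` streamed output before reading all inputs, overwriting
# inp.sig here could clobber data mid-run; this test passing suggests the
# signatures are loaded before the output file is rewritten (an inference
# from the test, not a documented guarantee).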
- c.run_sourmash('sig', 'cat', input_sig, sig47abund, multisig, - '-o', input_sig) + c.run_sourmash("sig", "cat", input_sig, sig47abund, multisig, "-o", input_sig) # stdout should be same signatures out = input_sig @@ -1139,25 +1187,27 @@ def test_sig_cat_2_out_inplace(c): siglist = list(load_signatures(out)) print(len(siglist)) - assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + assert ( + repr(siglist) + == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]""" + ) @utils.in_tempdir def test_sig_cat_3_filelist(c): # cat using a file list as input - sig47 = utils.get_test_data('47.fa.sig') - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") filelist = c.output("filelist") - with open(filelist, 'w') as f: + with open(filelist, "w") as f: f.write("\n".join((sig47, sig47abund, multisig))) - c.run_sourmash('sig', 'cat', filelist, - '-o', 'out.sig') + c.run_sourmash("sig", "cat", filelist, "-o", "out.sig") # stdout should be same signatures - out = c.output('out.sig') + out = c.output("out.sig") # make this a list, not a set, because a set will collapse identical # signatures. 
`sig cat` does not collapse identical signatures, although @@ -1174,27 +1224,29 @@ def test_sig_cat_3_filelist(c): assert len(all_sigs) == len(siglist) # sort the signatures by something deterministic and unique - siglist.sort(key = lambda x: x.md5sum()) + siglist.sort(key=lambda x: x.md5sum()) - assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]""" + assert ( + repr(siglist) + == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]""" + ) @utils.in_tempdir def test_sig_cat_4_filelist_with_dbs(c): # cat using a file list as input - sig47 = utils.get_test_data('47.fa.sig') - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sbt = utils.get_test_data('v6.sbt.zip') + sig47 = utils.get_test_data("47.fa.sig") + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sbt = utils.get_test_data("v6.sbt.zip") filelist = c.output("filelist") - with open(filelist, 'w') as f: + with open(filelist, "w") as f: f.write("\n".join((sig47, sig47abund, sbt))) - c.run_sourmash('sig', 'cat', filelist, - '-o', 'out.sig') + c.run_sourmash("sig", "cat", filelist, "-o", "out.sig") # stdout should be same signatures - out = c.output('out.sig') + out = c.output("out.sig") siglist = list(load_signatures(out)) print(len(siglist)) @@ -1211,27 +1263,29 @@ def test_sig_cat_4_filelist_with_dbs(c): assert len(all_sigs) == len(siglist) # sort the signatures by something deterministic and unique - siglist.sort(key = lambda x: x.md5sum()) + siglist.sort(key=lambda x: x.md5sum()) - assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]""" + assert ( + repr(siglist) + == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 
Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]""" + ) @utils.in_tempdir def test_sig_cat_5_from_file(c): # cat using a file list as input - sig47 = utils.get_test_data('47.fa.sig') - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sbt = utils.get_test_data('v6.sbt.zip') + sig47 = utils.get_test_data("47.fa.sig") + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sbt = utils.get_test_data("v6.sbt.zip") filelist = c.output("filelist") - with open(filelist, 'w') as f: + with open(filelist, "w") as f: f.write("\n".join((sig47, sig47abund, sbt))) - c.run_sourmash('sig', 'cat', '--from-file', filelist, - '-o', 'out.sig') + c.run_sourmash("sig", "cat", "--from-file", filelist, "-o", "out.sig") # stdout should be same signatures - out = c.output('out.sig') + out = c.output("out.sig") siglist = list(load_signatures(out)) print(len(siglist)) @@ -1248,30 +1302,40 @@ def test_sig_cat_5_from_file(c): assert len(all_sigs) == len(siglist) # sort the signatures by something deterministic and unique - siglist.sort(key = lambda x: x.md5sum()) + siglist.sort(key=lambda x: x.md5sum()) - assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]""" + assert ( + repr(siglist) + == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]""" + ) def test_sig_cat_5_from_file_picklist(runtmp): c = runtmp # cat using a file list as input - sig47 = utils.get_test_data('47.fa.sig') - sbt = utils.get_test_data('v6.sbt.zip') + sig47 = utils.get_test_data("47.fa.sig") + sbt = utils.get_test_data("v6.sbt.zip") filelist = c.output("filelist") - with open(filelist, 'w') as f: + with open(filelist, "w") as f: f.write("\n".join((sig47, sbt))) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'cat', '--from-file', filelist, - '--picklist', f'{picklist}:md5short:md5short', - '-o', 'out.sig') + c.run_sourmash( + "sig", + "cat", + "--from-file", + filelist, + "--picklist", + f"{picklist}:md5short:md5short", + "-o", + "out.sig", + ) # stdout should be same signatures - out = c.output('out.sig') + out = c.output("out.sig") siglist = list(load_signatures(out)) print(len(siglist)) @@ -1286,46 +1350,46 @@ def test_sig_cat_5_from_file_picklist(runtmp): assert len(all_sigs) == len(siglist) # sort the signatures by something deterministic and unique - siglist.sort(key = lambda x: x.md5sum()) + siglist.sort(key=lambda x: x.md5sum()) - assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]""" + assert ( + repr(siglist) + == 
"""[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]""" + ) def test_sig_cat_6_pattern_include(runtmp): # test --include-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'cat', '--include', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--include", "shewanella", *sigfiles, "-o", "out.zip") - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 2 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' in n.lower(), n + assert "shewanella" in n.lower(), n def test_sig_cat_6_pattern_exclude(runtmp): # test --exclude-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'cat', '--exclude', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--exclude", "shewanella", *sigfiles, "-o", "out.zip") - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 18 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' not in n.lower(), n + assert "shewanella" not in n.lower(), n def test_sig_cat_6_pattern_exclude_no_manifest(runtmp): # test --exclude-db-pattern - db = utils.get_test_data('v6.sbt.zip') + db = utils.get_test_data("v6.sbt.zip") with pytest.raises(SourmashCommandFailed) as e: - runtmp.sourmash('sig', 'cat', '--exclude', 'shewanella', db, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--exclude", "shewanella", db, "-o", "out.zip") assert "require a manifest" in str(e) @@ -1333,10 +1397,10 @@ def test_sig_cat_6_pattern_exclude_no_manifest(runtmp): def test_sig_split_1(runtmp): c = runtmp # split 47 into 1 sig :) - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "split", sig47) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) @@ -1349,15 +1413,21 @@ def test_sig_split_1(runtmp): def test_sig_split_1_fromfile_picklist(runtmp): c = runtmp # split 47 into 1 sig :) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - from_file = _write_file(runtmp, 'list.txt', [sig47]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + from_file = _write_file(runtmp, "list.txt", [sig47]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'split', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + c.run_sourmash( + "sig", + "split", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) @@ -1370,27 +1440,27 @@ def test_sig_split_1_fromfile_picklist(runtmp): @utils.in_tempdir def test_sig_split_1_overwrite(c): # check message about overwriting - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47) + sig47 = utils.get_test_data("47.fa.sig") + 
c.run_sourmash("sig", "split", sig47) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) - c.run_sourmash('sig', 'split', sig47) + c.run_sourmash("sig", "split", sig47) err = c.last_result.err print(err) - assert '** overwriting existing file ' + outname in err + assert "** overwriting existing file " + outname in err @utils.in_tempdir def test_sig_split_2(c): # split 47 twice - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47, sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "split", sig47, sig47) - outname1 = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = '09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1407,12 +1477,12 @@ def test_sig_split_2(c): @utils.in_tempdir def test_sig_split_2_outdir(c): # split 47 twice, put in outdir - sig47 = utils.get_test_data('47.fa.sig') - outdir = c.output('sigout/') - c.run_sourmash('sig', 'split', sig47, sig47, '--outdir', outdir) + sig47 = utils.get_test_data("47.fa.sig") + outdir = c.output("sigout/") + c.run_sourmash("sig", "split", sig47, sig47, "--outdir", outdir) - outname1 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1429,12 +1499,12 @@ def test_sig_split_2_outdir(c): @utils.in_tempdir def test_sig_split_2_output_dir(c): # split 47 twice, put in outdir via --output-dir instead of --outdir - sig47 = utils.get_test_data('47.fa.sig') - outdir = c.output('sigout/') - c.run_sourmash('sig', 'split', sig47, sig47, '--output-dir', outdir) + sig47 = utils.get_test_data("47.fa.sig") + outdir = c.output("sigout/") + c.run_sourmash("sig", "split", sig47, sig47, "--output-dir", outdir) - outname1 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1451,16 +1521,18 @@ def test_sig_split_2_output_dir(c): @utils.in_tempdir def test_sig_split_3_multisig(c): # split 47 and 47+63-multisig.sig - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig) - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.sig'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig) + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig", + 
"87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1469,16 +1541,18 @@ def test_sig_split_3_multisig_sig_gz(runtmp): # split 47 and 47+63-multisig.sig with a .sig.gz extension c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig, '-E', '.sig.gz') - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.sig.gz'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig, "-E", ".sig.gz") + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1487,16 +1561,18 @@ def test_sig_split_3_multisig_zip(runtmp): # split 47 and 47+63-multisig.sig with a .zip extension c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig, '-E', '.zip') - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.zip', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.zip', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.zip', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.zip', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.zip', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.zip'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig, "-E", ".zip") + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.zip", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.zip", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.zip", + "87a9aec4.k=31.scaled=1000.DNA.dup=0.none.zip", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.zip", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.zip", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1504,17 +1580,19 @@ def test_sig_split_3_multisig_zip(runtmp): @utils.in_tempdir def test_sig_split_4_sbt_prot(c): # split sbt - sbt1 = utils.get_test_data('prot/protein.sbt.zip') - sbt2 = utils.get_test_data('prot/dayhoff.sbt.zip') - sbt3 = utils.get_test_data('prot/hp.sbt.zip') - c.run_sourmash('sig', 'split', sbt1, sbt2, sbt3) - - outlist = ['16869d2c.k=19.scaled=100.protein.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - '120d311c.k=19.scaled=100.protein.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'fbca5e52.k=19.scaled=100.dayhoff.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - '1cbd888b.k=19.scaled=100.dayhoff.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'ea2a1ad2.k=19.scaled=100.hp.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'bb0e6d90.k=19.scaled=100.hp.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig'] + sbt1 = utils.get_test_data("prot/protein.sbt.zip") + sbt2 = 
utils.get_test_data("prot/dayhoff.sbt.zip") + sbt3 = utils.get_test_data("prot/hp.sbt.zip") + c.run_sourmash("sig", "split", sbt1, sbt2, sbt3) + + outlist = [ + "16869d2c.k=19.scaled=100.protein.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "120d311c.k=19.scaled=100.protein.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "fbca5e52.k=19.scaled=100.dayhoff.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "1cbd888b.k=19.scaled=100.dayhoff.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "ea2a1ad2.k=19.scaled=100.hp.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "bb0e6d90.k=19.scaled=100.hp.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1522,20 +1600,22 @@ def test_sig_split_4_sbt_prot(c): @utils.in_tempdir def test_sig_split_4_lca_prot(c): # split lca - lca1 = utils.get_test_data('prot/protein.lca.json.gz') - lca2 = utils.get_test_data('prot/dayhoff.lca.json.gz') - lca3 = utils.get_test_data('prot/hp.lca.json.gz') - c.run_sourmash('sig', 'split', lca1, lca2, lca3) + lca1 = utils.get_test_data("prot/protein.lca.json.gz") + lca2 = utils.get_test_data("prot/dayhoff.lca.json.gz") + lca3 = utils.get_test_data("prot/hp.lca.json.gz") + c.run_sourmash("sig", "split", lca1, lca2, lca3) print(c.last_result.out) print(c.last_result.err) - outlist = ['16869d2c.k=19.scaled=100.protein.dup=0.none.sig', - '120d311c.k=19.scaled=100.protein.dup=0.none.sig', - 'fbca5e52.k=19.scaled=100.dayhoff.dup=0.none.sig', - '1cbd888b.k=19.scaled=100.dayhoff.dup=0.none.sig', - 'ea2a1ad2.k=19.scaled=100.hp.dup=0.none.sig', - 'bb0e6d90.k=19.scaled=100.hp.dup=0.none.sig'] + outlist = [ + "16869d2c.k=19.scaled=100.protein.dup=0.none.sig", + "120d311c.k=19.scaled=100.protein.dup=0.none.sig", + "fbca5e52.k=19.scaled=100.dayhoff.dup=0.none.sig", + "1cbd888b.k=19.scaled=100.dayhoff.dup=0.none.sig", + "ea2a1ad2.k=19.scaled=100.hp.dup=0.none.sig", + "bb0e6d90.k=19.scaled=100.hp.dup=0.none.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1543,23 +1623,25 @@ def test_sig_split_4_lca_prot(c): @utils.in_tempdir def test_sig_split_5_no_exist(c): # no such file - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'split', 'foo') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "split", "foo") def test_sig_split_6_numsigs(runtmp): c = runtmp - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'split', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash("sig", "split", sigs11) print(c.last_result.out) print(c.last_result.err) - outlist = ['1437d8ea.k=21.num=500.DNA.dup=0.genome-s11.fa.gz.sig', - '37aea787.k=7.num=500.protein.dup=0.genome-s11.fa.gz.sig', - '68c565be.k=30.num=500.DNA.dup=0.genome-s11.fa.gz.sig', - '73b6df1c.k=10.num=500.protein.dup=0.genome-s11.fa.gz.sig'] + outlist = [ + "1437d8ea.k=21.num=500.DNA.dup=0.genome-s11.fa.gz.sig", + "37aea787.k=7.num=500.protein.dup=0.genome-s11.fa.gz.sig", + "68c565be.k=30.num=500.DNA.dup=0.genome-s11.fa.gz.sig", + "73b6df1c.k=10.num=500.protein.dup=0.genome-s11.fa.gz.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1569,8 +1651,8 @@ def test_sig_extract_1(runtmp): c = runtmp # extract 47 from 47... 
:) - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'extract', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "extract", sig47) # stdout should be new signature out = c.last_result.out @@ -1586,9 +1668,9 @@ def test_sig_extract_1_from_file(runtmp): c = runtmp # extract 47 from 47... :) - sig47 = utils.get_test_data('47.fa.sig') - from_file = _write_file(runtmp, 'list.txt', [sig47]) - c.run_sourmash('sig', 'extract', '--from-file', from_file) + sig47 = utils.get_test_data("47.fa.sig") + from_file = _write_file(runtmp, "list.txt", [sig47]) + c.run_sourmash("sig", "extract", "--from-file", from_file) # stdout should be new signature out = c.last_result.out @@ -1602,9 +1684,9 @@ def test_sig_extract_1_from_file(runtmp): @utils.in_tempdir def test_sig_extract_2(c): # extract matches to 47's md5sum from among several - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'extract', sig47, sig63, '--md5', '09a0869') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "extract", sig47, sig63, "--md5", "09a0869") # stdout should be new signature out = c.last_result.out @@ -1621,10 +1703,10 @@ def test_sig_extract_2(c): @utils.in_tempdir def test_sig_extract_2_zipfile(c): # extract matches to 47's md5sum from among several in a zipfile - all_zip = utils.get_test_data('prot/all.zip') - sig47 = utils.get_test_data('47.fa.sig') + all_zip = utils.get_test_data("prot/all.zip") + sig47 = utils.get_test_data("47.fa.sig") - c.run_sourmash('sig', 'extract', all_zip, '--md5', '09a0869') + c.run_sourmash("sig", "extract", all_zip, "--md5", "09a0869") # stdout should be new signature out = c.last_result.out @@ -1641,17 +1723,17 @@ def test_sig_extract_2_zipfile(c): @utils.in_tempdir def test_sig_extract_3(c): # extract nothing (no md5 match) - sig47 = utils.get_test_data('47.fa.sig') - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sig', 'extract', sig47, '--md5', 'FOO') + sig47 = utils.get_test_data("47.fa.sig") + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "extract", sig47, "--md5", "FOO") @utils.in_tempdir def test_sig_extract_4(c): # extract matches to 47's name from among several signatures - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'NC_009665.1') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "extract", sig47, sig63, "--name", "NC_009665.1") # stdout should be new signature out = c.last_result.out @@ -1668,17 +1750,17 @@ def test_sig_extract_4(c): @utils.in_tempdir def test_sig_extract_5(c): # extract nothing (no name match) - sig47 = utils.get_test_data('47.fa.sig') - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sig', 'extract', sig47, '--name', 'FOO') + sig47 = utils.get_test_data("47.fa.sig") + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "extract", sig47, "--name", "FOO") @utils.in_tempdir def test_sig_extract_6(c): # extract matches to several names from among several signatures - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'Shewanella') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "extract", sig47, sig63, "--name", "Shewanella") # 
stdout should be new signature out = c.last_result.out @@ -1692,8 +1774,8 @@ def test_sig_extract_6(c): @utils.in_tempdir def test_sig_extract_7(c): # extract matches based on ksize - sig2 = utils.get_test_data('2.fa.sig') - c.run_sourmash('sig', 'extract', sig2, '-k', '31') + sig2 = utils.get_test_data("2.fa.sig") + c.run_sourmash("sig", "extract", sig2, "-k", "31") # stdout should be new signature out = c.last_result.out @@ -1707,8 +1789,8 @@ def test_sig_extract_7(c): @utils.in_tempdir def test_sig_extract_7_no_ksize(c): # extract all three matches when -k not specified - sig2 = utils.get_test_data('2.fa.sig') - c.run_sourmash('sig', 'extract', sig2) + sig2 = utils.get_test_data("2.fa.sig") + c.run_sourmash("sig", "extract", sig2) # stdout should be new signature out = c.last_result.out @@ -1721,18 +1803,18 @@ def test_sig_extract_7_no_ksize(c): def test_sig_extract_8_empty_picklist_fail(runtmp): # what happens with an empty picklist? - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # make empty picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline=""): pass picklist_arg = f"{picklist_csv}:md5full:md5" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) err = runtmp.last_result.err print(err) @@ -1742,15 +1824,15 @@ def test_sig_extract_8_empty_picklist_fail(runtmp): def test_sig_extract_8_nofile_picklist_fail(runtmp): # what happens when picklist file does not exist? 
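# A minimal sketch (with an illustrative CSV file name; the md5 prefix value
# is the one used by these tests) of the "--picklist" convention that all of
# the picklist tests in this patch construct: the argument format is
# "<csv_file>:<column_name>:<coltype>[:<pickstyle>]", where coltype is one of
# md5, md5prefix8 (alias md5short), name, ident, or identprefix, and the
# optional 4th field pickstyle is "include" (the default) or "exclude".
import csv

with open("pick.csv", "w", newline="") as fp:
    w = csv.DictWriter(fp, fieldnames=["md5short"])
    w.writeheader()
    w.writerow({"md5short": "09a08691"})  # 8-character md5 prefix to match

picklist_arg = "pick.csv:md5short:md5prefix8"          # keep only matches
exclude_arg = "pick.csv:md5short:md5prefix8:exclude"   # drop matches instead
# e.g.: sourmash sig extract 47.fa.sig 63.fa.sig --picklist pick.csv:md5short:md5prefix8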
- sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # picklist file does not exist - picklist_csv = runtmp.output('pick.csv') + picklist_csv = runtmp.output("pick.csv") picklist_arg = f"{picklist_csv}:md5full:md5" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) err = runtmp.last_result.err print(err) @@ -1760,25 +1842,27 @@ def test_sig_extract_8_nofile_picklist_fail(runtmp): def test_sig_extract_8_picklist_md5(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -1799,26 +1883,28 @@ def test_sig_extract_8_picklist_md5(runtmp): def test_sig_extract_8_picklist_md5_zipfile(runtmp): # extract 47 from a zipfile, using a picklist w/full md5 - allzip = utils.get_test_data('prot/all.zip') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + allzip = utils.get_test_data("prot/all.zip") + sig47 = utils.get_test_data("47.fa.sig") + utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5" - runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", allzip, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -1840,54 +1926,68 @@ def test_sig_extract_8_picklist_md5_zipfile(runtmp): def test_sig_extract_8_picklist_md5_lca_fail(runtmp): # try to extract 47 from an LCA database, using 
a picklist w/full md5; will # fail. - allzip = utils.get_test_data('lca/47+63.lca.json') + allzip = utils.get_test_data("lca/47+63.lca.json") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='50a9274021e43eda8b2e77f8fa60ae8e', - md5short='50a9274021e43eda8b2e77f8fa60ae8e'[:8], - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="50a9274021e43eda8b2e77f8fa60ae8e", + md5short="50a9274021e43eda8b2e77f8fa60ae8e"[:8], + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5" - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg, - '--md5', '50a9274021e4') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "sig", + "extract", + allzip, + "--picklist", + picklist_arg, + "--md5", + "50a9274021e4", + ) # this happens b/c the implementation of 'extract' uses picklists, and # LCA databases don't support multiple picklists. print(runtmp.last_result.err) - assert "This input collection doesn't support 'extract' with picklists or patterns." in runtmp.last_result.err + assert ( + "This input collection doesn't support 'extract' with picklists or patterns." + in runtmp.last_result.err + ) def test_sig_extract_8_picklist_md5_include(runtmp): # extract 47 from 47, using a picklist w/full md5:: explicit include - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5:include" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -1908,25 +2008,27 @@ def test_sig_extract_8_picklist_md5_include(runtmp): def test_sig_extract_8_picklist_md5_exclude(runtmp): # extract 63 from 47,63 by excluding 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - 
md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -1948,30 +2050,45 @@ def test_sig_extract_8_picklist_md5_exclude(runtmp): def test_sig_extract_8_picklist_md5_require_all(runtmp): # extract 47 from 47, using a picklist w/full md5; # confirm that check missing picklist val errors out on --picklist-require - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) - w.writerow(dict(exactName='', md5full='BAD MD5', - md5short='', fullIdent='', nodotIdent='')) + w.writerow( + dict( + exactName="", + md5full="BAD MD5", + md5short="", + fullIdent="", + nodotIdent="", + ) + ) picklist_arg = f"{picklist_csv}:md5full:md5" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, - '--picklist', picklist_arg, - '--picklist-require-all') + runtmp.sourmash( + "sig", + "extract", + sig47, + sig63, + "--picklist", + picklist_arg, + "--picklist-require-all", + ) # stdout should be new signature out = runtmp.last_result.out @@ -1988,31 +2105,33 @@ def test_sig_extract_8_picklist_md5_require_all(runtmp): assert "loaded 1 total that matched ksize & molecule type" in err assert "extracted 1 signatures from 2 file(s)" in err assert "for given picklist, found 1 matches to 2 distinct values" in err - assert 'WARNING: 1 missing picklist values.' in err - assert 'ERROR: failing because --picklist-require-all was set' in err + assert "WARNING: 1 missing picklist values." 
in err + assert "ERROR: failing because --picklist-require-all was set" in err def test_sig_extract_8_picklist_name(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:exactName:name" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2025,25 +2144,27 @@ def test_sig_extract_8_picklist_name(runtmp): def test_sig_extract_8_picklist_name_exclude(runtmp): # exclude 47 based on name - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:exactName:name:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2056,25 +2177,27 @@ def test_sig_extract_8_picklist_name_exclude(runtmp): def test_sig_extract_8_picklist_ident(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - 
picklist_csv = runtmp.output('pick.csv')
- with open(picklist_csv, 'w', newline='') as csvfp:
+ picklist_csv = runtmp.output("pick.csv")
+ with open(picklist_csv, "w", newline="") as csvfp:
w = csv.DictWriter(csvfp, fieldnames=row.keys())
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:fullIdent:ident"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2087,25 +2210,27 @@ def test_sig_extract_8_picklist_ident(runtmp):
def test_sig_extract_8_picklist_ident_exclude(runtmp):
# exclude 47 based on ident
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = utils.get_test_data("47.fa.sig")
+ sig63 = utils.get_test_data("63.fa.sig")
# select on any of these attributes
- row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
- md5full='09a08691ce52952152f0e866a59f6261',
- md5short='09a08691ce5295215',
- fullIdent='NC_009665.1',
- nodotIdent='NC_009665')
+ row = dict(
+ exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+ md5full="09a08691ce52952152f0e866a59f6261",
+ md5short="09a08691ce5295215",
+ fullIdent="NC_009665.1",
+ nodotIdent="NC_009665",
+ )
# make picklist
- picklist_csv = runtmp.output('pick.csv')
- with open(picklist_csv, 'w', newline='') as csvfp:
+ picklist_csv = runtmp.output("pick.csv")
+ with open(picklist_csv, "w", newline="") as csvfp:
w = csv.DictWriter(csvfp, fieldnames=row.keys())
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:fullIdent:ident:exclude"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2118,25 +2243,27 @@ def test_sig_extract_8_picklist_ident_exclude(runtmp):
def test_sig_extract_8_picklist_ident_dot(runtmp):
# extract 47 from 47, using a picklist w/full md5
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = utils.get_test_data("47.fa.sig")
+ sig63 = utils.get_test_data("63.fa.sig")
# select on any of these attributes
- row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
- md5full='09a08691ce52952152f0e866a59f6261',
- md5short='09a08691ce5295215',
- fullIdent='NC_009665.1',
- nodotIdent='NC_009665')
+ row = dict(
+ exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+ md5full="09a08691ce52952152f0e866a59f6261",
+ md5short="09a08691ce5295215",
+ fullIdent="NC_009665.1",
+ nodotIdent="NC_009665",
+ )
# make picklist
- picklist_csv = runtmp.output('pick.csv')
- with open(picklist_csv, 'w', newline='') as csvfp:
+ picklist_csv = runtmp.output("pick.csv")
+ with open(picklist_csv, "w", newline="") as csvfp:
w = csv.DictWriter(csvfp, fieldnames=row.keys())
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:nodotIdent:identprefix"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2149,25 +2276,27 @@ def test_sig_extract_8_picklist_ident_dot(runtmp):
def test_sig_extract_8_picklist_ident_dot_exclude(runtmp):
# exclude 47 based on identprefix
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = 
utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:nodotIdent:identprefix:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2180,25 +2309,27 @@ def test_sig_extract_8_picklist_ident_dot_exclude(runtmp): def test_sig_extract_8_picklist_md5_short(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5prefix8" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2211,25 +2342,27 @@ def test_sig_extract_8_picklist_md5_short(runtmp): def test_sig_extract_8_picklist_md5_short_exclude(runtmp): # exclude 47 based on md5prefix8 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) 
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2242,25 +2375,27 @@ def test_sig_extract_8_picklist_md5_short_exclude(runtmp):
def test_sig_extract_8_picklist_md5_short_alias(runtmp):
# extract 47 from 47, using a picklist w/full md5
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = utils.get_test_data("47.fa.sig")
+ sig63 = utils.get_test_data("63.fa.sig")
# select on any of these attributes
- row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
- md5full='09a08691ce52952152f0e866a59f6261',
- md5short='09a08691ce5295215',
- fullIdent='NC_009665.1',
- nodotIdent='NC_009665')
+ row = dict(
+ exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+ md5full="09a08691ce52952152f0e866a59f6261",
+ md5short="09a08691ce5295215",
+ fullIdent="NC_009665.1",
+ nodotIdent="NC_009665",
+ )
# make picklist
- picklist_csv = runtmp.output('pick.csv')
- with open(picklist_csv, 'w', newline='') as csvfp:
+ picklist_csv = runtmp.output("pick.csv")
+ with open(picklist_csv, "w", newline="") as csvfp:
w = csv.DictWriter(csvfp, fieldnames=row.keys())
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:md5short:md5short"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2273,25 +2408,27 @@ def test_sig_extract_8_picklist_md5_short_alias(runtmp):
def test_sig_extract_8_picklist_md5_short_alias_exclude(runtmp):
# exclude 47 based on md5prefix8 alias, md5short
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = utils.get_test_data("47.fa.sig")
+ sig63 = utils.get_test_data("63.fa.sig")
# select on any of these attributes
- row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
- md5full='09a08691ce52952152f0e866a59f6261',
- md5short='09a08691ce5295215',
- fullIdent='NC_009665.1',
- nodotIdent='NC_009665')
+ row = dict(
+ exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+ md5full="09a08691ce52952152f0e866a59f6261",
+ md5short="09a08691ce5295215",
+ fullIdent="NC_009665.1",
+ nodotIdent="NC_009665",
+ )
# make picklist
- picklist_csv = runtmp.output('pick.csv')
- with open(picklist_csv, 'w', newline='') as csvfp:
+ picklist_csv = runtmp.output("pick.csv")
+ with open(picklist_csv, "w", newline="") as csvfp:
w = csv.DictWriter(csvfp, fieldnames=row.keys())
w.writeheader()
w.writerow(row)
picklist_arg = f"{picklist_csv}:md5short:md5short:exclude"
- runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+ runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)
# stdout should be new signature
out = runtmp.last_result.out
@@ -2304,57 +2441,63 @@ def test_sig_extract_8_picklist_md5_short_alias_exclude(runtmp):
def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch(runtmp):
# extract 47 from 47, using a picklist w/full md5 and also md5 selector
- sig47 = utils.get_test_data('47.fa.sig')
- sig63 = utils.get_test_data('63.fa.sig')
+ sig47 = utils.get_test_data("47.fa.sig")
+ sig63 = utils.get_test_data("63.fa.sig")
# select on any of these attributes
- row = 
dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, - '--picklist', picklist_arg, - '--md5', 'XXX') # no match to md5 selector here + runtmp.sourmash( + "sig", "extract", sig47, sig63, "--picklist", picklist_arg, "--md5", "XXX" + ) # no match to md5 selector here err = runtmp.last_result.err assert "no matching signatures to save!" in err -def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclude(runtmp): +def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclude( + runtmp, +): # exclude 47 using a picklist w/full md5 and also md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, - '--picklist', picklist_arg, - '--md5', 'XXX') # no match to md5 selector here + runtmp.sourmash( + "sig", "extract", sig47, sig63, "--picklist", picklist_arg, "--md5", "XXX" + ) # no match to md5 selector here err = runtmp.last_result.err assert "no matching signatures to save!" 
in err @@ -2362,26 +2505,36 @@ def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclud def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp): # extract 47 from 47, using a picklist w/full md5 and also md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg, - '--md5', '09a08691ce5295215') + runtmp.sourmash( + "sig", + "extract", + sig47, + sig63, + "--picklist", + picklist_arg, + "--md5", + "09a08691ce5295215", + ) # stdout should be new signature out = runtmp.last_result.out @@ -2391,54 +2544,64 @@ def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp): assert actual_extract_sig == test_extract_sig + def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_exclude(runtmp): # exclude 47, using a picklist w/full md5; but try to select with md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg, - '--md5', '09a08691ce5295215') + runtmp.sourmash( + "sig", + "extract", + sig47, + sig63, + "--picklist", + picklist_arg, + "--md5", + "09a08691ce5295215", + ) # NTP: do we want to emit a more informative "conflicting selectors" type of msg? err = runtmp.last_result.err print(err) assert "loaded 1 distinct values into picklist." in err assert "loaded 1 total that matched ksize & molecule type" in err - assert 'no matching signatures to save!' in err + assert "no matching signatures to save!" 
in err def test_sig_extract_8_picklist_md5_nomatch(runtmp): # use an empty picklist => no match - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5short']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5short"]) w.writeheader() picklist_arg = f"{picklist_csv}:md5short:md5prefix8" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', - picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2451,19 +2614,18 @@ def test_sig_extract_8_picklist_md5_nomatch(runtmp): def test_sig_extract_8_picklist_md5_nomatch_exclude(runtmp): # use an empty picklist to exclude => no match => include everything - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5short']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5short"]) w.writeheader() picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', - picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be both signatures out = runtmp.last_result.out @@ -2478,91 +2640,94 @@ def test_sig_extract_8_picklist_md5_nomatch_exclude(runtmp): err = runtmp.last_result.err print(err) assert runtmp.last_result.status == 0 - assert 'loaded 0 distinct values into picklist.' in err - assert 'loaded 2 total that matched ksize & molecule type' in err - assert 'extracted 2 signatures from 2 file(s)' in err - assert 'for given picklist, found 2 matches by excluding 0 distinct values' in err + assert "loaded 0 distinct values into picklist." 
in err + assert "loaded 2 total that matched ksize & molecule type" in err + assert "extracted 2 signatures from 2 file(s)" in err + assert "for given picklist, found 2 matches by excluding 0 distinct values" in err def test_sig_extract_9_picklist_md5_ksize_hp_select(runtmp): # test with -k and moltype selector - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) print(actual_extract_sig.md5sum) - assert str(actual_extract_sig) == 'GCA_001593925' - assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672' + assert str(actual_extract_sig) == "GCA_001593925" + assert actual_extract_sig.md5sum() == "ea2a1ad233c2908529d124a330bcb672" assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' + assert actual_extract_sig.minhash.moltype == "hp" def test_sig_extract_9_picklist_md5_ksize_hp_select_exclude(runtmp): # test picklist exclude with -k and moltype selector - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5:exclude" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) print(actual_extract_sig.md5sum) - assert str(actual_extract_sig) == 'GCA_001593935' - assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12' + assert str(actual_extract_sig) == "GCA_001593935" + assert actual_extract_sig.md5sum() == "bb0e6d90df01b7bd5d0956a5f9e3ed12" assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' + assert actual_extract_sig.minhash.moltype == "hp" def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp): # test empty picklist values, and duplicate picklist values - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, 
fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="")) picklist_arg = f"{picklist_csv}:md5:md5" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' - assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672' + assert actual_extract_sig.minhash.moltype == "hp" + assert actual_extract_sig.md5sum() == "ea2a1ad233c2908529d124a330bcb672" err = runtmp.last_result.err print(err) @@ -2573,29 +2738,30 @@ def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp): def test_sig_extract_10_picklist_md5_dups_and_empty_exclude(runtmp): # test empty picklist values, and duplicate picklist values for exclude - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="")) picklist_arg = f"{picklist_csv}:md5:md5:exclude" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' - assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12' + assert actual_extract_sig.minhash.moltype == "hp" + assert actual_extract_sig.md5sum() == "bb0e6d90df01b7bd5d0956a5f9e3ed12" err = runtmp.last_result.err print(err) @@ -2606,20 +2772,21 @@ def test_sig_extract_10_picklist_md5_dups_and_empty_exclude(runtmp): def test_sig_extract_11_picklist_bad_coltype(runtmp): # test with invalid picklist coltype - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", 
sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2628,20 +2795,21 @@ def test_sig_extract_11_picklist_bad_coltype(runtmp): def test_sig_extract_11_picklist_bad_coltype_exclude(runtmp): # test with invalid picklist coltype - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2650,20 +2818,21 @@ def test_sig_extract_11_picklist_bad_coltype_exclude(runtmp): def test_sig_extract_12_picklist_bad_argstr(runtmp): # test with invalid argument format to --picklist - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2672,42 +2841,47 @@ def test_sig_extract_12_picklist_bad_argstr(runtmp): def test_sig_extract_12_picklist_bad_pickstyle(runtmp): # test with invalid argument format to --picklist - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5:XXX" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) - assert "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" in err + assert ( + "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" + in err + ) def test_sig_extract_12_picklist_bad_colname(runtmp): # test with invalid picklist colname - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - 
picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:BADCOLNAME:md5" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2716,20 +2890,21 @@ def test_sig_extract_12_picklist_bad_colname(runtmp): def test_sig_extract_12_picklist_bad_colname_exclude(runtmp): # test with invalid picklist colname - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:BADCOLNAME:md5:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2738,45 +2913,47 @@ def test_sig_extract_12_picklist_bad_colname_exclude(runtmp): def test_sig_extract_11_pattern_include(runtmp): # test --include-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'extract', '--include', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash( + "sig", "extract", "--include", "shewanella", *sigfiles, "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 2 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' in n.lower(), n + assert "shewanella" in n.lower(), n def test_sig_extract_11_pattern_exclude(runtmp): # test --exclude-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'extract', '--exclude', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash( + "sig", "extract", "--exclude", "shewanella", *sigfiles, "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 18 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' not in n.lower(), n + assert "shewanella" not in n.lower(), n def test_sig_extract_identical_md5s(runtmp): # test that we properly handle different signatures with identical md5s - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = load_signatures(sig47) sig = list(ss)[0] new_sig = 
sig.to_mutable() - new_sig.name = 'foo' - sig47foo = runtmp.output('foo.sig') + new_sig.name = "foo" + sig47foo = runtmp.output("foo.sig") # this was only a problem when the signatures are stored in the same file - with open(sig47foo, 'wt') as fp: + with open(sig47foo, "w") as fp: sourmash.save_signatures([new_sig, sig], fp) - runtmp.run_sourmash('sig', 'extract', '--name', 'foo', sig47foo) + runtmp.run_sourmash("sig", "extract", "--name", "foo", sig47foo) out = runtmp.last_result.out print(out) @@ -2784,18 +2961,18 @@ def test_sig_extract_identical_md5s(runtmp): ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' not in ss.name - assert 'foo' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" not in ss.name + assert "foo" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_sig_flatten_1(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig47abund, '--name', 'Shewanella') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig47abund, "--name", "Shewanella") # stdout should be new signature out = c.last_result.out @@ -2813,14 +2990,20 @@ def test_sig_flatten_1_from_file(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") - from_file = _write_file(runtmp, 'list.txt', [sig47abund]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + from_file = _write_file(runtmp, "list.txt", [sig47abund]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'flatten', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + c.run_sourmash( + "sig", + "flatten", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -2837,10 +3020,10 @@ def test_sig_flatten_1_from_file(runtmp): @utils.in_tempdir def test_sig_flatten_1_select_name(c): # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--name', 'Shewanella') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig2, sig47abund, "--name", "Shewanella") # stdout should be new signature out = c.last_result.out @@ -2858,10 +3041,10 @@ def test_sig_flatten_1_select_md5(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--md5', '09a08691c') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig2, 
sig47abund, "--md5", "09a08691c") # stdout should be new signature out = c.last_result.out @@ -2878,8 +3061,8 @@ def test_sig_flatten_1_select_md5(runtmp): def test_sig_flatten_2_ksize(runtmp): c = runtmp # flatten only one signature selected using ksize - psw_mag = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - c.run_sourmash('sig', 'flatten', psw_mag, '-k', '31') + psw_mag = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + c.run_sourmash("sig", "flatten", psw_mag, "-k", "31") # stdout should be new signature out = c.last_result.out @@ -2893,8 +3076,8 @@ def test_sig_flatten_2_ksize(runtmp): @utils.in_tempdir def test_sig_downsample_1_scaled(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "downsample", "--scaled", "10000", sig47) # stdout should be new signature out = c.last_result.out @@ -2910,8 +3093,8 @@ def test_sig_downsample_1_scaled(c): @utils.in_tempdir def test_sig_downsample_1_scaled_downsample_multisig(c): # downsample many scaled signatures in one file - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', multisig) + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "downsample", "--scaled", "10000", multisig) # stdout should be new signatures out = c.last_result.out @@ -2923,8 +3106,8 @@ def test_sig_downsample_1_scaled_downsample_multisig(c): @utils.in_tempdir def test_sig_downsample_1_scaled_to_num(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'downsample', '--num', '500', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "downsample", "--num", "500", sig47) # stdout should be new signature out = c.last_result.out @@ -2938,70 +3121,72 @@ def test_sig_downsample_1_scaled_to_num(c): test_mins = test_downsample_sig.minhash.hashes.keys() test_mins = list(test_mins) test_mins.sort() - test_mins = test_mins[:500] # take 500 smallest + test_mins = test_mins[:500] # take 500 smallest assert actual_mins == test_mins def test_sig_downsample_check_num_bounds_negative(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '-5', sig47) + c.run_sourmash("sig", "downsample", "--num", "-5", sig47) assert "ERROR: num value must be positive" in c.last_result.err def test_sig_downsample_check_num_bounds_less_than_minimum(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") - c.run_sourmash('sig', 'downsample', '--num', '25', sig47) + c.run_sourmash("sig", "downsample", "--num", "25", sig47) assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err def test_sig_downsample_check_num_bounds_more_than_maximum(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '100000', sig47) + c.run_sourmash("sig", "downsample", "--num", "100000", sig47) - assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
in c.last_result.err + ) @utils.in_tempdir def test_sig_downsample_1_scaled_to_num_fail(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '50000', sig47) + c.run_sourmash("sig", "downsample", "--num", "50000", sig47) @utils.in_tempdir def test_sig_downsample_1_scaled_empty(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', sig47) + c.run_sourmash("sig", "downsample", sig47) @utils.in_tempdir def test_sig_downsample_2_num(c): # downsample a num signature - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'downsample', '--num', '500', - '-k', '21', '--dna', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash("sig", "downsample", "--num", "500", "-k", "21", "--dna", sigs11) # stdout should be new signature out = c.last_result.out - test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, - select_moltype='DNA') + test_downsample_sig = sourmash.load_one_signature( + sigs11, ksize=21, select_moltype="DNA" + ) actual_downsample_sig = sourmash.load_one_signature(out) test_mh = test_downsample_sig.minhash.downsample(num=500) @@ -3011,15 +3196,17 @@ def test_sig_downsample_2_num(c): @utils.in_tempdir def test_sig_downsample_2_num_to_scaled(c): # downsample a num signature and convert it into a scaled sig - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', - '-k', '21', '--dna', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash( + "sig", "downsample", "--scaled", "10000", "-k", "21", "--dna", sigs11 + ) # stdout should be new signature out = c.last_result.out - test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, - select_moltype='DNA') + test_downsample_sig = sourmash.load_one_signature( + sigs11, ksize=21, select_moltype="DNA" + ) actual_downsample_sig = sourmash.load_one_signature(out) test_mins = test_downsample_sig.minhash.hashes.keys() @@ -3027,7 +3214,7 @@ def test_sig_downsample_2_num_to_scaled(c): # select those mins that are beneath the new max hash... 
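     # (illustrative aside, not part of the original test: FracMinHash with
     # scaled=s keeps only hashes below a cutoff of roughly 2**64 / s, so a
     # num sketch converts to a scaled sketch by dropping everything at or
     # above that cutoff. A minimal sketch of the relationship, assuming the
     # usual 64-bit hash space:
     #
     #     def scaled_to_max_hash(scaled):
     #         return round(2**64 / scaled)  # hashes < this value are kept
     # )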
max_hash = actual_downsample_sig.minhash._max_hash - test_mins_down = { k for k in test_mins if k < max_hash } + test_mins_down = {k for k in test_mins if k < max_hash} assert test_mins_down == set(actual_mins) @@ -3035,38 +3222,49 @@ def test_sig_downsample_2_num_to_scaled(c): def test_sig_downsample_2_num_to_scaled_fail(c): # downsample a num signature and FAIL to convert it into a scaled sig # because new scaled is too low - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--scaled', '100', - '-k', '21', '--dna', sigs11) + c.run_sourmash( + "sig", "downsample", "--scaled", "100", "-k", "21", "--dna", sigs11 + ) @utils.in_tempdir def test_sig_downsample_2_num_and_scaled_both_fail(c): # cannot specify both --num and --scaled - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--scaled', '100', '--num', '50', - '-k', '21', '--dna', sigs11) + c.run_sourmash( + "sig", + "downsample", + "--scaled", + "100", + "--num", + "50", + "-k", + "21", + "--dna", + sigs11, + ) @utils.in_tempdir def test_sig_downsample_2_num_empty(c): # downsample a num signature - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '-k', '21', '--dna', sigs11) + c.run_sourmash("sig", "downsample", "-k", "21", "--dna", sigs11) def test_sig_describe_1(runtmp): c = runtmp # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'describe', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "describe", sig47) out = c.last_result.out print(c.last_result) @@ -3087,12 +3285,18 @@ def test_sig_describe_1_fromfile_picklist(runtmp): c = runtmp # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') - from_file = _write_file(runtmp, 'list.txt', [sig47]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) - - c.run_sourmash('sig', 'describe', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + from_file = _write_file(runtmp, "list.txt", [sig47]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) + + c.run_sourmash( + "sig", + "describe", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) out = c.last_result.out print(c.last_result) @@ -3112,41 +3316,55 @@ def test_sig_describe_1_fromfile_picklist(runtmp): @utils.in_thisdir def test_sig_describe_protein(c): # test describe on a singleton protein signature - testdata = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', testdata) + testdata = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=protein num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=protein num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_thisdir def test_sig_describe_hp(c): # test describe on a singleton hp signature - testdata = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', 
testdata) + testdata = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=hp num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=hp num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_thisdir def test_sig_describe_dayhoff(c): # test describe on a singleton dayhoff signature - testdata = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', testdata) + testdata = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_tempdir def test_sig_describe_1_hp(c): # get basic info on a signature - testdata = utils.get_test_data('short.fa') - c.run_sourmash('compute', '-k', '21,30', - '--dayhoff', '--hp', '--protein', - '--dna', - testdata) + testdata = utils.get_test_data("short.fa") + c.run_sourmash( + "compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", "--dna", testdata + ) # stdout should be new signature - computed_sig = os.path.join(c.location, 'short.fa.sig') - c.run_sourmash('sig', 'describe', computed_sig) + computed_sig = os.path.join(c.location, "short.fa.sig") + c.run_sourmash("sig", "describe", computed_sig) out = c.last_result.out print(c.last_result.out) @@ -3237,16 +3455,15 @@ def test_sig_describe_1_hp(c): """.splitlines() for line in out.splitlines(): - cleaned_line = line.strip().replace( - testdata_dirname, '').replace(location, '') + cleaned_line = line.strip().replace(testdata_dirname, "").replace(location, "") assert cleaned_line in expected_output, cleaned_line @utils.in_tempdir def test_sig_describe_1_multisig(c): # get basic info on multiple signatures in a single file - sigs = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3265,8 +3482,8 @@ def test_sig_describe_1_multisig(c): @utils.in_tempdir def test_sig_describe_1_sbt(c): # get basic info on multiple signatures in an SBT - sigs = utils.get_test_data('prot/protein.sbt.zip') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein.sbt.zip") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3282,8 +3499,8 @@ def test_sig_describe_1_sbt(c): @utils.in_tempdir def test_sig_describe_1_lca(c): # get basic info on multiple signatures in an LCA database - sigs = utils.get_test_data('prot/protein.lca.json.gz') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein.lca.json.gz") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3299,8 +3516,8 @@ def test_sig_describe_1_lca(c): @utils.in_tempdir def test_sig_describe_1_dir(c): # get basic info on multiple signatures in a directory - sigs = utils.get_test_data('prot/protein/') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein/") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3320,8 +3537,8 @@ def test_sig_describe_1_dir(c): @utils.in_tempdir def 
test_sig_describe_1_zipfile(c): # get basic info on multiple signatures in a zipfile - sigs = utils.get_test_data('prot/all.zip') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/all.zip") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3342,8 +3559,8 @@ def test_sig_describe_1_sig_abund(runtmp): # check output of sig describe on a sketch with abundances c = runtmp - sigfile = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'describe', sigfile) + sigfile = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "describe", sigfile) out = c.last_result.out print(c.last_result.out) @@ -3363,18 +3580,22 @@ def test_sig_describe_1_sig_abund(runtmp): @utils.in_thisdir def test_sig_describe_stdin(c): - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - with open(sig, 'rt') as fp: + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + with open(sig) as fp: data = fp.read() - c.run_sourmash('sig', 'describe', '-', stdin_data=data) + c.run_sourmash("sig", "describe", "-", stdin_data=data) - assert 'signature: GCA_001593925' in c.last_result.out + assert "signature: GCA_001593925" in c.last_result.out @utils.in_tempdir def test_sig_describe_empty(c): - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) ss = sourmash.load_file_as_signatures(sig) ss = list(ss) @@ -3382,34 +3603,34 @@ def test_sig_describe_empty(c): ss = ss[0] ss = ss.to_mutable() - ss.name = '' - ss.filename = '' + ss.name = "" + ss.filename = "" - outsig = c.output('xxx.sig') - with open(outsig, 'wt') as fp: + outsig = c.output("xxx.sig") + with open(outsig, "w") as fp: sourmash.save_signatures([ss], fp) ss = sourmash.load_file_as_signatures(outsig) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.name == '' - assert ss.filename == '' + assert ss.name == "" + assert ss.filename == "" - c.run_sourmash('sig', 'describe', outsig) + c.run_sourmash("sig", "describe", outsig) print(c.last_result.out) - assert 'signature: ** no name **' in c.last_result.out - assert 'source file: ** no name **' in c.last_result.out + assert "signature: ** no name **" in c.last_result.out + assert "source file: ** no name **" in c.last_result.out def test_sig_describe_sqldb(runtmp): # make a sqldb and run fileinfo on it - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) - runtmp.sourmash('sig', 'describe', sqldb) + runtmp.sourmash("sig", "describe", sqldb) err = runtmp.last_result.err print(err) @@ -3417,28 +3638,30 @@ def test_sig_describe_sqldb(runtmp): out = runtmp.last_result.out print(out) - assert 'md5: 4289d4241be8573145282352215ca3c4' in out - assert 'md5: 85c3aeec6457c0b1d210472ddeb67714' in out + assert "md5: 4289d4241be8573145282352215ca3c4" in out + assert "md5: 85c3aeec6457c0b1d210472ddeb67714" in out def test_sig_describe_2_csv(runtmp): # output info in CSV spreadsheet c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'describe', sig47, sig63, '--csv', 'out.csv') + sig47 
= utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "describe", sig47, sig63, "--csv", "out.csv") - expected_md5 = ['09a08691ce52952152f0e866a59f6261', - '38729c6374925585db28916b82a6f513'] + expected_md5 = [ + "09a08691ce52952152f0e866a59f6261", + "38729c6374925585db28916b82a6f513", + ] - with open(c.output('out.csv'), 'rt') as fp: + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) n = 0 for row, md5 in zip(r, expected_md5): - assert row['md5'] == md5 + assert row["md5"] == md5 n += 1 assert n == 2 @@ -3448,20 +3671,22 @@ def test_sig_describe_2_csv_gz(runtmp): # output info in CSV spreadsheet, gzipped c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'describe', sig47, sig63, '--csv', 'out.csv.gz') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "describe", sig47, sig63, "--csv", "out.csv.gz") - expected_md5 = ['09a08691ce52952152f0e866a59f6261', - '38729c6374925585db28916b82a6f513'] + expected_md5 = [ + "09a08691ce52952152f0e866a59f6261", + "38729c6374925585db28916b82a6f513", + ] - with gzip.open(c.output('out.csv.gz'), 'rt', newline="") as fp: + with gzip.open(c.output("out.csv.gz"), "rt", newline="") as fp: r = csv.DictReader(fp) n = 0 for row, md5 in zip(r, expected_md5): - assert row['md5'] == md5 + assert row["md5"] == md5 n += 1 assert n == 2 @@ -3471,31 +3696,29 @@ def test_sig_describe_2_csv_abund(runtmp): # output info in CSV spreadsheet, for abund sig c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'describe', sig47, '--csv', 'out.csv') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "describe", sig47, "--csv", "out.csv") - with open(c.output('out.csv'), 'rt') as fp: + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) - n = 0 - rows = list(r) assert len(rows) == 1 row = rows[0] - assert row['signature_file'] == sig47 - assert row['md5'] == "09a08691ce52952152f0e866a59f6261" - assert row['ksize'] == "31" - assert row['moltype'] == "DNA" - assert row['num'] == "0" - assert row['scaled'] == "1000" - assert row['n_hashes'] == "5177" - assert row['seed'] == "42" - assert row['with_abundance'] == "1" - assert row['name'] == "NC_009665.1 Shewanella baltica OS185, complete genome" - assert row['filename'] == "podar-ref/47.fa" - assert row['license'] == "CC0" - assert row['sum_hashes'] == "5292" + assert row["signature_file"] == sig47 + assert row["md5"] == "09a08691ce52952152f0e866a59f6261" + assert row["ksize"] == "31" + assert row["moltype"] == "DNA" + assert row["num"] == "0" + assert row["scaled"] == "1000" + assert row["n_hashes"] == "5177" + assert row["seed"] == "42" + assert row["with_abundance"] == "1" + assert row["name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + assert row["filename"] == "podar-ref/47.fa" + assert row["license"] == "CC0" + assert row["sum_hashes"] == "5292" def test_sig_describe_2_csv_as_picklist(runtmp): @@ -3503,14 +3726,12 @@ def test_sig_describe_2_csv_as_picklist(runtmp): # pickfile c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - outcsv = runtmp.output('out.csv') + sig47 = utils.get_test_data("47.fa.sig") + outcsv = runtmp.output("out.csv") - c.run_sourmash('sig', 'describe', sig47, - '--csv', outcsv) + c.run_sourmash("sig", "describe", sig47, "--csv", outcsv) - c.run_sourmash('sig', 'describe', sig47, - '--picklist', f'{outcsv}::manifest') + 
c.run_sourmash("sig", "describe", sig47, "--picklist", f"{outcsv}::manifest") out = c.last_result.out print(c.last_result) @@ -3531,10 +3752,9 @@ def test_sig_describe_2_include_db_pattern(runtmp): # test sig describe --include-db-pattern c = runtmp - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - c.run_sourmash('sig', 'describe', allzip, - '--include-db-pattern', 'os185') + c.run_sourmash("sig", "describe", allzip, "--include-db-pattern", "os185") out = c.last_result.out print(c.last_result) @@ -3555,10 +3775,11 @@ def test_sig_describe_2_exclude_db_pattern(runtmp): # test sig describe --exclude-db-pattern c = runtmp - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - c.run_sourmash('sig', 'describe', allzip, '--dna', '-k', '31', - '--exclude-db-pattern', 'os223') + c.run_sourmash( + "sig", "describe", allzip, "--dna", "-k", "31", "--exclude-db-pattern", "os223" + ) out = c.last_result.out print(c.last_result) @@ -3577,13 +3798,13 @@ def test_sig_describe_2_exclude_db_pattern(runtmp): def test_sig_describe_3_manifest_works(runtmp): # test on a manifest with relative paths, in proper location - mf = utils.get_test_data('scaled/mf.csv') - runtmp.sourmash('sig', 'describe', mf, '--csv', 'out.csv') + mf = utils.get_test_data("scaled/mf.csv") + runtmp.sourmash("sig", "describe", mf, "--csv", "out.csv") out = runtmp.last_result.out print(out) - with open(runtmp.output('out.csv'), newline='') as fp: + with open(runtmp.output("out.csv"), newline="") as fp: r = csv.reader(fp) rows = list(r) assert len(rows) == 16 # 15 signatures, plus head @@ -3593,41 +3814,41 @@ def test_sig_describe_3_manifest_fails_when_moved(runtmp): # test on a manifest with relative paths, when in wrong place; # should fail, because actual signatures cannot be loaded now. # note: this tests lazy loading. - mf = utils.get_test_data('scaled/mf.csv') - shutil.copyfile(mf, runtmp.output('mf.csv')) + mf = utils.get_test_data("scaled/mf.csv") + shutil.copyfile(mf, runtmp.output("mf.csv")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'describe', 'mf.csv') + runtmp.sourmash("sig", "describe", "mf.csv") + - @utils.in_tempdir def test_sig_overlap(c): # get overlap details - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'overlap', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "overlap", sig47, sig63) out = c.last_result.out print(out) # md5s - assert '09a08691ce52952152f0e866a59f6261' in out - assert '38729c6374925585db28916b82a6f513' in out + assert "09a08691ce52952152f0e866a59f6261" in out + assert "38729c6374925585db28916b82a6f513" in out - assert 'similarity: 0.32069' in out - assert 'number of hashes in common: 2529' in out + assert "similarity: 0.32069" in out + assert "number of hashes in common: 2529" in out @utils.in_tempdir def test_import_export_1(c): # check to make sure we can import what we've exported! 
- inp = utils.get_test_data('genome-s11.fa.gz.sig') - outp = c.output('export.json') + inp = utils.get_test_data("genome-s11.fa.gz.sig") + outp = c.output("export.json") - c.run_sourmash('sig', 'export', inp, '-o', outp, '-k', '21', '--dna') - c.run_sourmash('sig', 'import', outp) + c.run_sourmash("sig", "export", inp, "-o", outp, "-k", "21", "--dna") + c.run_sourmash("sig", "import", outp) - original = sourmash.load_one_signature(inp, ksize=21, select_moltype='DNA') + original = sourmash.load_one_signature(inp, ksize=21, select_moltype="DNA") roundtrip = sourmash.load_one_signature(c.last_result.out) assert original.minhash == roundtrip.minhash @@ -3636,13 +3857,13 @@ def test_import_export_1(c): @utils.in_tempdir def test_import_export_1_by_md5(c): # check to make sure we can import what we've exported! - inp = utils.get_test_data('genome-s11.fa.gz.sig') - outp = c.output('export.json') + inp = utils.get_test_data("genome-s11.fa.gz.sig") + outp = c.output("export.json") - c.run_sourmash('sig', 'export', inp, '-o', outp, '--md5', '1437d8eae6') - c.run_sourmash('sig', 'import', outp) + c.run_sourmash("sig", "export", inp, "-o", outp, "--md5", "1437d8eae6") + c.run_sourmash("sig", "import", outp) - original = sourmash.load_one_signature(inp, ksize=21, select_moltype='DNA') + original = sourmash.load_one_signature(inp, ksize=21, select_moltype="DNA") roundtrip = sourmash.load_one_signature(c.last_result.out) assert original.minhash == roundtrip.minhash @@ -3655,271 +3876,259 @@ def test_import_export_2(c): # mash sketch -s 500 -k 21 ./tests/test-data/genome-s11.fa.gz # mash info -d ./tests/test-data/genome-s11.fa.gz.msh > tests/test-data/genome-s11.fa.gz.msh.json_dump # - sig1 = utils.get_test_data('genome-s11.fa.gz.sig') - msh_sig = utils.get_test_data('genome-s11.fa.gz.msh.json_dump') + sig1 = utils.get_test_data("genome-s11.fa.gz.sig") + msh_sig = utils.get_test_data("genome-s11.fa.gz.msh.json_dump") - c.run_sourmash('sig', 'import', msh_sig) + c.run_sourmash("sig", "import", msh_sig) imported = sourmash.load_one_signature(c.last_result.out) - compare = sourmash.load_one_signature(sig1, ksize=21, select_moltype='DNA') + compare = sourmash.load_one_signature(sig1, ksize=21, select_moltype="DNA") assert imported.minhash == compare.minhash def test_import_mash_csv_to_sig(runtmp): # test copied over from 'sourmash import_csv'. 
-    testdata1 = utils.get_test_data('short.fa.msh.dump')
-    testdata2 = utils.get_test_data('short.fa')
+    testdata1 = utils.get_test_data("short.fa.msh.dump")
+    testdata2 = utils.get_test_data("short.fa")
 
-    runtmp.sourmash('sig', 'import', '--csv', testdata1, '-o', 'xxx.sig')
+    runtmp.sourmash("sig", "import", "--csv", testdata1, "-o", "xxx.sig")
 
-    runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=970', testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,num=970", testdata2)
 
-    runtmp.sourmash('search', '-k', '31', 'short.fa.sig', 'xxx.sig')
+    runtmp.sourmash("search", "-k", "31", "short.fa.sig", "xxx.sig")
 
     print("RUNTEMP", runtmp)
 
-    assert '1 matches' in runtmp.last_result.out
-    assert '100.0% short.fa' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "100.0% short.fa" in runtmp.last_result.out
 
 
 def test_sig_manifest_1_zipfile(runtmp):
     # make a manifest from a .zip file
-    protzip = utils.get_test_data('prot/protein.zip')
-    runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv')
+    protzip = utils.get_test_data("prot/protein.zip")
+    runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv")
 
-    manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
-    with open(manifest_fn, newline='') as csvfp:
+    manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv")
+    with open(manifest_fn, newline="") as csvfp:
         manifest = CollectionManifest.load_from_csv(csvfp)
 
     assert len(manifest) == 2
-    md5_list = [ row['md5'] for row in manifest.rows ]
-    assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
-    assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+    md5_list = [row["md5"] for row in manifest.rows]
+    assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list
+    assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list
 
 
 def test_sig_manifest_1_zipfile_csv_gz(runtmp):
     # make a gzipped manifest from a .zip file
-    protzip = utils.get_test_data('prot/protein.zip')
-    runtmp.sourmash('sig', 'manifest', protzip,
-                    '-o', 'SOURMASH-MANIFEST.csv.gz')
+    protzip = utils.get_test_data("prot/protein.zip")
+    runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv.gz")
 
-    manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv.gz')
-    with gzip.open(manifest_fn, "rt", newline='') as csvfp:
+    manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv.gz")
+    with gzip.open(manifest_fn, "rt", newline="") as csvfp:
         manifest = CollectionManifest.load_from_csv(csvfp)
 
     assert len(manifest) == 2
-    md5_list = [ row['md5'] for row in manifest.rows ]
-    assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
-    assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+    md5_list = [row["md5"] for row in manifest.rows]
+    assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list
+    assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list
 
 
 def test_sig_manifest_1_zipfile_already_exists(runtmp):
     # make a manifest from a .zip file; fail, because the output file
     # already exists
-    protzip = utils.get_test_data('prot/protein.zip')
+    protzip = utils.get_test_data("prot/protein.zip")
 
-    mf_csv = runtmp.output('mf.csv')
+    mf_csv = runtmp.output("mf.csv")
     with open(mf_csv, "w") as fp:
         fp.write("hello, world")
 
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv')
+        runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.csv")
 
 
 def test_sig_manifest_1_zipfile_already_exists_force(runtmp):
     # make a manifest from a .zip file
-    protzip = utils.get_test_data('prot/protein.zip')
+    protzip = utils.get_test_data("prot/protein.zip")
 
-    mf_csv = runtmp.output('mf.csv')
+    mf_csv = runtmp.output("mf.csv")
     with
open(mf_csv, "w") as fp: fp.write("hello, world") - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv', '-f') + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.csv", "-f") - with open(mf_csv, newline='') as csvfp: + with open(mf_csv, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_1_zipfile_already_exists_sql(runtmp): # make a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - mf_csv = runtmp.output('mf.mfsql') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql', - '-f') + mf_csv = runtmp.output("mf.mfsql") + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.mfsql", "-F", "sql") + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.mfsql", "-F", "sql", "-f") manifest = CollectionManifest.load_from_filename(mf_csv) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_2_sigfile(runtmp): # make a manifest from a .sig file - sigfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sigfile = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) - runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv') + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err - - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 1 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list def test_sig_manifest_3_sbt(runtmp): # make a manifest from an SBT - protzip = utils.get_test_data('prot/protein.sbt.zip') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv') + protzip = utils.get_test_data("prot/protein.sbt.zip") + runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert 
"16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_4_lca(runtmp): # make a manifest from a .lca.json file - sigfile = utils.get_test_data('prot/protein.lca.json.gz') - runtmp.sourmash('sig', 'manifest', sigfile, '-o', - 'SOURMASH-MANIFEST.csv') + sigfile = utils.get_test_data("prot/protein.lca.json.gz") + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_5_dir(runtmp): # make a manifest from a directory - sigfile = utils.get_test_data('prot/protein/') - runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv') - - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err + sigfile = utils.get_test_data("prot/protein/") + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_6_pathlist(runtmp): # make a manifest from a pathlist file - sigfiles = utils.get_test_data('prot/protein/*.sig') + sigfiles = utils.get_test_data("prot/protein/*.sig") sigfiles = glob.glob(sigfiles) - pathlist = runtmp.output('pathlist.txt') - with open(pathlist, 'wt') as fp: + pathlist = runtmp.output("pathlist.txt") + with open(pathlist, "w") as fp: fp.write("\n".join(sigfiles)) - runtmp.sourmash('sig', 'manifest', pathlist, '-o', 'SOURMASH-MANIFEST.csv') + runtmp.sourmash("sig", "manifest", pathlist, "-o", "SOURMASH-MANIFEST.csv") - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err - - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # note: the manifest output for pathlists will contain the locations # used in the pathlist. This is required by StandaloneManifestIndex. 
    for row in manifest.rows:
-        iloc = row['internal_location']
+        iloc = row["internal_location"]
         print(iloc)
-        assert iloc.startswith('/'), iloc
+        assert iloc.startswith("/"), iloc
 
 
 def test_sig_manifest_does_not_exist(runtmp):
     with pytest.raises(SourmashCommandFailed):
-        runtmp.run_sourmash('sig', 'manifest', 'does-not-exist',
-                            '-o', 'out.csv')
+        runtmp.run_sourmash("sig", "manifest", "does-not-exist", "-o", "out.csv")
 
-    assert "Cannot open 'does-not-exist' as a sourmash signature collection." in runtmp.last_result.err
+    assert (
+        "Cannot open 'does-not-exist' as a sourmash signature collection."
+        in runtmp.last_result.err
+    )
 
 
 def test_sig_manifest_7_allzip_1(runtmp):
     # the rebuilt manifest w/o '-f' will miss dna-sig.noext
-    allzip = utils.get_test_data('prot/all.zip')
-    runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv')
+    allzip = utils.get_test_data("prot/all.zip")
+    runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv")
 
-    manifest_fn = runtmp.output('xyz.csv')
-    with open(manifest_fn, newline='') as csvfp:
+    manifest_fn = runtmp.output("xyz.csv")
+    with open(manifest_fn, newline="") as csvfp:
         manifest = CollectionManifest.load_from_csv(csvfp)
 
     assert len(manifest) == 7
-    filenames = set( row['internal_location'] for row in manifest.rows )
-    assert 'dna-sig.noext' not in filenames
+    filenames = set(row["internal_location"] for row in manifest.rows)
+    assert "dna-sig.noext" not in filenames
 
 
 def test_sig_manifest_7_allzip_2(runtmp):
     # the rebuilt manifest w/ '-f' will contain dna-sig.noext
-    allzip = utils.get_test_data('prot/all.zip')
-    runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv', '-f')
+    allzip = utils.get_test_data("prot/all.zip")
+    runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv", "-f")
 
-    manifest_fn = runtmp.output('xyz.csv')
-    with open(manifest_fn, newline='') as csvfp:
+    manifest_fn = runtmp.output("xyz.csv")
+    with open(manifest_fn, newline="") as csvfp:
         manifest = CollectionManifest.load_from_csv(csvfp)
 
     assert len(manifest) == 8
-    filenames = set( row['internal_location'] for row in manifest.rows )
-    assert 'dna-sig.noext' in filenames
+    filenames = set(row["internal_location"] for row in manifest.rows)
+    assert "dna-sig.noext" in filenames
 
 
 def test_sig_manifest_7_allzip_3(runtmp):
     # the existing manifest contains 'dna-sig.noext' whether or not -f is
     # used.
-    allzip = utils.get_test_data('prot/all.zip')
-    runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv',
-                    '--no-rebuild')
+    allzip = utils.get_test_data("prot/all.zip")
+    runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv", "--no-rebuild")
 
-    manifest_fn = runtmp.output('xyz.csv')
-    with open(manifest_fn, newline='') as csvfp:
+    manifest_fn = runtmp.output("xyz.csv")
+    with open(manifest_fn, newline="") as csvfp:
         manifest = CollectionManifest.load_from_csv(csvfp)
 
     assert len(manifest) == 8
-    filenames = set( row['internal_location'] for row in manifest.rows )
-    assert 'dna-sig.noext' in filenames
+    filenames = set(row["internal_location"] for row in manifest.rows)
+    assert "dna-sig.noext" in filenames
 
 
 def test_sig_manifest_8_sqldb(runtmp):
     # make a sqldb and then run sig manifest on it.
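     # (aside, hedged: a .sqldb produced by `sig cat -o some.sqldb` is a
     # SQLite-backed collection that carries its own manifest, which is why
     # `sig manifest --no-rebuild` below can read it without re-scanning
     # the stored signatures.)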
- gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) # need to use '--no-rebuild-manifest' with 'sig manifest' on sqldb, # because it has a manifest but not the _signatures_with_internal @@ -3927,11 +4136,10 @@ def test_sig_manifest_8_sqldb(runtmp): # so, this should fail... with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", sqldb, "-o", "mf.csv") # ...and this should succeed: - runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv', - '--no-rebuild') + runtmp.sourmash("sig", "manifest", sqldb, "-o", "mf.csv", "--no-rebuild") err = runtmp.last_result.err print(err) @@ -3939,23 +4147,22 @@ def test_sig_manifest_8_sqldb(runtmp): out = runtmp.last_result.out print(out) - assert 'manifest contains 12 signatures total.' in err + assert "manifest contains 12 signatures total." in err assert "wrote manifest to 'mf.csv'" in err - mf = CollectionManifest.load_from_filename(runtmp.output('mf.csv')) + mf = CollectionManifest.load_from_filename(runtmp.output("mf.csv")) assert len(mf) == 12 def test_sig_manifest_8_sqldb_out(runtmp): # make a zip and run manifest out on it to make a sql format manifest. - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - zipfile = runtmp.output('some.zip') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + zipfile = runtmp.output("some.zip") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', zipfile) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", zipfile) # ...and this should succeed: - runtmp.sourmash('sig', 'manifest', zipfile, '-o', 'mf.sqldb', - '-F', 'sql') + runtmp.sourmash("sig", "manifest", zipfile, "-o", "mf.sqldb", "-F", "sql") err = runtmp.last_result.err print(err) @@ -3963,38 +4170,46 @@ def test_sig_manifest_8_sqldb_out(runtmp): out = runtmp.last_result.out print(out) - assert 'manifest contains 12 signatures total.' in err + assert "manifest contains 12 signatures total." 
in err assert "wrote manifest to 'mf.sqldb'" in err - mf = CollectionManifest.load_from_filename(runtmp.output('mf.sqldb')) + mf = CollectionManifest.load_from_filename(runtmp.output("mf.sqldb")) assert len(mf) == 12 def test_sig_kmers_1_dna(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "DNA" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 970' in err - assert 'found 970 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 970" in err + assert "found 970 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4005,8 +4220,8 @@ def test_sig_kmers_1_dna(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 970 @@ -4014,58 +4229,56 @@ def test_sig_kmers_1_dna(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_more_in_query(runtmp): # test sig kmers on dna, where query has more than matches - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" # make a new sequence for query, with more k-mers - query_seqfile = runtmp.output('query.fa') - with open(query_seqfile, 'wt') as fp: + query_seqfile = runtmp.output("query.fa") + with open(query_seqfile, "w") as fp: with screed.open(seqfile) as screed_iter: for record in screed_iter: fp.write(f">{record.name}\n{record.sequence}AGTTACGATC\n") - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', query_seqfile) + runtmp.sourmash("sig", "kmers", "--sig", "short.fa.sig", "--seq", query_seqfile) out = runtmp.last_result.out print(out) err = 
runtmp.last_result.err
     print(err)
 
-    assert 'total hashes in merged signature: 970' in err
+    assert "total hashes in merged signature: 970" in err
     # should only find 970 overlapping hashes here --
-    assert 'found 970 distinct matching hashes (100.0%)' in err
+    assert "found 970 distinct matching hashes (100.0%)" in err
 
 
 def test_sig_kmers_1_dna_empty_seq(runtmp):
     # test sig kmers with empty query seq
-    seqfile = utils.get_test_data('short.fa')
+    seqfile = utils.get_test_data("short.fa")
 
-    runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1')
-    ss = sourmash.load_one_signature(runtmp.output('short.fa.sig'))
+    runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1")
+    ss = sourmash.load_one_signature(runtmp.output("short.fa.sig"))
     mh = ss.minhash
-    assert mh.moltype == 'DNA'
+    assert mh.moltype == "DNA"
 
     # make an empty query file
-    query_seqfile = runtmp.output('query.fa')
-    with open(query_seqfile, 'wt') as fp:
+    query_seqfile = runtmp.output("query.fa")
+    with open(query_seqfile, "w"):
         pass
 
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig',
-                        '--seq', query_seqfile)
+        runtmp.sourmash("sig", "kmers", "--sig", "short.fa.sig", "--seq", query_seqfile)
 
     out = runtmp.last_result.out
     print(out)
@@ -4077,16 +4290,15 @@ def test_sig_kmers_1_dna_empty_seq(runtmp):
 
 def test_sig_kmers_1_dna_empty_sig(runtmp):
     # test sig kmers with empty query sig
-    seqfile = utils.get_test_data('short.fa')
+    seqfile = utils.get_test_data("short.fa")
 
     mh = sourmash.MinHash(ksize=31, n=0, scaled=1)
     ss = sourmash.SourmashSignature(mh, name="empty")
-    with open(runtmp.output('empty.sig'), 'wt') as fp:
+    with open(runtmp.output("empty.sig"), "w") as fp:
         sourmash.save_signatures([ss], fp)
 
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'kmers', '--sig', 'empty.sig',
-                        '--seq', seqfile)
+        runtmp.sourmash("sig", "kmers", "--sig", "empty.sig", "--seq", seqfile)
 
     out = runtmp.last_result.out
     print(out)
@@ -4098,51 +4310,58 @@ def test_sig_kmers_1_dna_empty_sig(runtmp):
 
 def test_sig_kmers_1_dna_single_sig(runtmp):
     # test sig kmers with a fabricated query sig with a single hash
-    seqfile = utils.get_test_data('short.fa')
+    seqfile = utils.get_test_data("short.fa")
 
     mh = sourmash.MinHash(ksize=31, n=0, scaled=1)
     mh.add_hash(1070961951490202715)
     ss = sourmash.SourmashSignature(mh, name="small")
-    with open(runtmp.output('small.sig'), 'wt') as fp:
+    with open(runtmp.output("small.sig"), "w") as fp:
         sourmash.save_signatures([ss], fp)
 
-    runtmp.sourmash('sig', 'kmers', '--sig', 'small.sig',
-                    '--seq', seqfile)
+    runtmp.sourmash("sig", "kmers", "--sig", "small.sig", "--seq", seqfile)
 
     out = runtmp.last_result.out
     print(out)
     err = runtmp.last_result.err
     print(err)
 
-    assert 'total hashes in merged signature: 1' in err
-    assert 'found 1 distinct matching hashes (100.0%)' in err
+    assert "total hashes in merged signature: 1" in err
+    assert "found 1 distinct matching hashes (100.0%)" in err
 
 
 def test_sig_kmers_1_dna_lowscaled(runtmp):
     # test sig kmers on dna with a scaled of 100, so not all k-mers
-    seqfile = utils.get_test_data('short.fa')
+    seqfile = utils.get_test_data("short.fa")
 
-    runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=100')
-    ss = sourmash.load_one_signature(runtmp.output('short.fa.sig'))
+    runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=100")
+    ss = sourmash.load_one_signature(runtmp.output("short.fa.sig"))
     mh = ss.minhash
-    assert mh.moltype == 'DNA'
-
-    runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig',
-                    '--seq', seqfile,
-                    '--save-kmers', 'short.csv',
-                    '--save-sequences', 'matched.fa')
+    assert mh.moltype == "DNA"
+
+    runtmp.sourmash(
+        "sig",
+        "kmers",
+        "--sig",
+        "short.fa.sig",
+        "--seq",
        seqfile,
+        "--save-kmers",
+        "short.csv",
+        "--save-sequences",
+        "matched.fa",
+    )
 
     out = runtmp.last_result.out
     print(out)
     err = runtmp.last_result.err
     print(err)
 
-    assert 'total hashes in merged signature: 5' in err
-    assert 'found 5 distinct matching hashes (100.0%)' in err
+    assert "total hashes in merged signature: 5" in err
+    assert "found 5 distinct matching hashes (100.0%)" in err
 
     # check FASTA output
-    assert os.path.exists(runtmp.output('matched.fa'))
-    with screed.open(runtmp.output('matched.fa')) as f:
+    assert os.path.exists(runtmp.output("matched.fa"))
+    with screed.open(runtmp.output("matched.fa")) as f:
         records = list(f)
     assert len(records) == 1
     assert len(records[0].sequence) == 1000, len(records[0].sequence)
@@ -4153,8 +4372,8 @@ def test_sig_kmers_1_dna_lowscaled(runtmp):
     assert seq_mh.similarity(mh) == 1.0
 
     # check CSV output w/k-mers and hashes etc
-    assert os.path.exists(runtmp.output('short.csv'))
-    with open(runtmp.output('short.csv'), newline='') as fp:
+    assert os.path.exists(runtmp.output("short.csv"))
+    with open(runtmp.output("short.csv"), newline="") as fp:
         r = csv.DictReader(fp)
         rows = list(r)
         assert len(rows) == 5
@@ -4162,37 +4381,45 @@ def test_sig_kmers_1_dna_lowscaled(runtmp):
     check_mh = mh.copy_and_clear()
     check_mh2 = mh.copy_and_clear()
     for row in rows:
-        check_mh.add_sequence(row['kmer'])
-        check_mh2.add_hash(int(row['hashval']))
+        check_mh.add_sequence(row["kmer"])
+        check_mh2.add_hash(int(row["hashval"]))
     assert check_mh.similarity(mh) == 1.0
     assert check_mh2.similarity(mh) == 1.0
 
 
 def test_sig_kmers_1_dna_num(runtmp):
     # test sig kmers on dna with num=50, so not all k-mers
-    seqfile = utils.get_test_data('short.fa')
+    seqfile = utils.get_test_data("short.fa")
 
-    runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'num=50')
-    ss = sourmash.load_one_signature(runtmp.output('short.fa.sig'))
+    runtmp.sourmash("sketch", "dna", seqfile, "-p", "num=50")
+    ss = sourmash.load_one_signature(runtmp.output("short.fa.sig"))
     mh = ss.minhash
-    assert mh.moltype == 'DNA'
-
-    runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig',
-                    '--seq', seqfile,
-                    '--save-kmers', 'short.csv',
-                    '--save-sequences', 'matched.fa')
+    assert mh.moltype == "DNA"
+
+    runtmp.sourmash(
+        "sig",
+        "kmers",
+        "--sig",
+        "short.fa.sig",
+        "--seq",
+        seqfile,
+        "--save-kmers",
+        "short.csv",
+        "--save-sequences",
+        "matched.fa",
+    )
 
     out = runtmp.last_result.out
     print(out)
     err = runtmp.last_result.err
     print(err)
 
-    assert 'total hashes in merged signature: 50' in err
-    assert 'found 50 distinct matching hashes (100.0%)' in err
+    assert "total hashes in merged signature: 50" in err
+    assert "found 50 distinct matching hashes (100.0%)" in err
 
     # check FASTA output
-    assert os.path.exists(runtmp.output('matched.fa'))
-    with screed.open(runtmp.output('matched.fa')) as f:
+    assert os.path.exists(runtmp.output("matched.fa"))
+    with screed.open(runtmp.output("matched.fa")) as f:
         records = list(f)
     assert len(records) == 1
     assert len(records[0].sequence) == 1000, len(records[0].sequence)
@@ -4203,8 +4430,8 @@ def test_sig_kmers_1_dna_num(runtmp):
     assert seq_mh.similarity(mh) == 1.0
 
     # check CSV output w/k-mers and hashes etc
-    assert os.path.exists(runtmp.output('short.csv'))
-    with
open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 50 @@ -4212,37 +4439,46 @@ def test_sig_kmers_1_dna_num(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_protein(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'protein' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "protein" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1942' in err - assert 'found 1942 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1942" in err + assert "found 1942 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4253,8 +4489,8 @@ def test_sig_kmers_1_dna_translate_protein(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1942 @@ -4262,37 +4498,46 @@ def test_sig_kmers_1_dna_translate_protein(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_dayhoff(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1,dayhoff') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1,dayhoff") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'dayhoff' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "dayhoff" + + 
runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1906' in err - assert 'found 1906 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1906" in err + assert "found 1906 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4303,8 +4548,8 @@ def test_sig_kmers_1_dna_translate_dayhoff(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1906 @@ -4312,37 +4557,46 @@ def test_sig_kmers_1_dna_translate_dayhoff(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_hp(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1,hp') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1,hp") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'hp' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "hp" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1750' in err - assert 'found 1750 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1750" in err + assert "found 1750 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4353,8 +4607,8 @@ def test_sig_kmers_1_dna_translate_hp(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as 
fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1750 @@ -4362,37 +4616,45 @@ def test_sig_kmers_1_dna_translate_hp(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_protein(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'protein' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "protein" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1112' in err - assert 'found 1112 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1112" in err + assert "found 1112 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4404,8 +4666,8 @@ def test_sig_kmers_2_protein(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1112 @@ -4413,37 +4675,45 @@ def test_sig_kmers_2_protein(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_dayhoff(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1,dayhoff') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1,dayhoff") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'dayhoff' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "dayhoff" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + 
"--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1100' in err - assert 'found 1100 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1100" in err + assert "found 1100 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4455,8 +4725,8 @@ def test_sig_kmers_2_dayhoff(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1100 @@ -4464,37 +4734,45 @@ def test_sig_kmers_2_dayhoff(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_hp(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1,hp') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1,hp") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'hp' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "hp" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1048' in err - assert 'found 1048 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1048" in err + assert "found 1048 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4506,8 +4784,8 @@ def test_sig_kmers_2_hp(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1048 @@ -4515,33 +4793,33 @@ def test_sig_kmers_2_hp(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - 
check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_check_1(runtmp): # basic check functionality - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv") + runtmp.sourmash( + "sig", "check", *sigfiles, "--picklist", f"{picklist}::manifest", "-m", "mf.csv" + ) - out_mf = runtmp.output('mf.csv') + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4550,25 +4828,31 @@ def test_sig_check_1(runtmp): def test_sig_check_1_mf_csv_gz(runtmp): # basic check functionality, with gzipped manifest output - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv.gz") - - out_mf = runtmp.output('mf.csv.gz') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv.gz", + ) + + out_mf = runtmp.output("mf.csv.gz") assert os.path.exists(out_mf) # all should match. - with gzip.open(out_mf, "rt", newline='') as fp: + with gzip.open(out_mf, "rt", newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4577,30 +4861,36 @@ def test_sig_check_1_mf_csv_gz(runtmp): def test_sig_check_1_gz(runtmp): # basic check functionality with gzipped picklist - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - picklist_gz = runtmp.output('salmonella.csv.gz') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + picklist_gz = runtmp.output("salmonella.csv.gz") with gzip.open(picklist_gz, "w") as outfp: with open(picklist, "rb") as infp: outfp.write(infp.read()) - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", "salmonella.csv.gz::manifest", - "-m", "mf.csv") - - out_mf = runtmp.output('mf.csv') + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + "salmonella.csv.gz::manifest", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. 
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4609,25 +4899,32 @@ def test_sig_check_1_gz(runtmp): def test_sig_check_1_nofail(runtmp): # basic check functionality with --fail-if-missing - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv", '--fail-if-missing') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + "--fail-if-missing", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4636,44 +4933,54 @@ def test_sig_check_1_nofail(runtmp): def test_sig_check_1_no_picklist(runtmp): # basic check functionality - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + utils.get_test_data("gather/salmonella-picklist.csv") with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'check', *sigfiles) + runtmp.sourmash("sig", "check", *sigfiles) assert "No picklist provided?! Exiting." in str(exc) -@pytest.mark.parametrize("column, coltype", - (('md5', 'md5'), - ('md5', 'md5prefix8'), - ('name', 'name'), - ('name', 'ident'), - ('name', 'identprefix'), - )) +@pytest.mark.parametrize( + "column, coltype", + ( + ("md5", "md5"), + ("md5", "md5prefix8"), + ("name", "name"), + ("name", "ident"), + ("name", "identprefix"), + ), +) def test_sig_check_1_column(runtmp, column, coltype): # basic check functionality for various columns/coltypes - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:{column}:{coltype}", - "-m", "mf.csv", - "-o", "missing.csv") - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:{column}:{coltype}", + "-m", + "mf.csv", + "-o", + "missing.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. 
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4682,113 +4989,133 @@ def test_sig_check_1_column(runtmp, column, coltype): def test_sig_check_1_diff_col_name(runtmp): # 'sig check' with 'name2' column instead of default name - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:name2:name", - "-o", "missing.csv", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:name2:name", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) - missing_csv = runtmp.output('missing.csv') + missing_csv = runtmp.output("missing.csv") assert os.path.exists(missing_csv) # should be 24 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 # internal locations should match sigfile_set = set(sigfiles) for row in mf.rows: - assert row['internal_location'] in sigfile_set + assert row["internal_location"] in sigfile_set idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes assert 31 in ksizes # should be one non-matching picklist row - with open(missing_csv, newline='') as fp: + with open(missing_csv, newline="") as fp: rows = list(csv.reader(fp)) - assert len(rows) == 2 # header row + data row - assert rows[1][0] == 'NOT THERE' + assert len(rows) == 2 # header row + data row + assert rows[1][0] == "NOT THERE" def test_sig_check_1_diff_col_name_zip(runtmp): # 'sig check' with 'name2' column instead of default name, on a zip file - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") # first create a zip db - runtmp.sourmash('sig', 'cat', *sigfiles, '-o', 'gcf.zip') + runtmp.sourmash("sig", "cat", *sigfiles, "-o", "gcf.zip") # now run against this zip - runtmp.sourmash('sig', 'check', 'gcf.zip', - "--picklist", f"{picklist}:name2:name", - "-o", "missing.csv", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + runtmp.sourmash( + "sig", + "check", + "gcf.zip", + "--picklist", + f"{picklist}:name2:name", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) - missing_csv = runtmp.output('missing.csv') + missing_csv = runtmp.output("missing.csv") assert os.path.exists(missing_csv) # should be 24 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf 
= CollectionManifest.load_from_csv(fp) assert len(mf) == 24 # internal locations should all point to zip - ilocs = set(( row['internal_location'] for row in mf.rows )) + ilocs = set(row["internal_location"] for row in mf.rows) assert len(ilocs) == 1 # can we get 'em? idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes assert 31 in ksizes # should be one non-matching picklist row - with open(missing_csv, newline='') as fp: + with open(missing_csv, newline="") as fp: rows = list(csv.reader(fp)) - assert len(rows) == 2 # header row + data row - assert rows[1][0] == 'NOT THERE' + assert len(rows) == 2 # header row + data row + assert rows[1][0] == "NOT THERE" def test_sig_check_1_diff_col_name_exclude(runtmp): # 'sig check' with 'name2' column, :exclude picklist - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:name2:name:exclude", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:name2:name:exclude", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # should be 12 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 12 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 12 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4797,72 +5124,98 @@ def test_sig_check_1_diff_col_name_exclude(runtmp): def test_sig_check_1_ksize(runtmp): # basic check functionality with selection for ksize - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, '-k', '31', - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv") - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "-k", + "31", + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # 8 of the 24 should match. 
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 8 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 8 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 1 assert 31 in ksizes def test_sig_check_1_ksize_output_sql(runtmp): # basic check functionality with selection for ksize - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, '-k', '31', - "--picklist", f"{picklist}::manifest", - "-m", "mf.mfsql", "-F", "sql") - - out_mf = runtmp.output('mf.mfsql') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "-k", + "31", + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.mfsql", + "-F", + "sql", + ) + + out_mf = runtmp.output("mf.mfsql") assert os.path.exists(out_mf) # 8 of the 24 should match. mf = CollectionManifest.load_from_filename(out_mf) assert len(mf) == 8 - assert mf.conn # check that it's a sqlite manifest! hacky... + assert mf.conn # check that it's a sqlite manifest! hacky... idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 8 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 1 assert 31 in ksizes def test_sig_check_2_output_missing(runtmp): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv", "-m", "mf.csv") - - out_csv = runtmp.output('missing.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") assert not os.path.exists(mf_csv) assert "not saving matching manifest" in runtmp.last_result.err # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) @@ -4871,51 +5224,67 @@ def test_sig_check_2_output_missing(runtmp): def test_sig_check_2_output_missing_error_exit(runtmp): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") # should error exit... with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv", '--fail') + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + "--fail", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) # ...and also output stuff! 
- out_csv = runtmp.output('missing.csv') + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 24 -@pytest.mark.parametrize("column, coltype", - (('md5', 'md5'), - ('md5', 'md5prefix8'), - ('name', 'name'), - ('name', 'ident'), - ('name', 'identprefix'), - )) +@pytest.mark.parametrize( + "column, coltype", + ( + ("md5", "md5"), + ("md5", "md5prefix8"), + ("name", "name"), + ("name", "ident"), + ("name", "identprefix"), + ), +) def test_sig_check_2_output_missing_column(runtmp, column, coltype): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv") - - out_csv = runtmp.output('missing.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + ) + + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) @@ -4924,25 +5293,33 @@ def test_sig_check_2_output_missing_column(runtmp, column, coltype): def test_sig_check_2_output_missing_exclude(runtmp): # 'exclude' with '-o' shouldn't work - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}:name:name:exclude", - "-o", "missing.csv") - - assert "** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'" in str(exc) + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}:name:name:exclude", + "-o", + "missing.csv", + ) + + assert ( + "** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'" + in str(exc) + ) def test_sig_check_3_no_manifest(runtmp): # fail check when no manifest, by default - sbt = utils.get_test_data('v6.sbt.zip') - picklist = utils.get_test_data('v6.sbt.zip.mf.csv') + sbt = utils.get_test_data("v6.sbt.zip") + picklist = utils.get_test_data("v6.sbt.zip.mf.csv") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'check', sbt, - '--picklist', f"{picklist}::manifest") + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "check", sbt, "--picklist", f"{picklist}::manifest") print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -4953,12 +5330,21 @@ def test_sig_check_3_no_manifest(runtmp): def test_sig_check_3_no_manifest_ok(runtmp): # generate manifest if --no-require-manifest - sbt = utils.get_test_data('v6.sbt.zip') - picklist = utils.get_test_data('v6.sbt.zip.mf.csv') - - runtmp.run_sourmash('sig', 'check', sbt, "--no-require-manifest", - '--picklist', f"{picklist}::manifest") + sbt = utils.get_test_data("v6.sbt.zip") + picklist = utils.get_test_data("v6.sbt.zip.mf.csv") + + runtmp.run_sourmash( + "sig", + "check", + sbt, + 
"--no-require-manifest", + "--picklist", + f"{picklist}::manifest", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "for given picklist, found 7 matches to 7 distinct values" in runtmp.last_result.err + assert ( + "for given picklist, found 7 matches to 7 distinct values" + in runtmp.last_result.err + ) diff --git a/tests/test_cmd_signature_collect.py b/tests/test_cmd_signature_collect.py index 61f703080f..edd7c16a29 100644 --- a/tests/test_cmd_signature_collect.py +++ b/tests/test_cmd_signature_collect.py @@ -15,13 +15,13 @@ def test_sig_collect_0_nothing(runtmp, manifest_db_format): # run with just output - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - if manifest_db_format != 'sql': return + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + if manifest_db_format != "sql": + return - runtmp.sourmash('sig', 'collect', '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash("sig", "collect", "-o", f"mf.{ext}", "-F", manifest_db_format) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 0 @@ -29,124 +29,125 @@ def test_sig_collect_0_nothing(runtmp, manifest_db_format): def test_sig_collect_1_zipfile(runtmp, manifest_db_format): # collect a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_1_zipfile_csv_gz(runtmp): # collect a manifest from a .zip file, save to csv.gz - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash('sig', 'collect', protzip, '-o', 'mf.csv.gz', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") - manifest_fn = runtmp.output('mf.csv.gz') + manifest_fn = runtmp.output("mf.csv.gz") # gzip, yes? 
- print('XXX', manifest_fn) - with gzip.open(manifest_fn, 'rt', newline='') as fp: + print("XXX", manifest_fn) + with gzip.open(manifest_fn, "rt", newline="") as fp: fp.read() manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_1_zipfile_csv_gz_roundtrip(runtmp): # collect a manifest from a .zip file, save to csv.gz; then load again - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash('sig', 'collect', protzip, '-o', 'mf.csv.gz', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") - manifest_fn = runtmp.output('mf.csv.gz') + manifest_fn = runtmp.output("mf.csv.gz") # gzip, yes? - print('XXX', manifest_fn) - with gzip.open(manifest_fn, 'rt', newline='') as fp: + print("XXX", manifest_fn) + with gzip.open(manifest_fn, "rt", newline="") as fp: fp.read() manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # can we read a csv.gz? - runtmp.sourmash('sig', 'collect', 'mf.csv.gz', '-o', 'mf2.csv', - '-F', 'csv') + runtmp.sourmash("sig", "collect", "mf.csv.gz", "-o", "mf2.csv", "-F", "csv") - manifest_fn2 = runtmp.output('mf2.csv') + manifest_fn2 = runtmp.output("mf2.csv") manifest2 = BaseCollectionManifest.load_from_filename(manifest_fn2) assert len(manifest2) == 2 - md5_list = [ row['md5'] for row in manifest2.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - + md5_list = [row["md5"] for row in manifest2.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_2_exists_fail(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/protein.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # now run with same filename - should fail with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format + ) def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # now run with same filename - should merge - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format, '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format, "--merge" + ) manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 10 @@ -154,67 +155,68 @@ def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): def test_sig_collect_2_exists_sql_merge_csv(runtmp, manifest_db_format): # try to merge csv into sql - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' + ext = "sqlmf" # save as sql... - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', 'sql') + runtmp.sourmash("sig", "collect", protzip, "-o", f"mf.{ext}", "-F", "sql") - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', 'csv', '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", "csv", "--merge" + ) assert "ERROR loading" in runtmp.last_result.err def test_sig_collect_2_exists_csv_merge_sql(runtmp): # try to merge sql into csv - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'csv' + ext = "csv" # save as csv... 
- runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", f"mf.{ext}", "-F", "csv") - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', 'sql', '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", "sql", "--merge" + ) assert "ERROR loading" in runtmp.last_result.err def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): # test 'merge' when args.output doesn't already exist => warning - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - manifest_fn = runtmp.output(f'mf.{ext}') + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + manifest_fn = runtmp.output(f"mf.{ext}") # run with --merge but no previous: - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format, '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format, "--merge" + ) manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 8 @@ -226,28 +228,37 @@ def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): def test_sig_collect_3_multiple(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - hpzip = utils.get_test_data('prot/hp.zip') - dayzip = utils.get_test_data('prot/dayhoff.zip') - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', protzip, hpzip, dayzip, - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + protzip = utils.get_test_data("prot/protein.zip") + hpzip = utils.get_test_data("prot/hp.zip") + dayzip = utils.get_test_data("prot/dayhoff.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + protzip, + hpzip, + dayzip, + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 6 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list - assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list - assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list - assert '1cbd888bf910f83ad8f1715509183223' in md5_list - - locations = set([ row['internal_location'] for row in manifest.rows ]) + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list + assert "ea2a1ad233c2908529d124a330bcb672" in md5_list + assert "bb0e6d90df01b7bd5d0956a5f9e3ed12" in 
md5_list + assert "fbca5e5211e4d58427997fd5c8343e9a" in md5_list + assert "1cbd888bf910f83ad8f1715509183223" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) assert protzip in locations assert hpzip in locations assert dayzip in locations @@ -256,34 +267,42 @@ def test_sig_collect_3_multiple(runtmp, manifest_db_format): def test_sig_collect_3_multiple_use_fromfile(runtmp, manifest_db_format): # collect a manifest from two .zip files using --from-file - protzip = utils.get_test_data('prot/protein.zip') - hpzip = utils.get_test_data('prot/hp.zip') - dayzip = utils.get_test_data('prot/dayhoff.zip') + protzip = utils.get_test_data("prot/protein.zip") + hpzip = utils.get_test_data("prot/hp.zip") + dayzip = utils.get_test_data("prot/dayhoff.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - fromfile = runtmp.output('fromfile.txt') - with open(fromfile, 'wt') as fp: + fromfile = runtmp.output("fromfile.txt") + with open(fromfile, "w") as fp: print(protzip, file=fp) print(hpzip, file=fp) print(dayzip, file=fp) - runtmp.sourmash('sig', 'collect', '--from-file', 'fromfile.txt', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + runtmp.sourmash( + "sig", + "collect", + "--from-file", + "fromfile.txt", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 6 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list - assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list - assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list - assert '1cbd888bf910f83ad8f1715509183223' in md5_list - - locations = set([ row['internal_location'] for row in manifest.rows ]) + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list + assert "ea2a1ad233c2908529d124a330bcb672" in md5_list + assert "bb0e6d90df01b7bd5d0956a5f9e3ed12" in md5_list + assert "fbca5e5211e4d58427997fd5c8343e9a" in md5_list + assert "1cbd888bf910f83ad8f1715509183223" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) assert protzip in locations assert hpzip in locations assert dayzip in locations @@ -292,23 +311,24 @@ def test_sig_collect_3_multiple_use_fromfile(runtmp, manifest_db_format): def test_sig_collect_4_multiple_from_sig(runtmp, manifest_db_format): # collect a manifest from sig files - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', sig43, sig63, - '-o', f'mf.{ext}', '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", sig43, sig63, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert 
'38729c6374925585db28916b82a6f513' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) assert sig43 in locations assert sig63 in locations assert len(locations) == 2, locations @@ -316,89 +336,115 @@ def test_sig_collect_4_multiple_from_sig(runtmp, manifest_db_format): def test_sig_collect_4_multiple_from_sig_abspath(runtmp, manifest_db_format): # collect a manifest from sig files, forcing abspath - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - shutil.copyfile(sig43, runtmp.output('47.fa.sig')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig')) - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', '--abspath', - '-o', f'mf.{ext}', '-F', manifest_db_format) + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + shutil.copyfile(sig43, runtmp.output("47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "47.fa.sig", + "63.fa.sig", + "--abspath", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert '38729c6374925585db28916b82a6f513' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) print(locations) assert len(locations) == 2, locations for xx in locations: - assert xx.startswith('/') + assert xx.startswith("/") def test_sig_collect_4_multiple_no_abspath(runtmp, manifest_db_format): # collect a manifest from sig files, no abspath - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # copy files to tmp, where they will not have full paths - shutil.copyfile(sig43, runtmp.output('47.fa.sig')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig')) - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + shutil.copyfile(sig43, runtmp.output("47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "47.fa.sig", + "63.fa.sig", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert '38729c6374925585db28916b82a6f513' in 
md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) print(locations) assert len(locations) == 2, locations - assert '47.fa.sig' in locations - assert '63.fa.sig' in locations + assert "47.fa.sig" in locations + assert "63.fa.sig" in locations def test_sig_collect_5_no_manifest_sbt_fail(runtmp, manifest_db_format): # collect a manifest from files that don't have one - sbt_zip = utils.get_test_data('v6.sbt.zip') + sbt_zip = utils.get_test_data("v6.sbt.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', sbt_zip, - '-o', f'mf.{ext}', '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", sbt_zip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) def test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): # generate a manifest from files that don't have one when --no-require - sbt_zip = utils.get_test_data('v6.sbt.zip') - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', sbt_zip, '--no-require-manifest', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + sbt_zip = utils.get_test_data("v6.sbt.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + sbt_zip, + "--no-require-manifest", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 7 - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) assert len(locations) == 1, locations assert sbt_zip in locations diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py index 33bd649748..25e29a5b4f 100644 --- a/tests/test_cmd_signature_fileinfo.py +++ b/tests/test_cmd_signature_fileinfo.py @@ -16,10 +16,10 @@ def test_fileinfo_1_sig(runtmp): # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - shutil.copyfile(sig47, runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -40,10 +40,10 @@ def test_fileinfo_1_sig(runtmp): def test_fileinfo_1_sig_summarize(runtmp): # get basic info on a signature with 'summarize' as alias for fileinfo - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - shutil.copyfile(sig47, runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'summarize', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "summarize", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -64,10 +64,10 @@ def test_fileinfo_1_sig_summarize(runtmp): def test_fileinfo_1_sig_abund(runtmp): # get basic info on a signature with abundance - sig47 = utils.get_test_data('track_abund/47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") - shutil.copyfile(sig47, 
runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -88,10 +88,10 @@ def test_fileinfo_1_sig_abund(runtmp): def test_fileinfo_2_lca(runtmp): # get basic info on an LCA database - prot = utils.get_test_data('prot/protein.lca.json.gz') + prot = utils.get_test_data("prot/protein.lca.json.gz") - shutil.copyfile(prot, runtmp.output('protein.lca.json.gz')) - runtmp.run_sourmash('sig', 'fileinfo', 'protein.lca.json.gz') + shutil.copyfile(prot, runtmp.output("protein.lca.json.gz")) + runtmp.run_sourmash("sig", "fileinfo", "protein.lca.json.gz") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -112,10 +112,10 @@ def test_fileinfo_2_lca(runtmp): def test_fileinfo_3_sbt_zip(runtmp): # test on an SBT.zip - prot = utils.get_test_data('prot/protein.sbt.zip') + prot = utils.get_test_data("prot/protein.sbt.zip") - shutil.copyfile(prot, runtmp.output('protein.sbt.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'protein.sbt.zip') + shutil.copyfile(prot, runtmp.output("protein.sbt.zip")) + runtmp.run_sourmash("sig", "fileinfo", "protein.sbt.zip") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -136,10 +136,10 @@ def test_fileinfo_3_sbt_zip(runtmp): def test_fileinfo_4_zip(runtmp): # test on a ZipFileLinearIndex - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -163,10 +163,10 @@ def test_fileinfo_4_zip(runtmp): def test_fileinfo_4_zip_json_out(runtmp): # check --json-out - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--json-out') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip", "--json-out") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -174,30 +174,62 @@ def test_fileinfo_4_zip_json_out(runtmp): # should succeed as loading as JSON, with correct info vals = json.loads(out) - assert vals['has_manifest'] - assert vals['is_database'] - assert vals['num_sketches'] == 8 - assert vals['path_filetype'] == 'ZipFileLinearIndex' - assert vals['total_hashes'] == 31758 - - d1 = {'ksize': 19, 'moltype': 'dayhoff', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 7945} - d2 = {'ksize': 19, 'moltype': 'hp', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 5184} - d3 = {'ksize': 19, 'moltype': 'protein', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 8214} - d4 = {'ksize': 31, 'moltype': 'DNA', 'scaled': 1000, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 10415} - - assert d1 in vals['sketch_info'] - assert d2 in vals['sketch_info'] - assert d3 in vals['sketch_info'] - assert d4 in vals['sketch_info'] - assert len(vals['sketch_info']) == 4 + assert vals["has_manifest"] + assert vals["is_database"] + assert vals["num_sketches"] == 8 + assert vals["path_filetype"] == "ZipFileLinearIndex" + assert vals["total_hashes"] == 31758 + + d1 = { + "ksize": 19, + "moltype": "dayhoff", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + 
"n_hashes": 7945, + } + d2 = { + "ksize": 19, + "moltype": "hp", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 5184, + } + d3 = { + "ksize": 19, + "moltype": "protein", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 8214, + } + d4 = { + "ksize": 31, + "moltype": "DNA", + "scaled": 1000, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 10415, + } + + assert d1 in vals["sketch_info"] + assert d2 in vals["sketch_info"] + assert d3 in vals["sketch_info"] + assert d4 in vals["sketch_info"] + assert len(vals["sketch_info"]) == 4 def test_fileinfo_4_zip_rebuild(runtmp): # test --rebuild - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--rebuild') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip", "--rebuild") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -224,12 +256,12 @@ def test_fileinfo_4_zip_rebuild(runtmp): def test_fileinfo_5_dir(runtmp): # test on a directory - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - os.mkdir(runtmp.output('subdir')) + os.mkdir(runtmp.output("subdir")) - shutil.copyfile(sig47, runtmp.output('subdir/sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'subdir/') + shutil.copyfile(sig47, runtmp.output("subdir/sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "subdir/") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -250,13 +282,13 @@ def test_fileinfo_5_dir(runtmp): def test_fileinfo_6_pathlist(runtmp): # test on a pathlist - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") shutil.copyfile(sig47, runtmp.output("47.fa.sig")) - with open(runtmp.output('pathlist.txt'), 'wt') as fp: + with open(runtmp.output("pathlist.txt"), "w") as fp: fp.write("47.fa.sig\n") - runtmp.run_sourmash('sig', 'fileinfo', 'pathlist.txt') + runtmp.run_sourmash("sig", "fileinfo", "pathlist.txt") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -275,13 +307,22 @@ def test_fileinfo_6_pathlist(runtmp): assert line.strip() in out -@pytest.mark.parametrize("db", ['v6.sbt.json', 'v5.sbt.json', 'v4.sbt.json', - 'v3.sbt.json', 'v2.sbt.json', 'v1.sbt.json']) +@pytest.mark.parametrize( + "db", + [ + "v6.sbt.json", + "v5.sbt.json", + "v4.sbt.json", + "v3.sbt.json", + "v2.sbt.json", + "v1.sbt.json", + ], +) def test_fileinfo_7_sbt_json(runtmp, db): # test on multiple versions of SBT JSON files dbfile = utils.get_test_data(db) - runtmp.run_sourmash('sig', 'fileinfo', dbfile) + runtmp.run_sourmash("sig", "fileinfo", dbfile) out = runtmp.last_result.out print(runtmp.last_result.out) @@ -302,11 +343,13 @@ def test_fileinfo_7_sbt_json(runtmp, db): def test_sig_fileinfo_stdin(runtmp): # test on stdin - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - with open(sig, 'rt') as fp: + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + with open(sig) as fp: data = fp.read() - runtmp.run_sourmash('sig', 'fileinfo', '-', stdin_data=data) + runtmp.run_sourmash("sig", "fileinfo", "-", stdin_data=data) out = runtmp.last_result.out print(out) @@ -328,53 +371,56 @@ def test_sig_fileinfo_stdin(runtmp): def test_sig_fileinfo_does_not_exist(runtmp): # test on file that does not exist with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 
'fileinfo', 'does-not-exist') + runtmp.run_sourmash("sig", "fileinfo", "does-not-exist") - assert "Cannot open 'does-not-exist' as a sourmash signature collection" in runtmp.last_result.err + assert ( + "Cannot open 'does-not-exist' as a sourmash signature collection" + in runtmp.last_result.err + ) def test_sig_fileinfo_8_manifest_works(runtmp): # test on a manifest with relative paths, in proper location - mf = utils.get_test_data('scaled/mf.csv') - runtmp.sourmash('sig', 'fileinfo', mf) + mf = utils.get_test_data("scaled/mf.csv") + runtmp.sourmash("sig", "fileinfo", mf) out = runtmp.last_result.out print(out) - assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out - assert 'num signatures: 15' in out - assert 'has manifest? yes' in out - assert 'is database? yes' in out - assert 'path filetype: StandaloneManifestIndex' in out + assert "15 sketches with DNA, k=31, scaled=10000 717 total hashes" in out + assert "num signatures: 15" in out + assert "has manifest? yes" in out + assert "is database? yes" in out + assert "path filetype: StandaloneManifestIndex" in out def test_sig_fileinfo_8_manifest_works_when_moved(runtmp): # test on a manifest with relative paths, when in wrong place # note: this works, unlike 'describe', because all the necessary info # for 'fileinfo' is in the manifest. - mf = utils.get_test_data('scaled/mf.csv') - shutil.copyfile(mf, runtmp.output('mf.csv')) + mf = utils.get_test_data("scaled/mf.csv") + shutil.copyfile(mf, runtmp.output("mf.csv")) - runtmp.sourmash('sig', 'fileinfo', 'mf.csv') + runtmp.sourmash("sig", "fileinfo", "mf.csv") out = runtmp.last_result.out print(out) - assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out - assert 'num signatures: 15' in out - assert 'has manifest? yes' in out - assert 'is database? yes' in out - assert 'path filetype: StandaloneManifestIndex' in out + assert "15 sketches with DNA, k=31, scaled=10000 717 total hashes" in out + assert "num signatures: 15" in out + assert "has manifest? yes" in out + assert "is database? 
yes" in out + assert "path filetype: StandaloneManifestIndex" in out def test_sig_fileinfo_9_sqldb_make(runtmp): # make a sqldb and run fileinfo on it - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) - runtmp.sourmash('sig', 'fileinfo', sqldb) + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -387,8 +433,8 @@ def test_sig_fileinfo_9_sqldb_make(runtmp): def test_sig_fileinfo_9_sqldb_exists(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/index.sqldb') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/index.sqldb") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -397,13 +443,15 @@ def test_sig_fileinfo_9_sqldb_exists(runtmp): print(out) assert "path filetype: SqliteIndex" in out - assert "2 sketches with DNA, k=31, scaled=1000 10415 total hashes" in out + assert ( + "2 sketches with DNA, k=31, scaled=1000 10415 total hashes" in out + ) def test_sig_fileinfo_9_sql_manifest(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/prot.sqlmf') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/prot.sqlmf") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -421,8 +469,8 @@ def test_sig_fileinfo_9_sql_manifest(runtmp): def test_sig_fileinfo_9_sql_lca_db(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/lca.sqldb') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/lca.sqldb") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) diff --git a/tests/test_cmd_signature_grep.py b/tests/test_cmd_signature_grep.py index 17dd5ee2dc..fa1a5b7dfb 100644 --- a/tests/test_cmd_signature_grep.py +++ b/tests/test_cmd_signature_grep.py @@ -18,299 +18,308 @@ def test_grep_1_sig_name(runtmp): # search on substring in name - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', 'Shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "Shewanella", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_1_sig_name_case_sensitive(runtmp): # search on substring in name - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', 'shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "shewanella", sig47) def test_grep_1_sig_name_case_insensitive(runtmp): # search on substring in name, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '-i', 'shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "-i", "shewanella", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert 
"Shewanella" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_1_sig_name_exclude(runtmp): # search on substring in name, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") # no matches! with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', '-v', 'Shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "-v", "Shewanella", sig47) def test_grep_2_sig_md5(runtmp): # search on substring in md5 - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', 'ce52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "ce52952152f0", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_2_sig_md5_case_sensitive(runtmp): # case sensitive no match - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', 'CE52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "CE52952152f0", sig47) def test_grep_2_sig_md5_case_insensitive(runtmp): # search on substring in md5, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '-i', 'CE52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "-i", "CE52952152f0", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_3_filename(runtmp): # filename match - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '47.fa', sig47) + runtmp.run_sourmash("sig", "grep", "47.fa", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert '47.fa' in ss.filename - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "47.fa" in ss.filename + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_3_filename_regexp(runtmp): # search for a regexp on filename - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '^47.fa', sig47) + runtmp.run_sourmash("sig", "grep", "^47.fa", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert '7.fa' in ss.filename - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "7.fa" in ss.filename + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_4_no_manifest(runtmp): # fail search when no manifest, by default - sbt = utils.get_test_data('v6.sbt.zip') + sbt = utils.get_test_data("v6.sbt.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'grep', 'e60265', sbt) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "grep", "e60265", sbt) print(runtmp.last_result.err) - assert 'ERROR on filename' in runtmp.last_result.err - assert 'sig grep requires a manifest by default, but no manifest present.' in runtmp.last_result.err + assert "ERROR on filename" in runtmp.last_result.err + assert ( + "sig grep requires a manifest by default, but no manifest present." 
+ in runtmp.last_result.err + ) def test_grep_4_no_manifest_ok(runtmp): # generate manifest if --no-require-manifest - sbt = utils.get_test_data('v6.sbt.zip') + sbt = utils.get_test_data("v6.sbt.zip") - runtmp.run_sourmash('sig', 'grep', 'e60265', sbt, '--no-require-manifest') + runtmp.run_sourmash("sig", "grep", "e60265", sbt, "--no-require-manifest") ss = load_signatures(runtmp.last_result.out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'e60265' in ss.md5sum() + assert "e60265" in ss.md5sum() def test_grep_5_zip_include(runtmp): # search zip, include on case sensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "OS223", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_include_picklist(runtmp): # search zip, include on case sensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - pickfile = runtmp.output('pick.csv') - with open(pickfile, 'w', newline="") as fp: - w = csv.DictWriter(fp, fieldnames=['md5']) + pickfile = runtmp.output("pick.csv") + with open(pickfile, "w", newline="") as fp: + w = csv.DictWriter(fp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='09a08691ce52952152f0e866a59f6261')) - w.writerow(dict(md5='38729c6374925585db28916b82a6f513')) + w.writerow(dict(md5="09a08691ce52952152f0e866a59f6261")) + w.writerow(dict(md5="38729c6374925585db28916b82a6f513")) - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--picklist', f"{pickfile}:md5:md5") + runtmp.run_sourmash( + "sig", "grep", "--dna", "OS223", allzip, "--picklist", f"{pickfile}:md5:md5" + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'for given picklist, found 2 matches to 2 distinct values' in err + assert "for given picklist, found 2 matches to 2 distinct values" in err ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_include_case_insensitive(runtmp): # search zip, include on case insensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-i', 'os223', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-i", "os223", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_exclude(runtmp): # search zip, exclude on case-sensitive match - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-v', 'OS185', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-v", "OS185", allzip) out = 
runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_exclude_case_insensitive(runtmp): # search zip, exclude on case-insensitive match - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-vi', 'os185', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-vi", "os185", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_6_zip_manifest_csv(runtmp): # do --csv and use result as picklist - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--csv', 'match.csv') + runtmp.run_sourmash("sig", "grep", "--dna", "OS223", allzip, "--csv", "match.csv") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" # now run cat with picklist - runtmp.run_sourmash('sig', 'cat', allzip, - '--picklist', 'match.csv::manifest') + runtmp.run_sourmash("sig", "cat", allzip, "--picklist", "match.csv::manifest") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_6_zip_manifest_csv_gz(runtmp): # do --csv and use result as picklist - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--csv', 'match.csv.gz') + runtmp.run_sourmash( + "sig", "grep", "--dna", "OS223", allzip, "--csv", "match.csv.gz" + ) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" # check that match.csv.gz is a gzip file - with gzip.open(runtmp.output('match.csv.gz'), 'rt', newline='') as fp: + with gzip.open(runtmp.output("match.csv.gz"), "rt", newline="") as fp: fp.read() # now run cat with picklist - runtmp.run_sourmash('sig', 'cat', allzip, - '--picklist', 'match.csv.gz::manifest') + runtmp.run_sourmash("sig", "cat", allzip, "--picklist", "match.csv.gz::manifest") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_sig_grep_7_lca(runtmp): # extract 47 from an LCA 
database, with --no-require-manifest - allzip = utils.get_test_data('lca/47+63.lca.json') - sig47 = utils.get_test_data('47.fa.sig') - - runtmp.sourmash('sig', 'grep', "50a9274021e4", allzip, - '--no-require-manifest', '-o', 'matches.sig') - - match = sourmash.load_file_as_signatures(runtmp.output('matches.sig')) + allzip = utils.get_test_data("lca/47+63.lca.json") + sig47 = utils.get_test_data("47.fa.sig") + + runtmp.sourmash( + "sig", + "grep", + "50a9274021e4", + allzip, + "--no-require-manifest", + "-o", + "matches.sig", + ) + + match = sourmash.load_file_as_signatures(runtmp.output("matches.sig")) match = list(match)[0] ss47 = sourmash.load_file_as_signatures(sig47) @@ -324,50 +333,63 @@ def test_sig_grep_7_lca(runtmp): def test_sig_grep_7_picklist_md5_lca_fail(runtmp): # extract 47 from an LCA database, using a picklist w/full md5 => fail - allzip = utils.get_test_data('lca/47+63.lca.json') + allzip = utils.get_test_data("lca/47+63.lca.json") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='50a9274021e43eda8b2e77f8fa60ae8e', - md5short='50a9274021e43eda8b2e77f8fa60ae8e'[:8], - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="50a9274021e43eda8b2e77f8fa60ae8e", + md5short="50a9274021e43eda8b2e77f8fa60ae8e"[:8], + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5" - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'grep', '50a92740', allzip, - '--picklist', picklist_arg, - '--no-require-manifest') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "sig", + "grep", + "50a92740", + allzip, + "--picklist", + picklist_arg, + "--no-require-manifest", + ) # this happens b/c the implementation of 'grep' uses picklists, and # LCA databases don't support multiple picklists. print(runtmp.last_result.err) - assert "This input collection doesn't support 'grep' with picklists." in runtmp.last_result.err + assert ( + "This input collection doesn't support 'grep' with picklists." 
+ in runtmp.last_result.err + ) def test_sig_grep_8_count(runtmp): - zips = ['prot/all.zip', - 'prot/dayhoff.sbt.zip', - 'prot/dayhoff.zip', - 'prot/hp.sbt.zip', - 'prot/hp.zip', - 'prot/protein.sbt.zip', - 'prot/protein.zip'] - - zip_src = [ utils.get_test_data(x) for x in zips ] - - os.mkdir(runtmp.output('prot')) + zips = [ + "prot/all.zip", + "prot/dayhoff.sbt.zip", + "prot/dayhoff.zip", + "prot/hp.sbt.zip", + "prot/hp.zip", + "prot/protein.sbt.zip", + "prot/protein.zip", + ] + + zip_src = [utils.get_test_data(x) for x in zips] + + os.mkdir(runtmp.output("prot")) for src, dest in zip(zip_src, zips): shutil.copyfile(src, runtmp.output(dest)) - - runtmp.sourmash('sig', 'grep', '-c', '0015939', *zips) + + runtmp.sourmash("sig", "grep", "-c", "0015939", *zips) out = runtmp.last_result.out err = runtmp.last_result.err @@ -391,23 +413,23 @@ def test_sig_grep_8_count(runtmp): def test_sig_grep_identical_md5s(runtmp): # test that we properly handle different signatures with identical md5s - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = load_signatures(sig47) sig = list(ss)[0] new_sig = sig.to_mutable() - new_sig.name = 'foo' - sig47foo = runtmp.output('foo.sig') + new_sig.name = "foo" + sig47foo = runtmp.output("foo.sig") # this was only a problem when the signatures are stored in the same file - with open(sig47foo, 'wt') as fp: + with open(sig47foo, "w") as fp: sourmash.save_signatures([new_sig, sig], fp) - runtmp.run_sourmash('sig', 'grep', '-i', 'foo', sig47foo) + runtmp.run_sourmash("sig", "grep", "-i", "foo", sig47foo) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' not in ss.name - assert 'foo' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" not in ss.name + assert "foo" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" diff --git a/tests/test_compare.py b/tests/test_compare.py index bc25e98e3c..9821295cac 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -5,9 +5,14 @@ import pytest import sourmash -from sourmash.compare import (compare_all_pairs, compare_parallel, - compare_serial, compare_serial_containment, - compare_serial_max_containment, compare_serial_avg_containment) +from sourmash.compare import ( + compare_all_pairs, + compare_parallel, + compare_serial, + compare_serial_containment, + compare_serial_max_containment, + compare_serial_avg_containment, +) import sourmash_tst_utils as utils @@ -44,66 +49,90 @@ def test_compare_serial(siglist, ignore_abundance): similarities = compare_serial(siglist, ignore_abundance, downsample=False) true_similarities = np.array( - [[1., 0.356, 0.078, 0.086, 0., 0., 0.], - [0.356, 1., 0.072, 0.078, 0., 0., 0.], - [0.078, 0.072, 1., 0.074, 0., 0., 0.], - [0.086, 0.078, 0.074, 1., 0., 0., 0.], - [0., 0., 0., 0., 1., 0.382, 0.364], - [0., 0., 0., 0., 0.382, 1., 0.386], - [0., 0., 0., 0., 0.364, 0.386, 1.]]) + [ + [1.0, 0.356, 0.078, 0.086, 0.0, 0.0, 0.0], + [0.356, 1.0, 0.072, 0.078, 0.0, 0.0, 0.0], + [0.078, 0.072, 1.0, 0.074, 0.0, 0.0, 0.0], + [0.086, 0.078, 0.074, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.382, 0.364], + [0.0, 0.0, 0.0, 0.0, 0.382, 1.0, 0.386], + [0.0, 0.0, 0.0, 0.0, 0.364, 0.386, 1.0], + ] + ) np.testing.assert_array_equal(similarities, true_similarities) def test_compare_parallel(siglist, ignore_abundance): - similarities = compare_parallel(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities = 
compare_parallel( + siglist, ignore_abundance, downsample=False, n_jobs=2 + ) true_similarities = np.array( - [[1., 0.356, 0.078, 0.086, 0., 0., 0.], - [0.356, 1., 0.072, 0.078, 0., 0., 0.], - [0.078, 0.072, 1., 0.074, 0., 0., 0.], - [0.086, 0.078, 0.074, 1., 0., 0., 0.], - [0., 0., 0., 0., 1., 0.382, 0.364], - [0., 0., 0., 0., 0.382, 1., 0.386], - [0., 0., 0., 0., 0.364, 0.386, 1.]]) + [ + [1.0, 0.356, 0.078, 0.086, 0.0, 0.0, 0.0], + [0.356, 1.0, 0.072, 0.078, 0.0, 0.0, 0.0], + [0.078, 0.072, 1.0, 0.074, 0.0, 0.0, 0.0], + [0.086, 0.078, 0.074, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.382, 0.364], + [0.0, 0.0, 0.0, 0.0, 0.382, 1.0, 0.386], + [0.0, 0.0, 0.0, 0.0, 0.364, 0.386, 1.0], + ] + ) np.testing.assert_array_equal(similarities, true_similarities) def test_compare_all_pairs(siglist, ignore_abundance): - similarities_parallel = compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities_parallel = compare_all_pairs( + siglist, ignore_abundance, downsample=False, n_jobs=2 + ) similarities_serial = compare_serial(siglist, ignore_abundance, downsample=False) np.testing.assert_array_equal(similarities_parallel, similarities_serial) def test_compare_serial_jaccardANI(scaled_siglist, ignore_abundance): - jANI = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) + jANI = compare_serial( + scaled_siglist, ignore_abundance, downsample=False, return_ani=True + ) print(jANI) - + true_jaccard_ANI = np.array( - [[1., 0.978, 0., 0.], - [0.978, 1., 0.96973012, 0.99262776], - [0., 0.96973012, 1., 0.97697011], - [0., 0.99262776, 0.97697011, 1.]]) + [ + [1.0, 0.978, 0.0, 0.0], + [0.978, 1.0, 0.96973012, 0.99262776], + [0.0, 0.96973012, 1.0, 0.97697011], + [0.0, 0.99262776, 0.97697011, 1.0], + ] + ) np.testing.assert_array_almost_equal(jANI, true_jaccard_ANI, decimal=3) def test_compare_parallel_jaccardANI(scaled_siglist, ignore_abundance): - jANI = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) + jANI = compare_parallel( + scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True + ) true_jaccard_ANI = np.array( - [[1., 0.978, 0., 0.], - [0.978, 1., 0.96973012, 0.99262776], - [0., 0.96973012, 1., 0.97697011], - [0., 0.99262776, 0.97697011, 1.]]) + [ + [1.0, 0.978, 0.0, 0.0], + [0.978, 1.0, 0.96973012, 0.99262776], + [0.0, 0.96973012, 1.0, 0.97697011], + [0.0, 0.99262776, 0.97697011, 1.0], + ] + ) np.testing.assert_array_almost_equal(jANI, true_jaccard_ANI, decimal=3) def test_compare_all_pairs_jaccardANI(scaled_siglist, ignore_abundance): - similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) - similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) + similarities_parallel = compare_all_pairs( + scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True + ) + similarities_serial = compare_serial( + scaled_siglist, ignore_abundance, downsample=False, return_ani=True + ) np.testing.assert_array_equal(similarities_parallel, similarities_serial) @@ -112,39 +141,56 @@ def test_compare_serial_containmentANI(scaled_siglist): print(containment_ANI) true_containment_ANI = np.array( - [[1, 0.966, 0., 0.], - [1, 1., 0.97715525, 1.], - [0., 0.96377054, 1., 0.97678608], - [0., 0.98667513, 0.97715525, 1.]]) + [ + [1, 0.966, 0.0, 0.0], + [1, 1.0, 0.97715525, 1.0], + [0.0, 0.96377054, 1.0, 0.97678608], + [0.0, 0.98667513, 0.97715525, 1.0], + ] + ) 
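A note on the `compare_*` fixtures above: `compare_serial`, `compare_parallel`, and `compare_all_pairs` each fill an n-by-n matrix of pairwise scores over a list of sketches, and the parallel paths are asserted to match the serial one exactly. The sketch below reproduces the serial idea using only the public `MinHash` API; the two sequences, the names, and the `ksize=21, scaled=1` parameters are illustrative assumptions, not the fixtures these tests load.

```python
# Minimal sketch: build a pairwise Jaccard matrix, as compare_serial() does.
# Sequences and parameters are invented for illustration.
import numpy as np
import sourmash

seqs = [
    "ATGGCATTAACGATTCCGCATTGGACTGCAAT",  # hypothetical sequence A
    "ATGGCATTAACGATTCCGCATTGGACTGCAAC",  # hypothetical sequence B
]

sketches = []
for seq in seqs:
    mh = sourmash.MinHash(n=0, ksize=21, scaled=1)  # FracMinHash sketch
    mh.add_sequence(seq)
    sketches.append(mh)

n = len(sketches)
sim = np.ones((n, n))  # self-similarity is 1.0 on the diagonal
for i in range(n):
    for j in range(i + 1, n):
        sim[i, j] = sim[j, i] = sketches[i].jaccard(sketches[j])

print(sim)
```

Jaccard is symmetric, so one triangle mirrors the other; the containment ANI matrices in these tests are filled asymmetrically because containment is directional.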
- np.testing.assert_array_almost_equal(containment_ANI, true_containment_ANI, decimal=3) + np.testing.assert_array_almost_equal( + containment_ANI, true_containment_ANI, decimal=3 + ) def test_compare_serial_maxcontainmentANI(scaled_siglist): - # check max_containment ANI - max_containment_ANI = compare_serial_max_containment(scaled_siglist, return_ani=True) + max_containment_ANI = compare_serial_max_containment( + scaled_siglist, return_ani=True + ) print(max_containment_ANI) true_max_containment_ANI = np.array( - [[1., 1., 0., 0.], - [1., 1., 0.97715525, 1.], - [0., 0.97715525, 1., 0.97715525], - [0., 1., 0.97715525, 1.]]) + [ + [1.0, 1.0, 0.0, 0.0], + [1.0, 1.0, 0.97715525, 1.0], + [0.0, 0.97715525, 1.0, 0.97715525], + [0.0, 1.0, 0.97715525, 1.0], + ] + ) - np.testing.assert_array_almost_equal(max_containment_ANI, true_max_containment_ANI, decimal=3) + np.testing.assert_array_almost_equal( + max_containment_ANI, true_max_containment_ANI, decimal=3 + ) def test_compare_serial_avg_containmentANI(scaled_siglist): - # check avg_containment ANI - avg_containment_ANI = compare_serial_avg_containment(scaled_siglist, return_ani=True) + avg_containment_ANI = compare_serial_avg_containment( + scaled_siglist, return_ani=True + ) print(avg_containment_ANI) true_avg_containment_ANI = np.array( - [[1., 0.983, 0., 0.], - [0.983, 1., 0.97046289, 0.99333757], - [0., 0.97046289, 1., 0.97697067], - [0., 0.99333757, 0.97697067, 1.]]) - - np.testing.assert_array_almost_equal(avg_containment_ANI, true_avg_containment_ANI, decimal=3) + [ + [1.0, 0.983, 0.0, 0.0], + [0.983, 1.0, 0.97046289, 0.99333757], + [0.0, 0.97046289, 1.0, 0.97697067], + [0.0, 0.99333757, 0.97697067, 1.0], + ] + ) + + np.testing.assert_array_almost_equal( + avg_containment_ANI, true_avg_containment_ANI, decimal=3 + ) diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index fdd9acc53c..34097dd695 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -1,13 +1,14 @@ from sourmash import signature import sourmash_tst_utils as utils + def test_load_textmode(track_abundance): # ijson required a file in binary mode or bytes, # but we had an API example in the docs using 'rt'. 
# I fixed the docs, but I'm keeping this test here # to make sure we still support it =/ - sigfile = utils.get_test_data('genome-s10+s11.sig') - with open(sigfile, 'rt') as sigfp: + sigfile = utils.get_test_data("genome-s10+s11.sig") + with open(sigfile) as sigfp: siglist = list(signature.load_signatures(sigfp)) loaded_sig = siglist[0] - assert loaded_sig.name == 'genome-s10+s11' + assert loaded_sig.name == "genome-s10+s11" diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index 22067dcc68..6b44064a9e 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -3,23 +3,33 @@ """ import pytest import numpy as np -from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common, - handle_seqlen_nkmers, jaccard_to_distance, - ANIResult, ciANIResult, jaccardANIResult, var_n_mutated, - set_size_chernoff, set_size_exact_prob) +from sourmash.distance_utils import ( + containment_to_distance, + get_exp_probability_nothing_common, + handle_seqlen_nkmers, + jaccard_to_distance, + ANIResult, + ciANIResult, + jaccardANIResult, + var_n_mutated, + set_size_chernoff, + set_size_exact_prob, +) + def test_aniresult(): res = ANIResult(0.4, 0.1) assert res.dist == 0.4 assert res.ani == 0.6 assert res.p_nothing_in_common == 0.1 - assert res.p_exceeds_threshold ==True + assert res.p_exceeds_threshold == True # check that they're equivalent res2 = ANIResult(0.4, 0.1) assert res == res2 res3 = ANIResult(0.5, 0) assert res != res3 - assert res3.p_exceeds_threshold ==False + assert res3.p_exceeds_threshold == False + def test_aniresult_bad_distance(): """ @@ -38,18 +48,18 @@ def test_aniresult_bad_distance(): def test_jaccard_aniresult(): res = jaccardANIResult(0.4, 0.1, jaccard_error=0.03) assert res.dist == 0.4 - assert res.ani == None + assert res.ani is None assert res.p_nothing_in_common == 0.1 assert res.jaccard_error == 0.03 - assert res.p_exceeds_threshold ==True - assert res.je_exceeds_threshold ==True + assert res.p_exceeds_threshold == True + assert res.je_exceeds_threshold == True res3 = jaccardANIResult(0.4, 0.1, jaccard_error=0.03, je_threshold=0.1) - assert res3.je_exceeds_threshold ==False + assert res3.je_exceeds_threshold == False assert res3.ani == 0.6 def test_jaccard_aniresult_nojaccarderror(): - #jaccard error is None + # jaccard error is None with pytest.raises(Exception) as exc: jaccardANIResult(0.4, 0.1, None) print("\n", str(exc.value)) @@ -57,14 +67,14 @@ def test_jaccard_aniresult_nojaccarderror(): def test_ci_aniresult(): - res = ciANIResult(0.4, 0.1, dist_low=0.3,dist_high=0.5) + res = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5) print(res) assert res.dist == 0.4 assert res.ani == 0.6 assert res.p_nothing_in_common == 0.1 assert res.ani_low == 0.5 assert res.ani_high == 0.7 - res2 = ciANIResult(0.4, 0.1, dist_low=0.3,dist_high=0.5) + res2 = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5) assert res == res2 res3 = ciANIResult(0.4, 0.2, dist_low=0.3, dist_high=0.5) assert res != res3 @@ -74,12 +84,14 @@ def test_containment_to_distance_zero(): contain = 0 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled, n_unique_kmers=nkmers, estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results - exp_dist,exp_low,exp_high,pnc = 1.0,1.0,1.0,1.0 - exp_id, exp_idlow,exp_idhigh,pnc = 0.0,0.0,0.0,1.0 + exp_dist, exp_low, exp_high, pnc = 1.0, 1.0, 1.0, 1.0 + exp_id, exp_idlow, 
exp_idhigh, pnc = 0.0, 0.0, 0.0, 1.0 assert res.dist == exp_dist assert res.dist_low == exp_low assert res.dist_high == exp_high @@ -88,9 +100,15 @@ def test_containment_to_distance_zero(): assert res.ani_low == exp_idlow assert res.ani_high == exp_idhigh # check without returning ci - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) + res2 = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) print(res2) - exp_res = ciANIResult(dist=1.0, dist_low=1.0, dist_high=1.0, p_nothing_in_common=1.0, p_threshold=0.001) + exp_res = ciANIResult( + dist=1.0, + dist_low=1.0, + dist_high=1.0, + p_nothing_in_common=1.0, + p_threshold=0.001, + ) assert res2 == exp_res @@ -98,11 +116,13 @@ def test_containment_to_distance_one(): contain = 1 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) - exp_dist, exp_low,exp_high,pnc = 0.0,0.0,0.0,0.0 - exp_id, exp_idlow,exp_idhigh,pnc = 1.0,1.0,1.0,0.0 + exp_dist, exp_low, exp_high, pnc = 0.0, 0.0, 0.0, 0.0 + exp_id, exp_idlow, exp_idhigh, pnc = 1.0, 1.0, 1.0, 0.0 assert res.dist == exp_dist assert res.dist_low == exp_low assert res.dist_high == exp_high @@ -112,7 +132,7 @@ def test_containment_to_distance_one(): assert res.ani_high == exp_idhigh # check without returning ci - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) + res = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) assert res.dist == exp_dist assert res.ani == exp_id assert res.p_nothing_in_common == pnc @@ -124,8 +144,10 @@ def test_containment_to_distance_scaled1(): contain = 0.5 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.032468221476108394 @@ -136,17 +158,27 @@ def test_containment_to_distance_scaled1(): assert res.ani_low == 0.9635213980271021 assert res.p_nothing_in_common == 0.0 # without returning ci - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) - assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (0.032468221476108394, 0.9675317785238916, 0.0) - assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (res.dist, res.ani, res.p_nothing_in_common) + res2 = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) + assert (res2.dist, res2.ani, res2.p_nothing_in_common) == ( + 0.032468221476108394, + 0.9675317785238916, + 0.0, + ) + assert (res2.dist, res2.ani, res2.p_nothing_in_common) == ( + res.dist, + res.ani, + res.p_nothing_in_common, + ) def test_containment_to_distance_scaled100(): contain = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 31 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.07158545548052564 @@ -160,8 +192,10 @@ def test_containment_to_distance_scaled100_2(): contain = 0.5 scaled = 100 nkmers = 10000 - ksize=21 - res= containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, 
n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.032468221476108394 @@ -174,8 +208,10 @@ def test_containment_to_distance_k10(): contain = 0.5 scaled = 100 nkmers = 10000 - ksize=10 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 10 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.06696700846319259 @@ -188,17 +224,31 @@ def test_containment_to_distance_confidence(): contain = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - confidence=0.99 - res = containment_to_distance(contain,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers, estimate_ci=True) + ksize = 31 + confidence = 0.99 + res = containment_to_distance( + contain, + ksize, + scaled, + confidence=confidence, + n_unique_kmers=nkmers, + estimate_ci=True, + ) print(res) # check results assert res.dist == 0.07158545548052564 assert res.dist_low == 0.04802880300938562 assert res.dist_high == 0.09619930040790341 assert res.p_exceeds_threshold == False - confidence=0.90 - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,confidence=confidence, estimate_ci=True) + confidence = 0.90 + res2 = containment_to_distance( + contain, + ksize, + scaled, + n_unique_kmers=nkmers, + confidence=confidence, + estimate_ci=True, + ) print(res2) # check results assert res2.dist == res.dist @@ -211,16 +261,30 @@ def test_nkmers_to_bp_containment(): containment = 0.1 scaled = 100 bp_len = 10030 - ksize=31 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + ksize = 31 + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - confidence=0.99 - kmer_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers,estimate_ci=True) - bp_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len,estimate_ci=True) + confidence = 0.99 + kmer_res = containment_to_distance( + containment, + ksize, + scaled, + confidence=confidence, + n_unique_kmers=nkmers, + estimate_ci=True, + ) + bp_res = containment_to_distance( + containment, + ksize, + scaled, + confidence=confidence, + sequence_len_bp=bp_len, + estimate_ci=True, + ) print(f"\nkDIST: {kmer_res}") print(f"\nbpDIST:,{bp_res}") # check results - assert kmer_res==bp_res + assert kmer_res == bp_res assert kmer_res.dist == 0.07158545548052564 assert kmer_res.dist_low == 0.04802880300938562 assert kmer_res.dist_high == 0.09619930040790341 @@ -230,8 +294,8 @@ def test_jaccard_to_distance_zero(): jaccard = 0 scaled = 1 nkmers = 10000 - ksize=21 - res= jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.dist == 1.0 @@ -244,8 +308,8 @@ def test_jaccard_to_distance_one(): jaccard = 1 scaled = 1 nkmers = 10000 - ksize=21 - res= jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.dist == 0.0 @@ -259,36 +323,38 @@ def test_jaccard_to_distance_scaled(): jaccard = 0.5 scaled = 1 nkmers = 10000 - ksize=21 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert round(res.dist, 
3) == round(0.019122659390482077, 3) - assert res.ani == None + assert res.ani is None assert res.p_exceeds_threshold == False assert res.jaccard_error == 0.00018351337045518042 - assert res.je_exceeds_threshold ==True + assert res.je_exceeds_threshold == True scaled = 100 - res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + res2 = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res2) assert res2.dist == res.dist assert res2.jaccard_error == res.jaccard_error assert res2.p_nothing_in_common != res.p_nothing_in_common - assert res2.p_exceeds_threshold ==False + assert res2.p_exceeds_threshold == False def test_jaccard_to_distance_k31(): jaccard = 0.5 scaled = 100 nkmers = 10000 - ksize=31 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 31 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results - assert res.je_exceeds_threshold ==True - assert res.ani == None + assert res.je_exceeds_threshold == True + assert res.ani is None assert res.p_exceeds_threshold == False - res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers, err_threshold=0.1) + res2 = jaccard_to_distance( + jaccard, ksize, scaled, n_unique_kmers=nkmers, err_threshold=0.1 + ) assert res2.je_exceeds_threshold == False assert res2.ani == 0.9870056455892898 @@ -297,8 +363,8 @@ def test_jaccard_to_distance_k31_2(): jaccard = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 31 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.ani == 0.9464928391768298 @@ -310,11 +376,11 @@ def test_nkmers_to_bp_jaccard(): jaccard = 0.1 scaled = 100 bp_len = 10030 - ksize=31 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + ksize = 31 + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - kmer_res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) - bp_res = jaccard_to_distance(jaccard,ksize,scaled,sequence_len_bp=bp_len) + kmer_res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) + bp_res = jaccard_to_distance(jaccard, ksize, scaled, sequence_len_bp=bp_len) print(f"\nkmer_res: {kmer_res}") print(f"\nbp_res: {bp_res}") # check results @@ -329,12 +395,16 @@ def test_exp_prob_nothing_common(): ksize = 31 scaled = 10 bp_len = 1000030 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - nkmers_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,n_unique_kmers=nkmers) + nkmers_pnc = get_exp_probability_nothing_common( + dist, ksize, scaled, n_unique_kmers=nkmers + ) print(f"prob nothing in common: {nkmers_pnc}") - bp_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,sequence_len_bp=bp_len) + bp_pnc = get_exp_probability_nothing_common( + dist, ksize, scaled, sequence_len_bp=bp_len + ) assert nkmers_pnc == bp_pnc == 7.437016945722123e-07 @@ -347,15 +417,17 @@ def test_containment_to_distance_tinytestdata_var0(): contain = 0.9 scaled = 1 nkmers = 4 - ksize=31 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers, estimate_ci=True) + ksize = 31 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.003392957179023992 - assert res.dist_low == None - 
assert res.dist_high == None - assert res.ani_low == None - assert res.ani_high == None + assert res.dist_low is None + assert res.dist_high is None + assert res.ani_low is None + assert res.ani_high is None assert res.p_exceeds_threshold == False @@ -364,7 +436,7 @@ def test_var_n_mutated(): r = 0 ksize = 31 nkmers = 200 - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) print(f"var_n_mutated: {var_n_mut}") assert var_n_mut == 0 # check var 0.0 valuerror @@ -372,51 +444,71 @@ def test_var_n_mutated(): ksize = 31 nkmers = 200 with pytest.raises(ValueError) as exc: - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) assert "Error: varN <0.0!" in str(exc) # check successful r = 0.4 ksize = 31 nkmers = 200000 - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) print(f"var_n_mutated: {var_n_mut}") assert var_n_mut == 0.10611425440741508 def test_handle_seqlen_nkmers(): bp_len = 10030 - ksize=31 + ksize = 31 # convert seqlen to nkmers - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) assert nkmers == 10000 # if nkmers is provided, just use that - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len, n_unique_kmers= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len, n_unique_kmers=bp_len) assert nkmers == 10030 # if neither seqlen or nkmers provided, complain with pytest.raises(ValueError) as exc: nkmers = handle_seqlen_nkmers(ksize) - assert("Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'") in str(exc) + assert ( + "Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'" + ) in str(exc) def test_set_size_chernoff(): - eps = 10**(-6) + eps = 10 ** (-6) rel_error = 0.01 set_size = 1000000 - s = 1/0.1 # I'm used to using a scale value between 0 and 1 + s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 value_from_mathematica = 0.928652 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) rel_error = 0.05 set_size = 10000 s = 1 value_from_mathematica = 0.999519 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) rel_error = 0.001 set_size = 10 - s = 1/.01 + s = 1 / 0.01 value_from_mathematica = -1 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) def test_set_size_exact_prob(): diff --git a/tests/test_hll.py b/tests/test_hll.py index da8d3aad68..d49336bf7a 100644 --- a/tests/test_hll.py +++ b/tests/test_hll.py @@ -11,7 +11,7 @@ K = 21 # size of kmer ERR_RATE = 0.01 N_UNIQUE = 3356 -TRANSLATE = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'} +TRANSLATE = {"A": "T", "C": "G", "T": "A", "G": "C"} def test_hll_add_python(): @@ -19,16 +19,16 @@ def test_hll_add_python(): # use the lower level add() method, which accepts anything, # and compare to an exact count using collections.Counter - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") 
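Cross-checking the `containment_to_distance` constants asserted above: they are consistent with the standard FracMinHash point estimate dist = 1 - C^(1/k), equivalently ANI = C^(1/k). Below is a stdlib-only check of two of the asserted values; it covers the point estimate only, since the confidence intervals and `p_nothing_in_common` values require the full `distance_utils` machinery.

```python
# Verify dist = 1 - containment**(1/ksize) against two constants asserted
# in the distance_utils tests above (point estimate only; CIs and p-values
# are not recomputed here).
def containment_point_distance(containment: float, ksize: int) -> float:
    return 1.0 - containment ** (1.0 / ksize)

# contain=0.5, k=21 (test_containment_to_distance_scaled1 and friends)
assert abs(containment_point_distance(0.5, 21) - 0.032468221476108394) < 1e-12
# contain=0.5, k=10 (test_containment_to_distance_k10)
assert abs(containment_point_distance(0.5, 10) - 0.06696700846319259) < 1e-12
print("point estimates match the asserted constants")
```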
hll = HLL(ERR_RATE, K) counter = set() with open(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] rc = "".join(TRANSLATE[c] for c in kmer[::-1]) hll.add(kmer) @@ -47,12 +47,12 @@ def test_hll_consume_string(): # test rust code to count unique kmers using HyperLogLog, # using screed to feed each read to the counter. - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") hll = HLL(ERR_RATE, K) - n_consumed = n = 0 + n = 0 with open(filename) as f: for n, record in enumerate(fasta_iter(f), 1): - hll.add_sequence(record['sequence']) + hll.add_sequence(record["sequence"]) assert abs(1 - float(len(hll)) / N_UNIQUE) < ERR_RATE @@ -60,10 +60,9 @@ def test_hll_consume_string(): def test_hll_similarity_containment(): N_UNIQUE_H1 = 500741 N_UNIQUE_H2 = 995845 - N_UNIQUE_U = 995845 SIMILARITY = 0.502783 - CONTAINMENT_H1 = 1. + CONTAINMENT_H1 = 1.0 CONTAINMENT_H2 = 0.502783 INTERSECTION = 500838 @@ -72,23 +71,23 @@ def test_hll_similarity_containment(): hll2 = HLL(ERR_RATE, K) hllu = HLL(ERR_RATE, K) - filename = utils.get_test_data('genome-s10.fa.gz') + filename = utils.get_test_data("genome-s10.fa.gz") with gzip.GzipFile(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] hll1.add(kmer) hllu.add(kmer) - filename = utils.get_test_data('genome-s10+s11.fa.gz') + filename = utils.get_test_data("genome-s10+s11.fa.gz") with gzip.GzipFile(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] hll2.add(kmer) hllu.add(kmer) @@ -113,13 +112,14 @@ def test_hll_similarity_containment(): assert abs(1 - float(hll1.intersection(hllu)) / N_UNIQUE_U) < ERR_RATE """ + def test_hll_save_load(): - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") hll = HLL(ERR_RATE, K) - n_consumed = n = 0 + n = 0 with open(filename) as f: for n, record in enumerate(fasta_iter(f), 1): - hll.add_sequence(record['sequence']) + hll.add_sequence(record["sequence"]) assert abs(1 - float(len(hll)) / N_UNIQUE) < ERR_RATE diff --git a/tests/test_index.py b/tests/test_index.py index af0c1da890..b207376443 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -9,10 +9,15 @@ import sourmash from sourmash import load_one_signature, SourmashSignature -from sourmash.index import (LinearIndex, ZipFileLinearIndex, - make_jaccard_search_query, CounterGather, - LazyLinearIndex, MultiIndex, - StandaloneManifestIndex) +from sourmash.index import ( + LinearIndex, + ZipFileLinearIndex, + make_jaccard_search_query, + CounterGather, + LazyLinearIndex, + MultiIndex, + StandaloneManifestIndex, +) from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory from sourmash import sourmash_args @@ -90,7 +95,7 @@ def test_simple_index(n_children): def test_linear_index_prefetch_empty(): # check that an exception is raised upon for an empty LinearIndex - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) 
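The HyperLogLog tests above validate the estimator against an exact unique-k-mer count, building each k-mer's reverse complement from a small translation table. Here is a self-contained version of that exact baseline; the input sequence is a made-up stand-in for `ecoli.genes.fna`, and canonicalizing via `min(kmer, rc)` is an assumed convention, since the exact-counter update itself falls outside the quoted hunks.

```python
# Exact unique canonical k-mer baseline, mirroring the reverse-complement
# handling in the HLL tests above. The sequence is invented for illustration.
K = 21
TRANSLATE = {"A": "T", "C": "G", "T": "A", "G": "C"}


def canonical_kmers(sequence: str, k: int = K):
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i : i + k]
        rc = "".join(TRANSLATE[c] for c in kmer[::-1])
        # treat a k-mer and its reverse complement as one entity (assumed)
        yield min(kmer, rc)


seq = "ATGGCATTAACGATTCCGCATTGGACTGCAATCCGGA"
exact = set(canonical_kmers(seq))
print(len(exact), "unique canonical k-mers")
```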
lidx = LinearIndex() @@ -111,8 +116,8 @@ class FakeSignature: def minhash(self): raise Exception("don't touch me!") - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -142,8 +147,8 @@ def minhash(self): def test_linear_index_search_subj_has_abundance(): # check that search signatures in the index are flattened appropriately. - queryfile = utils.get_test_data('47.fa.sig') - subjfile = utils.get_test_data('track_abund/47.fa.sig') + queryfile = utils.get_test_data("47.fa.sig") + subjfile = utils.get_test_data("track_abund/47.fa.sig") qs = sourmash.load_one_signature(queryfile) ss = sourmash.load_one_signature(subjfile) @@ -159,8 +164,8 @@ def test_linear_index_search_subj_has_abundance(): def test_linear_index_gather_subj_has_abundance(): # check that target signatures in the index are flattened appropriately. - queryfile = utils.get_test_data('47.fa.sig') - subjfile = utils.get_test_data('track_abund/47.fa.sig') + queryfile = utils.get_test_data("47.fa.sig") + subjfile = utils.get_test_data("track_abund/47.fa.sig") qs = sourmash.load_one_signature(queryfile) ss = sourmash.load_one_signature(subjfile) @@ -178,7 +183,9 @@ def test_linear_index_gather_subj_has_abundance(): def test_index_search_subj_scaled_is_lower(): # check that subject sketches are appropriately downsampled for scaled # sketches. - sigfile = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') + sigfile = utils.get_test_data( + "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" + ) ss = sourmash.load_one_signature(sigfile) # double check :) @@ -201,7 +208,7 @@ def test_index_search_subj_scaled_is_lower(): def test_index_search_subj_num_is_lower(): # check that subject sketches are appropriately downsampled for num # sketches - sigfile = utils.get_test_data('num/47.fa.sig') + sigfile = utils.get_test_data("num/47.fa.sig") ss = sourmash.load_one_signature(sigfile, ksize=31) # double check :) @@ -223,7 +230,7 @@ def test_index_search_subj_num_is_lower(): def test_index_search_query_num_is_lower(): # check that query sketches are appropriately downsampled for num. 
- sigfile = utils.get_test_data('num/47.fa.sig') + sigfile = utils.get_test_data("num/47.fa.sig") qs = sourmash.load_one_signature(sigfile, ksize=31) # double check :) @@ -244,8 +251,8 @@ def test_index_search_query_num_is_lower(): def test_linear_index_search_abund(): # test Index.search_abund - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -262,8 +269,8 @@ def test_linear_index_search_abund(): def test_linear_index_search_abund_downsample_query(): # test Index.search_abund with query with higher scaled - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -285,8 +292,8 @@ def test_linear_index_search_abund_downsample_query(): def test_linear_index_search_abund_downsample_subj(): # test Index.search_abund with subj with higher scaled - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -308,8 +315,8 @@ def test_linear_index_search_abund_downsample_subj(): def test_linear_index_search_abund_requires_threshold(): # test that Index.search_abund requires a 'threshold' - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -319,15 +326,15 @@ def test_linear_index_search_abund_requires_threshold(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=None)) + list(lidx.search_abund(ss47, threshold=None)) assert "'search_abund' requires 'threshold'" in str(exc.value) def test_linear_index_search_abund_query_flat(): # test that Index.search_abund requires an abund query sig - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) ss63 = sourmash.load_one_signature(sig63) @@ -337,15 +344,17 @@ def test_linear_index_search_abund_query_flat(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=0)) + list(lidx.search_abund(ss47, threshold=0)) - assert "'search_abund' requires query signature with abundance information" in str(exc.value) + assert "'search_abund' requires query signature with abundance information" in str( + exc.value + ) def test_linear_index_search_abund_subj_flat(): # test Index.search_abund requires an abund subj - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) 
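The `LinearIndex` tests above and below exercise insert, search, abundance handling, and downsampling. A minimal end-to-end sketch follows, assuming the sourmash 4.x Python API in which `Index.search()` requires a `threshold` and yields result tuples with `score`, `signature`, and `location` fields; the sequences and names are invented for illustration.

```python
# Minimal LinearIndex round trip: sketch two sequences, index one, search
# with the other. Assumes the sourmash 4.x API; inputs are invented.
import sourmash
from sourmash import SourmashSignature
from sourmash.index import LinearIndex


def make_sig(seq: str, name: str) -> SourmashSignature:
    mh = sourmash.MinHash(n=0, ksize=21, scaled=1)
    mh.add_sequence(seq)
    return SourmashSignature(mh, name=name)


query = make_sig("ATGGCATTAACGATTCCGCATTGGACTGCAAT", "query")
subject = make_sig("ATGGCATTAACGATTCCGCATTGGACTGCAAC", "subject")

lidx = LinearIndex()
lidx.insert(subject)

for result in lidx.search(query, threshold=0.0):
    print(result.signature.name, result.score)
```

With `scaled=1` every hash is kept, so the printed score is the exact Jaccard between the two sequences' 21-mer sets.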
@@ -355,16 +364,19 @@ def test_linear_index_search_abund_subj_flat(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=0)) + list(lidx.search_abund(ss47, threshold=0)) - assert "'search_abund' requires subject signatures with abundance information" in str(exc.value) + assert ( + "'search_abund' requires subject signatures with abundance information" + in str(exc.value) + ) def test_linear_index_save(runtmp): # test save output from LinearIndex => JSON - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -375,7 +387,7 @@ def test_linear_index_save(runtmp): linear.insert(ss47) linear.insert(ss63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") linear.save(filename) si = set(sourmash.load_file_as_signatures(filename)) @@ -385,24 +397,24 @@ def test_linear_index_save(runtmp): print(len(si)) print(len(x)) - print('si: ', si) - print('x: ', x) + print("si: ", si) + print("x: ", x) assert si == x, si def test_linear_index_load(runtmp): # test .load class method of LinearIndex - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) - filename = runtmp.output('foo') - with open(filename, 'wt') as fp: + filename = runtmp.output("foo") + with open(filename, "w") as fp: sourmash.save_signatures([ss2, ss47, ss63], fp) linear = LinearIndex.load(filename) @@ -414,9 +426,9 @@ def test_linear_index_load(runtmp): def test_linear_index_save_load(runtmp): # LinearIndex save/load round trip - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -427,7 +439,7 @@ def test_linear_index_save_load(runtmp): linear.insert(ss47) linear.insert(ss63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") linear.save(filename) linear2 = LinearIndex.load(filename) @@ -440,9 +452,9 @@ def test_linear_index_save_load(runtmp): def test_linear_gather_threshold_1(): # test gather() method, in some detail - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) linear = LinearIndex() @@ -498,11 +510,11 @@ def test_linear_gather_threshold_1(): def test_linear_gather_threshold_5(): # test gather() method above threshold - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - 
sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) - linear = LinearIndex(filename='foo') + linear = LinearIndex(filename="foo") linear.insert(sig47) linear.insert(sig63) @@ -528,21 +540,20 @@ def test_linear_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig == sig2 - assert name == 'foo' + assert name == "foo" # now, check with a threshold_bp that should be meet-able. - result = linear.best_containment(SourmashSignature(new_mh), - threshold_bp=5000) + result = linear.best_containment(SourmashSignature(new_mh), threshold_bp=5000) assert result containment, match_sig, name = result assert containment == 1.0 assert match_sig == sig2 - assert name == 'foo' + assert name == "foo" def test_linear_index_multik_select(): # test that LinearIndx can load multiple (three) ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -550,17 +561,17 @@ def test_linear_index_multik_select(): linear.insert(ss) # select most specifically - linear2 = linear.select(ksize=31, moltype='DNA') + linear2 = linear.select(ksize=31, moltype="DNA") assert len(linear2) == 1 # all are DNA: - linear2 = linear.select(moltype='DNA') + linear2 = linear.select(moltype="DNA") assert len(linear2) == 3 def test_linear_index_moltype_select(): # this loads two ksizes(21, 10), and two moltypes (DNA and protein) - filename = utils.get_test_data('genome-s10+s11.sig') + filename = utils.get_test_data("genome-s10+s11.sig") siglist = sourmash.load_file_as_signatures(filename) linear = LinearIndex() @@ -568,19 +579,19 @@ def test_linear_index_moltype_select(): linear.insert(ss) # select most specific DNA - linear2 = linear.select(ksize=30, moltype='DNA') + linear2 = linear.select(ksize=30, moltype="DNA") assert len(linear2) == 1 # select most specific protein - linear2 = linear.select(ksize=10, moltype='protein') + linear2 = linear.select(ksize=10, moltype="protein") assert len(linear2) == 1 # can leave off ksize, selects all ksizes - linear2 = linear.select(moltype='DNA') + linear2 = linear.select(moltype="DNA") assert len(linear2) == 2 # can leave off ksize, selects all ksizes - linear2 = linear.select(moltype='protein') + linear2 = linear.select(moltype="protein") assert len(linear2) == 2 # select something impossible @@ -592,7 +603,7 @@ def test_linear_index_picklist_select(): # test LinearIndex.select with a picklist # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -600,22 +611,22 @@ def test_linear_index_picklist_select(): linear.insert(ss) # construct a picklist... 
- picklist = SignaturePicklist('md5prefix8') - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["f3a90d4e"]) # select on picklist linear2 = linear.select(picklist=picklist) assert len(linear2) == 1 ss = list(linear2.signatures())[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('f3a90d4e55') + assert ss.md5sum().startswith("f3a90d4e55") def test_linear_index_picklist_select_exclude(): # test select with a picklist, but exclude # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -623,8 +634,8 @@ def test_linear_index_picklist_select_exclude(): linear.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["f3a90d4e"]) # select on picklist linear2 = linear.select(picklist=picklist) @@ -634,37 +645,39 @@ def test_linear_index_picklist_select_exclude(): for ss in list(linear2.signatures()): md5s.add(ss.md5sum()) ksizes.add(ss.minhash.ksize) - assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0']) - assert ksizes == set([21,51]) + assert md5s == set( + ["f372e47893edd349e5956f8b0d8dcbf7", "43f3b48e59443092850964d355a20ac0"] + ) + assert ksizes == set([21, 51]) def test_index_same_md5sum_fsstorage(runtmp): # check SBT directory 'save' with two signatures that have identical md5 c = runtmp - testdata1 = utils.get_test_data('img/2706795855.sig') - testdata2 = utils.get_test_data('img/638277004.sig') + testdata1 = utils.get_test_data("img/2706795855.sig") + testdata2 = utils.get_test_data("img/638277004.sig") - c.run_sourmash('index', '-k', '21', 'zzz.sbt.json', testdata1, testdata2) + c.run_sourmash("index", "-k", "21", "zzz.sbt.json", testdata1, testdata2) assert c.last_result.status == 0 - outfile = c.output('zzz.sbt.json') + outfile = c.output("zzz.sbt.json") assert os.path.exists(outfile) - storage = c.output('.sbt.zzz') + storage = c.output(".sbt.zzz") assert len(glob.glob(storage + "/*")) == 4 def test_index_same_md5sum_sbt_zipstorage(runtmp): # check SBT zipfile 'save' with two signatures w/identical md5 c = runtmp - testdata1 = utils.get_test_data('img/2706795855.sig') - testdata2 = utils.get_test_data('img/638277004.sig') + testdata1 = utils.get_test_data("img/2706795855.sig") + testdata2 = utils.get_test_data("img/638277004.sig") - c.run_sourmash('index', '-k', '21', 'zzz.sbt.zip', testdata1, testdata2) + c.run_sourmash("index", "-k", "21", "zzz.sbt.zip", testdata1, testdata2) assert c.last_result.status == 0 - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) - zout = zipfile.ZipFile(outfile, mode='r') + zout = zipfile.ZipFile(outfile, mode="r") # should have 3 files, 1 internal and two sigs. 
We check the entries under
     # ".sbt.zzz/" in namelist(), which also includes the directory entry itself
     assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 5

@@ -672,11 +685,11 @@ def test_index_same_md5sum_sbt_zipstorage(runtmp):

 def test_zipfile_does_not_exist(runtmp):
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.sourmash('sig', 'describe', 'no-exist.zip')
+        runtmp.sourmash("sig", "describe", "no-exist.zip")

     # old behavior, pre PR #1777
-    assert 'FileNotFoundError: SOURMASH-MANIFEST.csv' not in str(exc)
-    assert not os.path.exists(runtmp.output('no-exist.zip'))
+    assert "FileNotFoundError: SOURMASH-MANIFEST.csv" not in str(exc)
+    assert not os.path.exists(runtmp.output("no-exist.zip"))

     # correct behavior
     assert "ERROR: Error while reading signatures from 'no-exist.zip'." in str(exc)

@@ -686,90 +699,102 @@ def test_zipfile_protein_command_search(runtmp):
     # test command-line search/gather of zipfile with protein sigs
     c = runtmp

-    sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db_out = utils.get_test_data('prot/protein.zip')
+    sigfile1 = utils.get_test_data(
+        "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    db_out = utils.get_test_data("prot/protein.zip")

-    c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
-    assert '2 matches' in c.last_result.out
+    c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0")
+    assert "2 matches" in c.last_result.out

-    c.run_sourmash('gather', sigfile1, db_out)
-    assert 'found 1 matches total' in c.last_result.out
-    assert 'the recovered matches hit 100.0% of the query' in c.last_result.out
+    c.run_sourmash("gather", sigfile1, db_out)
+    assert "found 1 matches total" in c.last_result.out
+    assert "the recovered matches hit 100.0% of the query" in c.last_result.out


 def test_zipfile_hp_command_search(runtmp):
     # test command-line search/gather of zipfile with hp sigs
     c = runtmp

-    sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db_out = utils.get_test_data('prot/hp.zip')
+    sigfile1 = utils.get_test_data(
+        "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    db_out = utils.get_test_data("prot/hp.zip")

-    c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
-    assert '2 matches' in c.last_result.out
+    c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0")
+    assert "2 matches" in c.last_result.out

-    c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
-    assert 'found 1 matches total' in c.last_result.out
-    assert 'the recovered matches hit 100.0% of the query' in c.last_result.out
+    c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0")
+    assert "found 1 matches total" in c.last_result.out
+    assert "the recovered matches hit 100.0% of the query" in c.last_result.out


 def test_zipfile_dayhoff_command_search(runtmp):
     # test command-line search/gather of zipfile with dayhoff sigs
     c = runtmp

-    sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db_out = utils.get_test_data('prot/dayhoff.zip')
+    sigfile1 = utils.get_test_data(
+        "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    db_out = utils.get_test_data("prot/dayhoff.zip")

-    c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
-    assert '2 matches' in c.last_result.out
+    c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0")
+    assert "2 matches" in c.last_result.out

-    c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
-    assert 'found 1 matches 
total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_protein_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with protein sigs c = runtmp - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_hp_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with hp sigs c = runtmp - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_dayhoff_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with dayhoff sigs c = runtmp - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_dayhoff_command_search_protein(runtmp): @@ -777,21 +802,23 @@ def test_zipfile_dayhoff_command_search_protein(runtmp): c = runtmp # with dayhoff 
query - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") print(c.last_result.out) print(c.last_result.err) - assert 'no compatible signatures found in ' in c.last_result.err + assert "no compatible signatures found in " in c.last_result.err def test_zipfile_API_signatures(use_manifest): # return all of the .sig and .sig.gz files in all.zip - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) siglist = list(zipidx.signatures()) @@ -814,7 +841,7 @@ def __init__(self): pass def signatures(self): - yield 'a' + yield "a" raise Exception("don't touch me!") def __len__(self): @@ -832,10 +859,11 @@ def __len__(self): def test_zipfile_API_signatures_traverse_yield_all(use_manifest): # include dna-sig.noext, but not build.sh (cannot be loaded as signature) - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=use_manifest) + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=use_manifest + ) siglist = list(zipidx.signatures()) assert len(siglist) == 8 assert len(zipidx) == 8 @@ -848,11 +876,12 @@ def test_zipfile_API_signatures_traverse_yield_all(use_manifest): def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=use_manifest) - zipidx = zipidx.select(moltype='DNA') + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=use_manifest + ) + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) assert len(siglist) == 2 assert len(zipidx) == 2 @@ -860,14 +889,15 @@ def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest): def test_zipfile_API_signatures_traverse_yield_all_manifest(): # check that manifest len is correct - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=True) + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=True + ) assert len(zipidx) == 8, len(zipidx) assert len(zipidx.manifest) == 8, len(zipidx.manifest) - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) assert len(siglist) == 2 assert len(zipidx) == 2 @@ -876,13 +906,13 @@ def test_zipfile_API_signatures_traverse_yield_all_manifest(): def test_zipfile_API_signatures_select(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) - ziplist_pre = 
ziplist_pre.select(moltype='DNA') + ziplist_pre = ziplist_pre.select(moltype="DNA") - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) if use_manifest: @@ -897,7 +927,7 @@ def test_zipfile_API_signatures_select(use_manifest): def test_zipfile_API_signatures_select_abund_false(use_manifest): # check for abund=False (all signatures match b/c can convert) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -913,7 +943,7 @@ def test_zipfile_API_signatures_select_abund_false(use_manifest): def test_zipfile_API_signatures_select_abund_true(use_manifest): # find all abund=True (all signatures match, b/c abund) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -929,7 +959,7 @@ def test_zipfile_API_signatures_select_abund_true(use_manifest): def test_zipfile_API_signatures_select_abund_none(use_manifest): # find all abund=None (all signatures match, b/c no selection criteria) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -945,14 +975,14 @@ def test_zipfile_API_signatures_select_abund_none(use_manifest): def test_zipfile_API_signatures_select_twice(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) - ziplist_pre = ziplist_pre.select(moltype='DNA') + ziplist_pre = ziplist_pre.select(moltype="DNA") ziplist_pre = ziplist_pre.select(ksize=31) - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") zipidx = zipidx.select(ksize=31) siglist = list(zipidx.signatures()) @@ -968,17 +998,17 @@ def test_zipfile_API_signatures_select_twice(use_manifest): def test_zipfile_API_save(): # ZipFileLinearIndex.save is not implemented. - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db) with pytest.raises(NotImplementedError): - zipidx.save('xxx') + zipidx.save("xxx") def test_zipfile_API_insert(): # ZipFileLinearIndex.insert is not implemented. 
- zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db) @@ -989,7 +1019,7 @@ def test_zipfile_API_insert(): def test_zipfile_API_location(use_manifest): # test ZipFileLinearIndex.location property - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) @@ -1000,9 +1030,8 @@ def test_zipfile_load_file_as_signatures(use_manifest): # make sure that ZipFileLinearIndex.signatures works, and is generator from types import GeneratorType - zipfile_db = utils.get_test_data('prot/all.zip') - sigs = sourmash_args.load_file_as_signatures(zipfile_db, - _use_manifest=use_manifest) + zipfile_db = utils.get_test_data("prot/all.zip") + sigs = sourmash_args.load_file_as_signatures(zipfile_db, _use_manifest=use_manifest) # it's fine if this needs to change, but for now I want to make # sure that this is a generator. @@ -1019,10 +1048,10 @@ def test_zipfile_load_file_as_signatures_traverse_yield_all(use_manifest): # test with --force, which loads all files from types import GeneratorType - zipfile_db = utils.get_test_data('prot/all.zip') - sigs = sourmash_args.load_file_as_signatures(zipfile_db, - yield_all_files=True, - _use_manifest=use_manifest) + zipfile_db = utils.get_test_data("prot/all.zip") + sigs = sourmash_args.load_file_as_signatures( + zipfile_db, yield_all_files=True, _use_manifest=use_manifest + ) # it's fine if this needs to change, but for now I want to make # sure that this is a generator. @@ -1036,21 +1065,21 @@ def test_zipfile_load_database_fail_if_not_zip(runtmp): # fail _load_database if not .zip c = runtmp - zipfile_db = utils.get_test_data('prot/all.zip') - badname = c.output('xyz.nada') + zipfile_db = utils.get_test_data("prot/all.zip") + badname = c.output("xyz.nada") shutil.copyfile(zipfile_db, badname) with pytest.raises(ValueError) as exc: - sigs = sourmash_args.load_file_as_signatures(badname) + sourmash_args.load_file_as_signatures(badname) - assert 'Error while reading signatures from' in str(exc.value) + assert "Error while reading signatures from" in str(exc.value) def test_multi_index_search(): # test MultiIndex.search - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -1061,8 +1090,7 @@ def test_multi_index_search(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], - None) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None) lidx = lidx.select(ksize=31) # now, search for sig2 @@ -1070,7 +1098,7 @@ def test_multi_index_search(): print([s[1].name for s in sr]) assert len(sr) == 1 assert sr[0][1] == ss2 - assert sr[0][2] == 'A' # source override + assert sr[0][2] == "A" # source override # search for sig47 with lower threshold; search order not guaranteed. 
sr = lidx.search(ss47, threshold=0.1)
@@ -1078,9 +1106,9 @@
     assert len(sr) == 2
     sr.sort(key=lambda x: -x[0])
     assert sr[0][1] == ss47
-    assert sr[0][2] == sig47 # source was set to None, so no override
+    assert sr[0][2] == sig47  # source was set to None, so no override
     assert sr[1][1] == ss63
-    assert sr[1][2] == 'C' # source override
+    assert sr[1][2] == "C"  # source override

     # search for sig63 with lower threshold; search order not guaranteed.
     sr = lidx.search(ss63, threshold=0.1)
@@ -1088,9 +1116,9 @@
     assert len(sr) == 2
     sr.sort(key=lambda x: -x[0])
     assert sr[0][1] == ss63
-    assert sr[0][2] == 'C' # source override
+    assert sr[0][2] == "C"  # source override
     assert sr[1][1] == ss47
-    assert sr[1][2] == sig47 # source was set to None, so no override
+    assert sr[1][2] == sig47  # source was set to None, so no override

     # search for sig63 with high threshold => 1 match
     sr = lidx.search(ss63, threshold=0.8)
@@ -1098,45 +1126,44 @@
     assert len(sr) == 1
     sr.sort(key=lambda x: -x[0])
     assert sr[0][1] == ss63
-    assert sr[0][2] == 'C' # source override
+    assert sr[0][2] == "C"  # source override


 def test_multi_index_gather():
     # test MultiIndex.best_containment
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     ss2 = sourmash.load_one_signature(sig2, ksize=31)
     ss47 = sourmash.load_one_signature(sig47)
-    ss63 = sourmash.load_one_signature(sig63)
+    sourmash.load_one_signature(sig63)

     lidx1 = LinearIndex.load(sig2)
     lidx2 = LinearIndex.load(sig47)
     lidx3 = LinearIndex.load(sig63)

     # create MultiIndex with source location override
-    lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'],
-                           None)
+    lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None)
     lidx = lidx.select(ksize=31)

     match = lidx.best_containment(ss2)
     assert match
     assert match.score == 1.0
-    assert match.location == 'A'
+    assert match.location == "A"

     match = lidx.best_containment(ss47)
     assert match
     assert match.score == 1.0
     assert match.signature == ss47
-    assert match.location == sig47 # no source override
+    assert match.location == sig47  # no source override


 def test_multi_index_signatures():
     # test MultiIndex.signatures
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     ss2 = sourmash.load_one_signature(sig2, ksize=31)
     ss47 = sourmash.load_one_signature(sig47)
@@ -1147,8 +1174,7 @@ def test_multi_index_signatures():
     lidx3 = LinearIndex.load(sig63)

     # create MultiIndex with source location override
-    lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'],
-                           None)
+    lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None)
     lidx = lidx.select(ksize=31)

     siglist = list(lidx.signatures())
@@ -1168,13 +1194,13 @@ def test_multi_index_create_prepend():
     # test MultiIndex constructor - location must be specified if
     # 'prepend_location' is True
     with pytest.raises(ValueError):
-        mi = MultiIndex(None, None, prepend_location=True)
+        MultiIndex(None, None, prepend_location=True)


 def test_multi_index_load_from_directory():
     # test MultiIndex loading from a directory.
The full paths to the # signature files should be available via 'signatures_with_location()' - dirname = utils.get_test_data('prot/protein') + dirname = utils.get_test_data("prot/protein") mi = MultiIndex.load_from_directory(dirname, force=False) assert mi.location == dirname @@ -1183,10 +1209,12 @@ def test_multi_index_load_from_directory(): assert len(sigs) == 2 # check to make sure that full paths to expected sig files are returned - locs = [ x[1] for x in mi.signatures_with_location() ] + locs = [x[1] for x in mi.signatures_with_location()] - endings = ('GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + endings = ( + "GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + ) for loc in locs: found = False for end in endings: @@ -1195,16 +1223,16 @@ def test_multi_index_load_from_directory(): assert found, f"could not find full filename in locations for {end}" # also check internal locations and parent value -- - assert mi.parent.endswith('prot/protein') + assert mi.parent.endswith("prot/protein") - ilocs = [ x[1] for x in mi._signatures_with_internal() ] + ilocs = [x[1] for x in mi._signatures_with_internal()] assert endings[0] in ilocs, ilocs assert endings[1] in ilocs, ilocs def test_multi_index_load_from_directory_2(): # only load .sig files, currently; not the databases under that directory. - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") mi = MultiIndex.load_from_directory(dirname, force=False) sigs = list(mi.signatures()) @@ -1214,13 +1242,12 @@ def test_multi_index_load_from_directory_2(): def test_multi_index_load_from_directory_3_simple_bad_file(runtmp): # check that force=False fails properly when confronted with non-JSON # files. - c = runtmp - with open(runtmp.output('badsig.sig'), 'wt') as fp: - fp.write('bad content.') + with open(runtmp.output("badsig.sig"), "w") as fp: + fp.write("bad content.") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(runtmp.location, force=False) + MultiIndex.load_from_directory(runtmp.location, force=False) def test_multi_index_load_from_directory_3(runtmp): @@ -1228,7 +1255,7 @@ def test_multi_index_load_from_directory_3(runtmp): # files that are legit sourmash files... c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): @@ -1240,7 +1267,7 @@ def test_multi_index_load_from_directory_3(runtmp): count += 1 with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(c.location, force=False) + MultiIndex.load_from_directory(c.location, force=False) def test_multi_index_load_from_directory_3_yield_all_true(runtmp): @@ -1248,7 +1275,7 @@ def test_multi_index_load_from_directory_3_yield_all_true(runtmp): # Note here that only .sig/.sig.gz files are loaded. c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): @@ -1269,7 +1296,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(runtmp): # check that force works ok on subdirectories. # Note here that only .sig/.sig.gz files are loaded. 
c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") target_dir = c.output("some_subdir") os.mkdir(target_dir) @@ -1285,7 +1312,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(runtmp): mi = MultiIndex.load_from_directory(c.location, force=True) - locations = set([ row['internal_location'] for row in mi.manifest.rows ]) + locations = set([row["internal_location"] for row in mi.manifest.rows]) print(locations) sigs = list(mi.signatures()) @@ -1296,12 +1323,12 @@ def test_multi_index_load_from_directory_3_sig_gz(runtmp): # check that we find .sig.gz files, too c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): for name in files: - if not name.endswith('.sig'): # skip non .sig things + if not name.endswith(".sig"): # skip non .sig things continue print(f"at {name}") fullname = os.path.join(root, name) @@ -1321,26 +1348,25 @@ def test_multi_index_load_from_directory_3_check_traverse_fn(runtmp): # test the actual traverse function... eventually this test can be # removed, probably, as we consolidate functionality and test MultiIndex # better. - c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 7, files files = list(sourmash_args.traverse_find_sigs([dirname], True)) - assert len(files) == 20, files # if this fails, check for extra files! + assert len(files) == 20, files # if this fails, check for extra files! def test_multi_index_load_from_directory_no_exist(): # raise ValueError on files that don't exist in load_from_directory - dirname = utils.get_test_data('does-not-exist') + dirname = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(dirname, force=True) + MultiIndex.load_from_directory(dirname, force=True) def test_multi_index_load_from_file_path(): # test that MultiIndex.load_from_path works fine - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") mi = MultiIndex.load_from_path(sig2) assert len(mi) == 3 @@ -1349,29 +1375,29 @@ def test_multi_index_load_from_file_path(): def test_multi_index_load_from_file_path_no_exist(): # test that load_from_path fails on non-existent files - filename = utils.get_test_data('does-not-exist') + filename = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(filename, force=True) + MultiIndex.load_from_directory(filename, force=True) def test_multi_index_load_from_pathlist_no_exist(): # test that load_from_pathlist fails on non-existent files - dirname = utils.get_test_data('does-not-exist') + dirname = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_pathlist(dirname) + MultiIndex.load_from_pathlist(dirname) def test_multi_index_load_from_pathlist_1(runtmp): # test functionality of MultiIndex.load_from_pathlist with .sig files c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 7, files - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print("\n".join(files), file=fp) mi = MultiIndex.load_from_pathlist(file_list) @@ -1388,54 +1414,57 @@ def test_multi_index_load_from_pathlist_2(runtmp): # 
CTB note: if you create extra files under this directory, # it will fail :) c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname], True)) - assert len(files) == 20, files # check there aren't extra files in here! + assert len(files) == 20, files # check there aren't extra files in here! - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print("\n".join(files), file=fp) with pytest.raises(ValueError) as exc: - mi = MultiIndex.load_from_pathlist(file_list) + MultiIndex.load_from_pathlist(file_list) print(str(exc)) - assert 'Error while reading signatures from' in str(exc) + assert "Error while reading signatures from" in str(exc) def test_multi_index_load_from_pathlist_3_zipfile(runtmp): # can we load zipfiles in a pathlist? yes please. c = runtmp - zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(zipfile, file=fp) mi = MultiIndex.load_from_pathlist(file_list) assert len(mi) == 8 + ## ## test a slightly outre version of JaccardSearch - this is a test of the ## JaccardSearch 'collect' protocol, in particular... ## + class JaccardSearchBestOnly_ButIgnore(JaccardSearch): "A class that ignores certain results, but still does all the pruning." + def __init__(self, ignore_list): super().__init__(SearchType.JACCARD, threshold=0.1) self.ignore_list = ignore_list # a collect function that _ignores_ things in the ignore_list def collect(self, score, match): - print('in collect; current threshold:', self.threshold) + print("in collect; current threshold:", self.threshold) for q in self.ignore_list: - print('ZZZ', match, match.similarity(q)) + print("ZZZ", match, match.similarity(q)) if match.similarity(q) == 1.0: - print('yes, found.') + print("yes, found.") return False # update threshold if not perfect match, which could help prune. @@ -1445,9 +1474,9 @@ def collect(self, score, match): def test_linear_index_gather_ignore(): # do we properly ignore exact matches in 'search' for LinearIndex? - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1460,7 +1489,7 @@ def test_linear_index_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(lidx.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: @@ -1478,9 +1507,9 @@ def test_lca_index_gather_ignore(): # do we properly ignore exact matches in gather on an LCA DB? 
from sourmash.lca import LCA_Database - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1496,7 +1525,7 @@ def test_lca_index_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(db.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: @@ -1512,9 +1541,9 @@ def is_found(ss, xx): def test_sbt_index_gather_ignore(): # do we properly ignore exact matches in gather on an SBT? - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1529,15 +1558,15 @@ def test_sbt_index_gather_ignore(): db.insert(ss63) # ...now search with something that should ignore sig47, the exact match. - print(f'\n** trying to ignore {ss47}') + print(f"\n** trying to ignore {ss47}") search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(db.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: - print('is found?', ss, ss.similarity(q)) + print("is found?", ss, ss.similarity(q)) if ss.similarity(q) == 1.0: return True return False @@ -1552,39 +1581,41 @@ def test_counter_gather_test_consume(): # (see test_index_protocol.py for generic CounterGather tests.) query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = CounterGather(query_ss) - counter.add(match_ss_1, location='loc a') - counter.add(match_ss_2, location='loc b') - counter.add(match_ss_3, location='loc c') + counter.add(match_ss_1, location="loc a") + counter.add(match_ss_2, location="loc b") + counter.add(match_ss_3, location="loc c") ### ok, dig into actual counts... 
import pprint + pprint.pprint(counter.counter) pprint.pprint(list(counter.signatures())) pprint.pprint(counter.locations) assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('26d4943627b33c446f37be1f5baf8d46', 10), - ('f51cedec90ea666e0ebc11aa274eca61', 8), - ('f331f8279113d77e42ab8efca8f9cc17', 4)] + assert list(counter.counter.most_common()) == [ + ("26d4943627b33c446f37be1f5baf8d46", 10), + ("f51cedec90ea666e0ebc11aa274eca61", 8), + ("f331f8279113d77e42ab8efca8f9cc17", 4), + ] ## round 1 @@ -1595,12 +1626,13 @@ def test_counter_gather_test_consume(): assert cur_query == query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('f51cedec90ea666e0ebc11aa274eca61', 5), - ('f331f8279113d77e42ab8efca8f9cc17', 4)] + assert list(counter.counter.most_common()) == [ + ("f51cedec90ea666e0ebc11aa274eca61", 5), + ("f331f8279113d77e42ab8efca8f9cc17", 4), + ] ### round 2 @@ -1611,12 +1643,13 @@ def test_counter_gather_test_consume(): assert cur_query != query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('f331f8279113d77e42ab8efca8f9cc17', 2)] + assert list(counter.counter.most_common()) == [ + ("f331f8279113d77e42ab8efca8f9cc17", 2) + ] ## round 3 @@ -1627,8 +1660,8 @@ def test_counter_gather_test_consume(): assert cur_query != query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) assert list(counter.counter.most_common()) == [] @@ -1639,8 +1672,8 @@ def test_counter_gather_test_consume(): assert not results counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] assert list(counter.counter.most_common()) == [] @@ -1649,28 +1682,28 @@ def test_counter_gather_identical_md5sum(): # check what happens with identical matches w/different names query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = 
SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # same as match_mh_1 match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(0, 10)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") # identical md5sum assert match_ss_1.md5sum() == match_ss_2.md5sum() # load up the counter counter = CounterGather(query_ss) - counter.add(match_ss_1, location='loc a') - counter.add(match_ss_2, location='loc b') + counter.add(match_ss_1, location="loc a") + counter.add(match_ss_2, location="loc b") assert len(counter.siglist) == 1 stored_match = list(counter.siglist.values()).pop() - assert stored_match.name == 'match2' + assert stored_match.name == "match2" # CTB note: this behavior may be changed freely, as the protocol # tests simply specify that _one_ of the identical matches is # returned. See test_counter_gather_multiple_identical_matches. @@ -1678,9 +1711,9 @@ def test_counter_gather_identical_md5sum(): def test_lazy_index_1(): # test some basic features of LazyLinearIndex - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -1735,14 +1768,14 @@ def minhash(self): lazy = LazyLinearIndex(lidx) lazy2 = lazy.select(ksize=31) with pytest.raises(ValueError) as e: - lazy3 = lazy2.select(ksize=21) + lazy2.select(ksize=21) assert str(e.value) == "cannot select on two different values for ksize" def test_lazy_index_4_bool(): # test some basic features of LazyLinearIndex - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) # test bool false/true @@ -1757,24 +1790,26 @@ def test_lazy_index_4_bool(): def test_lazy_index_wraps_multi_index_location(): # check that 'location' works fine when MultiIndex is wrapped by # LazyLinearIndex. 
- sigdir = utils.get_test_data('prot/protein/') - sigzip = utils.get_test_data('prot/protein.zip') - siglca = utils.get_test_data('prot/protein.lca.json.gz') - sigsbt = utils.get_test_data('prot/protein.sbt.zip') + sigdir = utils.get_test_data("prot/protein/") + sigzip = utils.get_test_data("prot/protein.zip") + siglca = utils.get_test_data("prot/protein.lca.json.gz") + sigsbt = utils.get_test_data("prot/protein.sbt.zip") db_paths = (sigdir, sigzip, siglca, sigsbt) - dbs = [ sourmash.load_file_as_index(db_path) for db_path in db_paths ] + dbs = [sourmash.load_file_as_index(db_path) for db_path in db_paths] mi = MultiIndex.load(dbs, db_paths, None) lazy = LazyLinearIndex(mi) - mi2 = mi.select(moltype='protein') - lazy2 = lazy.select(moltype='protein') + mi2 = mi.select(moltype="protein") + lazy2 = lazy.select(moltype="protein") - for (ss_tup, ss_lazy_tup) in zip(mi2.signatures_with_location(), - lazy2.signatures_with_location()): + for ss_tup, ss_lazy_tup in zip( + mi2.signatures_with_location(), lazy2.signatures_with_location() + ): assert ss_tup == ss_lazy_tup + def test_revindex_index_search(): # confirm that RevIndex works sig2 = utils.get_test_data("2.fa.sig") @@ -1848,9 +1883,9 @@ def test_revindex_gather(): def test_revindex_gather_ignore(): # check that RevIndex gather ignores things properly. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1863,7 +1898,7 @@ def test_revindex_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(lidx.find(search_fn, ss47)) - results = [ ss.signature for ss in results ] + results = [ss.signature for ss in results] def is_found(ss, xx): for q in xx: @@ -1881,8 +1916,8 @@ def test_standalone_manifest_signatures(runtmp): # build a StandaloneManifestIndex and test 'signatures' method. ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -1895,7 +1930,7 @@ def test_standalone_manifest_signatures(runtmp): ## got a manifest! 
ok, now test out StandaloneManifestIndex mm = StandaloneManifestIndex(mi.manifest, None) - siglist = [ ss for ss in mm.signatures() ] + siglist = [ss for ss in mm.signatures()] assert len(siglist) == 2 assert ss47 in siglist assert ss63 in siglist @@ -1905,11 +1940,11 @@ def test_standalone_manifest_signatures_prefix(runtmp): # try out 'prefix' for StandaloneManifestIndex ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ss47 = sourmash.load_one_signature(sig47) - ss63 = sourmash.load_one_signature(sig63) + sourmash.load_one_signature(sig47) + sourmash.load_one_signature(sig63) lidx1 = LinearIndex.load(sig47) lidx2 = LinearIndex.load(sig63) @@ -1917,11 +1952,10 @@ def test_standalone_manifest_signatures_prefix(runtmp): # ok, now remove the abspath prefix from iloc for row in mi.manifest.rows: - row['internal_location'] = os.path.basename(row['internal_location']) + row["internal_location"] = os.path.basename(row["internal_location"]) ## this should succeed! - mm = StandaloneManifestIndex(mi.manifest, None, - prefix=utils.get_test_data('')) + mm = StandaloneManifestIndex(mi.manifest, None, prefix=utils.get_test_data("")) assert len(list(mm.signatures())) == 2 @@ -1930,25 +1964,24 @@ def test_standalone_manifest_signatures_prefix_fail(runtmp): # give StandaloneManifest the wrong prefix ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ss47 = sourmash.load_one_signature(sig47) - ss63 = sourmash.load_one_signature(sig63) + sourmash.load_one_signature(sig47) + sourmash.load_one_signature(sig63) lidx1 = LinearIndex.load(sig47) lidx2 = LinearIndex.load(sig63) - print('XXX', lidx1.location) + print("XXX", lidx1.location) mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "") # remove prefix from manifest for row in mi.manifest.rows: - row['internal_location'] = os.path.basename(row['internal_location']) + row["internal_location"] = os.path.basename(row["internal_location"]) ## got a manifest! 
ok, now test out StandaloneManifestIndex - mm = StandaloneManifestIndex(mi.manifest, None, - prefix=runtmp.output('foo')) + mm = StandaloneManifestIndex(mi.manifest, None, prefix=runtmp.output("foo")) # should fail with pytest.raises(ValueError) as exc: @@ -1960,37 +1993,37 @@ def test_standalone_manifest_signatures_prefix_fail(runtmp): def test_standalone_manifest_load_from_dir(runtmp): # test loading a mf with relative directory paths from test-data - mf = utils.get_test_data('scaled/mf.csv') + mf = utils.get_test_data("scaled/mf.csv") idx = sourmash.load_file_as_index(mf) siglist = list(idx.signatures()) assert len(siglist) == 15 - assert idx # should be 'True' + assert idx # should be 'True' assert len(idx) == 15 with pytest.raises(NotImplementedError): idx.insert() with pytest.raises(NotImplementedError): - idx.save('foo') + idx.save("foo") assert idx.location == mf def test_standalone_manifest_lazy_load(runtmp): # check that it's actually doing lazy loading - orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") # build an external manifest shutil.copyfile(orig_sig47, sig47) # this is an abspath to sig47 - runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", sig47, "-o", "mf.csv") # should work to get signatures: - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + idx = StandaloneManifestIndex.load(runtmp.output("mf.csv")) siglist = list(idx.signatures()) assert len(siglist) == 1 @@ -2013,18 +2046,19 @@ def test_standalone_manifest_lazy_load(runtmp): def test_standalone_manifest_lazy_load_2_prefix(runtmp): # check that it's actually doing lazy loading; supply explicit prefix - orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") # build an external manifest # note, here use a relative path to 47.fa.sig; the manifest will contain # just '47.fa.sig' as the location shutil.copyfile(orig_sig47, sig47) - runtmp.sourmash('sig', 'manifest', '47.fa.sig', '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", "47.fa.sig", "-o", "mf.csv") # should work to get signatures: - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'), - prefix=runtmp.output('')) + idx = StandaloneManifestIndex.load( + runtmp.output("mf.csv"), prefix=runtmp.output("") + ) siglist = list(idx.signatures()) assert len(siglist) == 1 @@ -2047,68 +2081,68 @@ def test_standalone_manifest_lazy_load_2_prefix(runtmp): def test_standalone_manifest_search(runtmp): # test a straight up 'search' - query_sig = utils.get_test_data('scaled/genome-s12.fa.gz.sig') - mf = utils.get_test_data('scaled/mf.csv') + query_sig = utils.get_test_data("scaled/genome-s12.fa.gz.sig") + mf = utils.get_test_data("scaled/mf.csv") - runtmp.sourmash('search', query_sig, mf) + runtmp.sourmash("search", query_sig, mf) out = runtmp.last_result.out print(out) - assert '100.0% d84ef28f' in out + assert "100.0% d84ef28f" in out def test_standalone_manifest_prefetch_lazy(runtmp): # check that prefetch is actually doing lazy loading on manifest index. 
- orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') - orig_sig2 = utils.get_test_data('2.fa.sig') - sig2 = runtmp.output('2.fa.sig') - orig_sig63 = utils.get_test_data('63.fa.sig') - sig63 = runtmp.output('63.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") + orig_sig2 = utils.get_test_data("2.fa.sig") + sig2 = runtmp.output("2.fa.sig") + orig_sig63 = utils.get_test_data("63.fa.sig") + sig63 = runtmp.output("63.fa.sig") shutil.copyfile(orig_sig47, sig47) - runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf1.csv') + runtmp.sourmash("sig", "manifest", sig47, "-o", "mf1.csv") shutil.copyfile(orig_sig2, sig2) - runtmp.sourmash('sig', 'manifest', sig2, '-o', 'mf2.csv') + runtmp.sourmash("sig", "manifest", sig2, "-o", "mf2.csv") shutil.copyfile(orig_sig63, sig63) - runtmp.sourmash('sig', 'manifest', sig63, '-o', 'mf3.csv') + runtmp.sourmash("sig", "manifest", sig63, "-o", "mf3.csv") # combine the manifests, manually for now... - mf1 = CollectionManifest.load_from_filename(runtmp.output('mf1.csv')) + mf1 = CollectionManifest.load_from_filename(runtmp.output("mf1.csv")) assert len(mf1) == 1 - mf2 = CollectionManifest.load_from_filename(runtmp.output('mf2.csv')) + mf2 = CollectionManifest.load_from_filename(runtmp.output("mf2.csv")) assert len(mf2) == 3 - mf3 = CollectionManifest.load_from_filename(runtmp.output('mf3.csv')) + mf3 = CollectionManifest.load_from_filename(runtmp.output("mf3.csv")) assert len(mf3) == 1 mf = mf1 + mf2 + mf3 assert len(mf) == 5 - mf.write_to_filename(runtmp.output('mf.csv')) + mf.write_to_filename(runtmp.output("mf.csv")) # ok! now, remove the last signature, 'sig63'. os.unlink(sig63) # ...but loading the manifest should still work. - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + idx = StandaloneManifestIndex.load(runtmp.output("mf.csv")) # double check - third load will fail. this relies on load order :shrug:. sig_iter = iter(idx.signatures()) ss = next(sig_iter) print(ss) - assert '47.fa' in ss.filename + assert "47.fa" in ss.filename for i in range(3): ss = next(sig_iter) print(i, ss) - assert '2.fa' in ss.filename + assert "2.fa" in ss.filename with pytest.raises(ValueError) as exc: ss = next(sig_iter) - assert 'Error while reading signatures from' in str(exc) - assert '63.fa.sig' in str(exc) + assert "Error while reading signatures from" in str(exc) + assert "63.fa.sig" in str(exc) # ok! now test prefetch... should get one match legit, to 47, # and then no matches to 2, and then error. 
@@ -2125,5 +2159,5 @@ def test_standalone_manifest_prefetch_lazy(runtmp): with pytest.raises(ValueError) as exc: sr = next(g) - assert 'Error while reading signatures from' in str(exc) - assert '63.fa.sig' in str(exc) + assert "Error while reading signatures from" in str(exc) + assert "63.fa.sig" in str(exc) diff --git a/tests/test_index_protocol.py b/tests/test_index_protocol.py index 4a6672408e..b843e9883d 100644 --- a/tests/test_index_protocol.py +++ b/tests/test_index_protocol.py @@ -8,27 +8,30 @@ import sourmash from sourmash import SourmashSignature -from sourmash.index import (LinearIndex, ZipFileLinearIndex, - LazyLinearIndex, MultiIndex, - StandaloneManifestIndex, - IndexSearchResult) +from sourmash.index import ( + LinearIndex, + ZipFileLinearIndex, + LazyLinearIndex, + MultiIndex, + StandaloneManifestIndex, + IndexSearchResult, +) from sourmash.index import CounterGather from sourmash.index.sqlite_index import SqliteIndex from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory from sourmash.manifest import CollectionManifest, BaseCollectionManifest from sourmash.lca.lca_db import LCA_Database, load_single_database -from sourmash.minhash import (flatten_and_intersect_scaled, - flatten_and_downsample_scaled) +from sourmash.minhash import flatten_and_intersect_scaled, flatten_and_downsample_scaled import sourmash_tst_utils as utils def _load_three_sigs(): # utility function - load & return these three sigs. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -55,7 +58,7 @@ def build_lazy_linear_index(runtmp): def build_sbt_index(runtmp): ss2, ss47, ss63 = _load_three_sigs() - + factory = GraphFactory(5, 100, 3) root = SBT(factory, d=2) @@ -68,7 +71,7 @@ def build_sbt_index(runtmp): def build_sbt_index_save_load(runtmp): root = build_sbt_index(runtmp) - out = runtmp.output('xyz.sbt.zip') + out = runtmp.output("xyz.sbt.zip") root.save(out) return sourmash.load_file_as_index(out) @@ -77,7 +80,7 @@ def build_sbt_index_save_load(runtmp): def build_zipfile_index(runtmp): from sourmash.save_load import SaveSignatures_ZipFile - location = runtmp.output('index.zip') + location = runtmp.output("index.zip") with SaveSignatures_ZipFile(location) as save_sigs: for ss in _load_three_sigs(): save_sigs.add(ss) @@ -95,9 +98,9 @@ def build_multi_index(runtmp): def build_standalone_manifest_index(runtmp): - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -106,10 +109,10 @@ def build_standalone_manifest_index(runtmp): siglist = [(ss2, sig2), (ss47, sig47), (ss63, sig63)] rows = [] - rows.extend((CollectionManifest.make_manifest_row(ss, loc) for ss, loc in siglist )) + rows.extend((CollectionManifest.make_manifest_row(ss, loc) for ss, loc in siglist)) mf = CollectionManifest(rows) mf_filename = runtmp.output("mf.csv") - + mf.write_to_filename(mf_filename) idx = StandaloneManifestIndex.load(mf_filename) @@ -118,7 +121,7 @@ def build_standalone_manifest_index(runtmp): def 
build_lca_index(runtmp): siglist = _load_three_sigs() - db = LCA_Database(31, 1000, 'DNA') + db = LCA_Database(31, 1000, "DNA") for ss in siglist: db.insert(ss) @@ -127,14 +130,14 @@ def build_lca_index(runtmp): def build_lca_index_save_load(runtmp): db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') + outfile = runtmp.output("db.lca.json") db.save(outfile) return sourmash.load_file_as_index(outfile) def build_sqlite_index(runtmp): - filename = runtmp.output('idx.sqldb') + filename = runtmp.output("idx.sqldb") db = SqliteIndex.create(filename) siglist = _load_three_sigs() @@ -157,8 +160,8 @@ def build_revindex(runtmp): def build_lca_index_save_load_sql(runtmp): db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') - db.save(outfile, format='sql') + outfile = runtmp.output("db.lca.json") + db.save(outfile, format="sql") x = load_single_database(outfile) db_load = x[0] @@ -171,19 +174,22 @@ def build_lca_index_save_load_sql(runtmp): # building functions. # -@pytest.fixture(params=[build_linear_index, - build_lazy_linear_index, - build_sbt_index, - build_zipfile_index, - build_multi_index, - build_standalone_manifest_index, - build_lca_index, - build_sbt_index_save_load, - build_lca_index_save_load, - build_sqlite_index, - build_lca_index_save_load_sql, -# build_revindex, - ] + +@pytest.fixture( + params=[ + build_linear_index, + build_lazy_linear_index, + build_sbt_index, + build_zipfile_index, + build_multi_index, + build_standalone_manifest_index, + build_lca_index, + build_sbt_index_save_load, + build_lca_index_save_load, + build_sqlite_index, + build_lca_index_save_load_sql, + # build_revindex, + ] ) def index_obj(request, runtmp): build_fn = request.param @@ -271,7 +277,7 @@ def test_index_signatures(index_obj): assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss in siglist )) + md5s = set(ss.md5sum() for ss in siglist) assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s assert ss63.md5sum() in md5s @@ -285,7 +291,7 @@ def test_index_signatures_with_location(index_obj): assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss, loc in siglist )) + md5s = set((ss.md5sum() for ss, loc in siglist)) assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s assert ss63.md5sum() in md5s @@ -315,15 +321,22 @@ def test_index_manifest(index_obj): def test_index_select_basic(index_obj): # select does the basic thing ok - idx = index_obj.select(ksize=31, moltype='DNA', abund=False, - containment=True, scaled=1000, num=0, picklist=None) + idx = index_obj.select( + ksize=31, + moltype="DNA", + abund=False, + containment=True, + scaled=1000, + num=0, + picklist=None, + ) assert len(idx) == 3 siglist = list(idx.signatures()) assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss in siglist )) + md5s = set(ss.md5sum() for ss in siglist) ss2, ss47, ss63 = _load_three_sigs() assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s @@ -477,6 +490,7 @@ class CounterGather_LinearIndex: Provides an (inefficient) CounterGather-style class, for protocol testing purposes. """ + def __init__(self, orig_query): "Constructor - take a SourmashSignature that is the original query." orig_query_mh = orig_query.minhash @@ -564,6 +578,7 @@ class CounterGather_LCA: based on LCA_Database. This is currently just for protocol and API testing purposes. 
""" + def __init__(self, query): from sourmash.lca.lca_db import LCA_Database @@ -572,8 +587,7 @@ def __init__(self, query): raise ValueError("must use scaled MinHash") self.orig_query_mh = query_mh - lca_db = LCA_Database(query_mh.ksize, query_mh.scaled, - query_mh.moltype) + lca_db = LCA_Database(query_mh.ksize, query_mh.scaled, query_mh.moltype) self.db = lca_db self.siglist = {} self.locations = {} @@ -598,8 +612,7 @@ def add(self, ss, *, location=None, require_overlap=True): def signatures(self): "Yield all signatures." - for ss in self.siglist.values(): - yield ss + yield from self.siglist.values() def downsample(self, scaled): "Track highest scaled across all possible matches." @@ -635,8 +648,7 @@ def peek(self, query_mh, *, threshold_bp=0): cont = result.score match = result.signature - intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, - query_mh) + intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, query_mh) md5 = result.signature.md5sum() location = self.locations[md5] @@ -648,10 +660,12 @@ def consume(self, intersect_mh): self.query_started = 1 -@pytest.fixture(params=[CounterGather, - CounterGather_LinearIndex, - CounterGather_LCA, - ] +@pytest.fixture( + params=[ + CounterGather, + CounterGather_LinearIndex, + CounterGather_LCA, + ] ) def counter_gather_constructor(request): build_fn = request.param @@ -664,19 +678,19 @@ def test_counter_get_signatures(counter_gather_constructor): # test .signatures() method query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(10, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(15, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") counter = counter_gather_constructor(query_ss) counter.add(match_ss_1) @@ -720,19 +734,19 @@ def test_counter_gather_1(counter_gather_constructor): # generated via CounterGather query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(10, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(15, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -742,9 +756,11 @@ def test_counter_gather_1(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert 
len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -762,19 +778,19 @@ def test_counter_gather_1_b(counter_gather_constructor): # larger. query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -784,9 +800,11 @@ def test_counter_gather_1_b(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -806,19 +824,19 @@ def test_counter_gather_1_c_with_threshold(counter_gather_constructor): query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -826,11 +844,9 @@ def test_counter_gather_1_c_with_threshold(counter_gather_constructor): counter.add(match_ss_2) counter.add(match_ss_3) - results = _consume_all(query_ss.minhash, counter, - threshold_bp=3) + results = _consume_all(query_ss.minhash, counter, threshold_bp=3) - expected = (['match1', 10], - ['match2', 5]) + expected = (["match1", 10], ["match2", 5]) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -844,19 +860,19 @@ def test_counter_gather_1_d_diff_scaled(counter_gather_constructor): # test as above, but with different scaled. 
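    # As in the tests above, the match ranges overlap: match1 covers
    # hashes 0-9, match2 covers 7-14, and match3 covers 13-16. Gather
    # greedily takes the match with the largest remaining overlap, so the
    # expected unique contributions work out to 10 (match1), then
    # 5 (match2: hashes 10-14), then 2 (match3: hashes 15-16); the
    # per-match downsampling below should not change that.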
query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -866,9 +882,11 @@ def test_counter_gather_1_d_diff_scaled(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -885,18 +903,18 @@ def test_counter_gather_1_d_diff_scaled_query(counter_gather_constructor): match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # downsample query now - - query_ss = SourmashSignature(query_mh.downsample(scaled=100), name='query') + query_ss = SourmashSignature(query_mh.downsample(scaled=100), name="query") # load up the counter counter = counter_gather_constructor(query_ss) @@ -906,9 +924,11 @@ def test_counter_gather_1_d_diff_scaled_query(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -922,19 +942,19 @@ def test_counter_gather_1_e_abund_query(counter_gather_constructor): # test as above, but abund query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear().flatten() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().flatten() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().flatten() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, 
name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -945,9 +965,11 @@ def test_counter_gather_1_e_abund_query(counter_gather_constructor): # must flatten before peek! results = _consume_all(query_ss.minhash.flatten(), counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -961,19 +983,19 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): # test as above, but abund query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh.flatten(), name='query') + query_ss = SourmashSignature(query_mh.flatten(), name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -984,9 +1006,11 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): # must flatten before peek! results = _consume_all(query_ss.minhash.flatten(), counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -999,13 +1023,14 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): def test_counter_gather_2(counter_gather_constructor): # check basic set of gather results on semi-real data, # generated via CounterGather - testdata_combined = utils.get_test_data('gather/combined.sig') - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_combined = utils.get_test_data("gather/combined.sig") + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) query_ss = sourmash.load_one_signature(testdata_combined, ksize=21) - subject_sigs = [ (sourmash.load_one_signature(t, ksize=21), t) - for t in testdata_sigs ] + subject_sigs = [ + (sourmash.load_one_signature(t, ksize=21), t) for t in testdata_sigs + ] # load up the counter counter = counter_gather_constructor(query_ss) @@ -1014,18 +1039,20 @@ def test_counter_gather_2(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['NC_003198.1', 487], - ['NC_000853.1', 192], - ['NC_011978.1', 169], - ['NC_002163.1', 157], - ['NC_003197.2', 152], - ['NC_009486.1', 92], - ['NC_006905.1', 76], - ['NC_011080.1', 59], - ['NC_011274.1', 42], - ['NC_006511.1', 31], - ['NC_011294.1', 7], - ['NC_004631.1', 2]) + expected = ( + ["NC_003198.1", 487], + ["NC_000853.1", 192], + ["NC_011978.1", 169], + ["NC_002163.1", 157], + ["NC_003197.2", 152], + ["NC_009486.1", 92], + ["NC_006905.1", 76], + ["NC_011080.1", 59], + ["NC_011274.1", 42], + ["NC_006511.1", 31], + ["NC_011294.1", 7], + ["NC_004631.1", 2], 
+ ) assert len(results) == len(expected) for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -1040,11 +1067,11 @@ def test_counter_gather_exact_match(counter_gather_constructor): # query == match query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter; provide a location override, too. counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") results = _consume_all(query_ss.minhash, counter) assert len(results) == 1 @@ -1052,14 +1079,14 @@ def test_counter_gather_exact_match(counter_gather_constructor): assert sr.score == 1.0 assert sr.signature == query_ss - assert sr.location == 'somewhere over the rainbow' + assert sr.location == "somewhere over the rainbow" def test_counter_gather_multiple_identical_matches(counter_gather_constructor): # test multiple identical matches being inserted, with only one return query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # create counter... counter = counter_gather_constructor(query_ss) @@ -1068,7 +1095,7 @@ def test_counter_gather_multiple_identical_matches(counter_gather_constructor): match_mh = query_mh.copy_and_clear() match_mh.add_many(range(5, 15)) - for name in 'match1', 'match2', 'match3': + for name in "match1", "match2", "match3": match_ss = SourmashSignature(match_mh, name=name) counter.add(match_ss, location=name) @@ -1080,18 +1107,18 @@ def test_counter_gather_multiple_identical_matches(counter_gather_constructor): assert overlap_count == 10 # any one of the three is valid - assert sr.location in ('match1', 'match2', 'match3') + assert sr.location in ("match1", "match2", "match3") def test_counter_gather_add_after_peek(counter_gather_constructor): # cannot add after peek or consume query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") counter.peek(query_ss.minhash) @@ -1103,11 +1130,11 @@ def test_counter_gather_add_after_consume(counter_gather_constructor): # cannot add after peek or consume query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") counter.consume(query_ss.minhash) @@ -1119,11 +1146,11 @@ def test_counter_gather_consume_empty_intersect(counter_gather_constructor): # check that consume works fine when there is an empty signature. 
query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") # nothing really happens here :laugh:, just making sure there's no error counter.consume(query_ss.minhash.copy_and_clear()) @@ -1132,11 +1159,11 @@ def test_counter_gather_consume_empty_intersect(counter_gather_constructor): def test_counter_gather_empty_initial_query(counter_gather_constructor): # check empty initial query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) @@ -1149,7 +1176,7 @@ def test_counter_gather_num_query(counter_gather_constructor): # check num query query_mh = sourmash.MinHash(n=500, ksize=31) query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") with pytest.raises(ValueError): counter_gather_constructor(query_ss) @@ -1159,11 +1186,11 @@ def test_counter_gather_empty_cur_query(counter_gather_constructor): # test empty cur query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") cur_query_mh = query_ss.minhash.copy_and_clear() results = _consume_all(cur_query_mh, counter) @@ -1174,27 +1201,27 @@ def test_counter_gather_add_num_matchy(counter_gather_constructor): # test add num query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh = sourmash.MinHash(n=500, ksize=31) match_mh.add_many(range(0, 20)) - match_ss = SourmashSignature(match_mh, name='query') + match_ss = SourmashSignature(match_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) with pytest.raises(ValueError): - counter.add(match_ss, location='somewhere over the rainbow') + counter.add(match_ss, location="somewhere over the rainbow") def test_counter_gather_bad_cur_query(counter_gather_constructor): # test cur query that is not subset of original query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") cur_query_mh = query_ss.minhash.copy_and_clear() cur_query_mh.add_many(range(20, 30)) @@ -1206,11 +1233,11 @@ def test_counter_gather_add_no_overlap(counter_gather_constructor): # check adding match with no overlap 
w/query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(10, 20)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) @@ -1224,18 +1251,18 @@ def test_counter_gather_big_threshold(counter_gather_constructor): # check 'peek' with a huge threshold query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) counter.add(match_ss_1) # impossible threshold: - threshold_bp=30*query_ss.minhash.scaled + threshold_bp = 30 * query_ss.minhash.scaled results = counter.peek(query_ss.minhash, threshold_bp=threshold_bp) assert results == [] @@ -1243,7 +1270,7 @@ def test_counter_gather_big_threshold(counter_gather_constructor): def test_counter_gather_empty_counter(counter_gather_constructor): # check empty counter query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # empty counter! counter = counter_gather_constructor(query_ss) diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index ce0846a3ae..87093ee194 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -72,10 +72,10 @@ def test_dna_mh(track_abundance): e1 = MinHash(n=5, ksize=4, track_abundance=track_abundance) e2 = MinHash(n=5, ksize=4, track_abundance=track_abundance) - seq = 'ATGGCAGTGACGATGCCAG' + seq = "ATGGCAGTGACGATGCCAG" e1.add_sequence(seq) for i in range(len(seq) - 3): - e2.add_kmer(seq[i:i + 4]) + e2.add_kmer(seq[i : i + 4]) assert e1.hashes.keys() == e2.hashes.keys() print(e1.hashes.keys()) @@ -84,19 +84,17 @@ def test_dna_mh(track_abundance): def test_protein_mh(track_abundance): - e1 = MinHash(n=5, ksize=2, is_protein=True, - track_abundance=track_abundance) - e2 = MinHash(n=5, ksize=2, is_protein=True, - track_abundance=track_abundance) + e1 = MinHash(n=5, ksize=2, is_protein=True, track_abundance=track_abundance) + e2 = MinHash(n=5, ksize=2, is_protein=True, track_abundance=track_abundance) # ok, so this is confusing, but: we are adding _DNA_ kmers here, # and translating. so, add_sequence and add_kmer actually both add # 6-mers. 
- seq = 'ATGGCAGTGACGATGCCG' + seq = "ATGGCAGTGACGATGCCG" e1.add_sequence(seq) for i in range(len(seq) - 5): - kmer = seq[i:i + 6] + kmer = seq[i : i + 6] e2.add_kmer(kmer) assert e1.hashes.keys() == e2.hashes.keys() @@ -107,10 +105,9 @@ def test_pickle(track_abundance): import pickle from io import BytesIO - e1 = MinHash(n=5, ksize=6, is_protein=False, - track_abundance=track_abundance) + e1 = MinHash(n=5, ksize=6, is_protein=False, track_abundance=track_abundance) - seq = 'ATGGCAGTGACGATGCCG' + seq = "ATGGCAGTGACGATGCCG" e1.add_sequence(seq) e1.add_sequence(seq) @@ -131,8 +128,7 @@ def test_pickle(track_abundance): def test_bad_construct_1(track_abundance): try: - e1 = MinHash(ksize=6, is_protein=False, - track_abundance=track_abundance) + MinHash(ksize=6, is_protein=False, track_abundance=track_abundance) assert 0, "require n in constructor" except TypeError: pass @@ -140,8 +136,7 @@ def test_bad_construct_1(track_abundance): def test_bad_construct_2(track_abundance): try: - e1 = MinHash(n=100, is_protein=False, - track_abundance=track_abundance) + MinHash(n=100, is_protein=False, track_abundance=track_abundance) assert 0, "require ksize in constructor" except TypeError: pass @@ -175,15 +170,16 @@ def test_abund_similarity_zero(): #### + def test_jaccard_on_real_data(): from sourmash.signature import load_signatures - afile = 'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -210,12 +206,12 @@ def test_jaccard_on_real_data(): def test_scaled_on_real_data(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -243,12 +239,12 @@ def test_scaled_on_real_data(): def test_scaled_on_real_data_2(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -276,12 +272,12 @@ def test_scaled_on_real_data_2(): def test_downsample_scaled_with_num(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash with pytest.raises(ValueError) as exc: - mh = mh1.downsample(num=500) + mh1.downsample(num=500) - assert 'cannot downsample a scaled MinHash using num' in str(exc.value) + assert "cannot downsample a scaled MinHash using num" in str(exc.value) diff 
--git a/tests/test_lca.py b/tests/test_lca.py index 46b1d9716d..7db105628e 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -21,8 +21,7 @@ def test_api_create_search(): # create a database and then search for result. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) assert len(lca_db) == 0 @@ -44,18 +43,16 @@ def test_api_create_search(): def test_api_find_picklist_select(): # does 'find' respect picklists? - sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + sig47 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(sig47) lca_db.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -72,24 +69,22 @@ def test_api_find_picklist_select(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('09a08691c') + assert ss.md5sum().startswith("09a08691c") def test_api_find_picklist_select_exclude(): # does 'find' respect picklists? - sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + sig47 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(sig47) lca_db.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle= PickStyle.EXCLUDE) - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -106,13 +101,12 @@ def test_api_find_picklist_select_exclude(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('38729c637') + assert ss.md5sum().startswith("38729c637") def test_api_create_insert(): # test some internal implementation stuff: create & then insert a sig. 
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -131,16 +125,15 @@ def test_api_create_insert(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added def test_api_create_insert_bad_ksize(): # can we insert a ksize=21 signature into a ksize=31 DB? hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=21, scaled=1000) with pytest.raises(ValueError): @@ -149,17 +142,15 @@ def test_api_create_insert_bad_ksize(): def test_api_create_insert_bad_ident(): # can we insert a signature with no/empty ident? - ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss1 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) ss1 = ss1.to_mutable() ss2 = ss2.to_mutable() - ss1.name = '' - ss1.filename = '' - ss2.name = '' - ss2.filename = '' + ss1.name = "" + ss1.filename = "" + ss2.name = "" + ss2.filename = "" lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss1) @@ -171,8 +162,7 @@ def test_api_create_insert_bad_ident(): def test_api_create_insert_bad_scaled(): # can we insert a scaled=1000 signature into a scaled=500 DB? # hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) assert ss.minhash.scaled == 1000 lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500) @@ -183,11 +173,10 @@ def test_api_create_insert_bad_scaled(): def test_api_create_insert_bad_moltype(): # can we insert a DNAsignature into a protein DB? # hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - assert ss.minhash.moltype == 'DNA' + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + assert ss.minhash.moltype == "DNA" - lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500, moltype='protein') + lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500, moltype="protein") with pytest.raises(ValueError): lca_db.insert(ss) @@ -195,13 +184,12 @@ def test_api_create_insert_bad_moltype(): def test_api_create_insert_ident(): # test some internal implementation stuff: signature inserted with # different ident than name. 
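    # For reference, the internal LCA_Database mappings checked in these
    # tests are: _ident_to_name (identifier -> signature name),
    # _idx_to_ident (integer index -> identifier), _hashval_to_idx
    # (hash value -> set of indices), _idx_to_lid (index -> lineage id),
    # and _lid_to_lineage / _lineage_to_lid / _lid_to_idx for the
    # lineage bookkeeping.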
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lca_db.insert(ss, ident='foo') + lca_db.insert(ss, ident="foo") - ident = 'foo' + ident = "foo" assert len(lca_db._ident_to_name) == 1 assert ident in lca_db._ident_to_name assert lca_db._ident_to_name[ident] == ss.name @@ -215,27 +203,25 @@ def test_api_create_insert_ident(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added assert not lca_db._lineage_to_lid assert not lca_db._lid_to_idx def test_api_create_insert_two(): # check internal details if multiple signatures are inserted. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lca_db.insert(ss, ident='foo') - lca_db.insert(ss2, ident='bar') + lca_db.insert(ss, ident="foo") + lca_db.insert(ss2, ident="bar") - ident = 'foo' - ident2 = 'bar' + ident = "foo" + ident2 = "bar" assert len(lca_db._ident_to_name) == 2 assert ident in lca_db._ident_to_name assert ident2 in lca_db._ident_to_name @@ -258,22 +244,20 @@ def test_api_create_insert_two(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 2 - assert set_of_values == { 0, 1 } + assert set_of_values == {0, 1} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added assert not lca_db._lineage_to_lid assert not lca_db._lid_to_idx def test_api_create_insert_w_lineage(): # test some internal implementation stuff - insert signature w/lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = ((LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2'))) + lineage = (LineagePair("rank1", "name1"), LineagePair("rank2", "name2")) lca_db.insert(ss, lineage=lineage) @@ -293,14 +277,14 @@ def test_api_create_insert_w_lineage(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} # check lineage stuff assert len(lca_db._idx_to_lid) == 1 assert lca_db._idx_to_lid[0] == 0 assert len(lca_db._lid_to_lineage) == 1 assert lca_db._lid_to_lineage[0] == lineage - assert lca_db._lid_to_idx[0] == { 0 } + assert lca_db._lid_to_idx[0] == {0} assert len(lca_db._lineage_to_lid) == 1 assert lca_db._lineage_to_lid[lineage] == 0 @@ -308,12 +292,10 @@ def test_api_create_insert_w_lineage(): def test_api_create_insert_w_bad_lineage(): # test some internal implementation stuff - insert signature w/bad lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), 
- ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = ([LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2')],) + lineage = ([LineagePair("rank1", "name1"), LineagePair("rank2", "name2")],) with pytest.raises(ValueError): lca_db.insert(ss, lineage=lineage) @@ -321,11 +303,10 @@ def test_api_create_insert_w_bad_lineage(): def test_api_create_insert_w_bad_lineage_2(): # test some internal implementation stuff - insert signature w/bad lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = 1 # something non-iterable... + lineage = 1 # something non-iterable... with pytest.raises(ValueError): lca_db.insert(ss, lineage=lineage) @@ -333,8 +314,7 @@ def test_api_create_insert_w_bad_lineage_2(): def test_api_create_gather(): # create a database, and then run gather on it. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -348,10 +328,8 @@ def test_api_create_gather(): def test_api_add_genome_lineage(): # LCA_Databases can store/retrieve arbitrary lineages/taxonomies. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - lineage = ((LineagePair('rank1', 'name1'), - (LineagePair('rank2', 'name2')))) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + lineage = (LineagePair("rank1", "name1"), (LineagePair("rank2", "name2"))) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss, lineage=lineage) @@ -366,26 +344,24 @@ def test_api_add_genome_lineage(): def test_api_insert_update(): # check that cached parts of LCA_Database are updated when a new # signature is inserted. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) - all_mh = [ x.minhash for x in lca_db.signatures() ] + all_mh = [x.minhash for x in lca_db.signatures()] assert ss.minhash in all_mh # see decorator @cached_property - assert hasattr(lca_db, '_cache') + assert hasattr(lca_db, "_cache") assert lca_db._cache # inserting a signature should delete the cache lca_db.insert(ss2) - assert not hasattr(lca_db, '_cache') + assert not hasattr(lca_db, "_cache") # check that it's rebuilt etc. etc. - all_mh = [ x.minhash for x in lca_db.signatures() ] + all_mh = [x.minhash for x in lca_db.signatures()] assert ss.minhash in all_mh assert ss2.minhash in all_mh @@ -393,8 +369,7 @@ def test_api_insert_update(): def test_api_insert_retrieve_check_name(): # check that signatures retrieved from LCA_Database objects have the # right name. 
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -408,10 +383,8 @@ def test_api_insert_retrieve_check_name(): def test_api_create_insert_two_then_scale(): # construct database, THEN downsample - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -431,10 +404,8 @@ def test_api_create_insert_two_then_scale(): def test_api_create_insert_two_then_scale_then_add(): # construct database, THEN downsample, then add another - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -460,10 +431,8 @@ def test_api_create_insert_two_then_scale_then_add(): def test_api_create_insert_scale_two(): # downsample while constructing database - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) # downsample to 5000 while inserting: lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=5000) @@ -483,7 +452,7 @@ def test_api_create_insert_scale_two(): def test_load_single_db(): - filename = utils.get_test_data('lca/delmont-1.lca.json') + filename = utils.get_test_data("lca/delmont-1.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) print(db) @@ -494,9 +463,9 @@ def test_load_single_db(): def test_load_single_db_empty(runtmp): # test load_single_database on an empty file; should raise ValueError - empty = runtmp.output('empty.lca.json') + empty = runtmp.output("empty.lca.json") - with open(empty, "wt") as fp: + with open(empty, "w"): pass with pytest.raises(ValueError) as exc: @@ -506,8 +475,8 @@ def test_load_single_db_empty(runtmp): def test_databases(): - filename1 = utils.get_test_data('lca/delmont-1.lca.json') - filename2 = utils.get_test_data('lca/delmont-2.lca.json') + filename1 = utils.get_test_data("lca/delmont-1.lca.json") + filename2 = utils.get_test_data("lca/delmont-2.lca.json") dblist, ksize, scaled = lca_utils.load_databases([filename1, filename2]) print(dblist) @@ -518,7 +487,7 @@ def test_databases(): def test_databases_load_fail_on_no_JSON(): - filename1 = utils.get_test_data('prot/protein.zip') + filename1 = utils.get_test_data("prot/protein.zip") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) @@ -528,36 +497,37 @@ def test_databases_load_fail_on_no_JSON(): def test_databases_load_fail_on_dir(): - filename1 = utils.get_test_data('lca') + filename1 = utils.get_test_data("lca") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) err = 
str(exc.value) print(err) assert f"'{filename1}' is not a file and cannot be loaded as an LCA database" in err - assert not 'found 0 matches total;' in err + assert "found 0 matches total;" not in err def test_databases_load_fail_on_not_exist(): - filename1 = utils.get_test_data('does-not-exist') + filename1 = utils.get_test_data("does-not-exist") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) err = str(exc.value) print(err) assert f"'{filename1}' is not a file and cannot be loaded as an LCA database" in err - assert not 'found 0 matches total;' in err + assert "found 0 matches total;" not in err + def test_db_repr(): - filename = utils.get_test_data('lca/delmont-1.lca.json') + filename = utils.get_test_data("lca/delmont-1.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) - assert repr(db) == "LCA_Database('{}')".format(filename) + assert repr(db) == f"LCA_Database('{filename}')" def test_lca_index_signatures_method(): # test 'signatures' method from base class Index - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) siglist = list(db.signatures()) @@ -567,13 +537,13 @@ def test_lca_index_signatures_method(): def test_lca_index_select(): # test 'select' method from Index base class. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) xx = db.select(ksize=31) assert xx == db - xx = db.select(moltype='DNA') + xx = db.select(moltype="DNA") assert xx == db xx = db.select(abund=False) @@ -583,7 +553,7 @@ def test_lca_index_select(): db.select(ksize=21) with pytest.raises(ValueError): - db.select(moltype='protein') + db.select(moltype="protein") with pytest.raises(ValueError): db.select(abund=True) @@ -592,12 +562,12 @@ def test_lca_index_select(): def test_lca_index_select_picklist(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -605,7 +575,7 @@ def test_lca_index_select_picklist(): siglist = list(db.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.md5sum().startswith('50a92740') + assert ss.md5sum().startswith("50a92740") assert ss.minhash.ksize == 31 @@ -613,14 +583,14 @@ def test_lca_index_find_picklist_check_overlap(): # make sure 'find' works for picklists that exclude relevant signatures # (bug #1638) - query_fn = utils.get_test_data('47.fa.sig') + query_fn = utils.get_test_data("47.fa.sig") query_sig = sourmash.load_one_signature(query_fn, ksize=31) - db_fn = utils.get_test_data('lca/47+63.lca.json') + db_fn = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(db_fn) # construct a picklist... 
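    # (Picklists filter which signatures a database reports; this test
    # picks by 'ident', while the tests above pick by 'md5prefix8' and
    # use pickstyle=PickStyle.EXCLUDE to invert the selection.)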
- picklist = SignaturePicklist('ident') - picklist.init(['NC_009665.1']) + picklist = SignaturePicklist("ident") + picklist.init(["NC_009665.1"]) xx = db.select(picklist=picklist) assert xx == db @@ -632,12 +602,12 @@ def test_lca_index_find_picklist_check_overlap(): def test_lca_index_select_picklist_exclude(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -645,19 +615,19 @@ def test_lca_index_select_picklist_exclude(): siglist = list(db.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.md5sum().startswith('e88dc390') + assert ss.md5sum().startswith("e88dc390") assert ss.minhash.ksize == 31 def test_lca_index_select_picklist_twice(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -668,13 +638,12 @@ def test_lca_index_select_picklist_twice(): assert "we do not (yet) support multiple picklists for LCA databases" in str(exc) - def test_search_db_scaled_gt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) - results = db.search(sig, threshold=.01, ignore_abundance=True) + results = db.search(sig, threshold=0.01, ignore_abundance=True) match_sig = results[0][1] minhash = sig.minhash.downsample(scaled=10000) @@ -682,28 +651,28 @@ def test_search_db_scaled_gt_sig_scaled(): def test_search_db_scaled_lt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) sig = sig.to_mutable() sig.minhash = sig.minhash.downsample(scaled=100000) - results = db.search(sig, threshold=.01, ignore_abundance=True) + results = db.search(sig, threshold=0.01, ignore_abundance=True) print(results) assert results[0].score == 1.0 match = results[0].signature - orig_sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + orig_sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) assert orig_sig.minhash.jaccard(match.minhash, downsample=True) == 1.0 def test_gather_db_scaled_gt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = 
sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) - result = db.best_containment(sig, threshold=.01, ignore_abundance=True) + result = db.best_containment(sig, threshold=0.01, ignore_abundance=True) match_sig = result[1] minhash = sig.minhash.downsample(scaled=10000) @@ -711,12 +680,12 @@ def test_gather_db_scaled_gt_sig_scaled(): def test_gather_db_scaled_lt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) sig_minhash = sig.minhash.downsample(scaled=100000) - result = db.best_containment(sig, threshold=.01, ignore_abundance=True) + result = db.best_containment(sig, threshold=0.01, ignore_abundance=True) match_sig = result[1] minhash = match_sig.minhash.downsample(scaled=100000) @@ -724,7 +693,7 @@ def test_gather_db_scaled_lt_sig_scaled(): def test_db_lineage_to_lid(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._lineage_to_lid @@ -735,15 +704,15 @@ def test_db_lineage_to_lid(): print(items) lin1 = items[0][0][-1] - assert lin1.rank == 'strain' - assert lin1.name == 'Shewanella baltica OS185' + assert lin1.rank == "strain" + assert lin1.name == "Shewanella baltica OS185" lin1 = items[1][0][-1] - assert lin1.rank == 'strain' - assert lin1.name == 'Shewanella baltica OS223' + assert lin1.rank == "strain" + assert lin1.name == "Shewanella baltica OS223" def test_db_lid_to_idx(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._lid_to_idx @@ -756,7 +725,7 @@ def test_db_lid_to_idx(): def test_db_idx_to_ident(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._idx_to_ident @@ -765,23 +734,23 @@ def test_db_idx_to_ident(): assert len(items) == 2 print(items) - assert items == [(32, 'NC_009665'), (48, 'NC_011663')] + assert items == [(32, "NC_009665"), (48, "NC_011663")] ## command line tests def test_run_sourmash_lca(): - status, out, err = utils.runscript('sourmash', ['lca'], fail_ok=True) - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["lca"], fail_ok=True) + assert status != 0 # no args provided, ok ;) def test_basic_index(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -790,38 +759,50 @@ def test_basic_index(runtmp, lca_db_format): assert os.path.exists(lca_db), lca_db - assert 'Building LCA database with ksize=31 scaled=10000 moltype=DNA' in runtmp.last_result.err - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 
'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "Building LCA database with ksize=31 scaled=10000 moltype=DNA" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) def test_basic_index_twice(runtmp, lca_db_format): # run 'lca index' twice. - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) with pytest.raises(SourmashCommandFailed): - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'already exists. Not overwriting.' in runtmp.last_result.err + assert "already exists. Not overwriting." in runtmp.last_result.err def test_basic_index_bad_spreadsheet(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/bad-spreadsheet.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/bad-spreadsheet.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -830,79 +811,112 @@ def test_basic_index_bad_spreadsheet(runtmp, lca_db_format): assert os.path.exists(lca_db), lca_db - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." 
+ in runtmp.last_result.err + ) def test_basic_index_broken_spreadsheet(runtmp, lca_db_format): # duplicate identifiers in this spreadsheet - taxcsv = utils.get_test_data('lca/bad-spreadsheet-2.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/bad-spreadsheet-2.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) assert runtmp.last_result.status != 0 - assert "multiple lineages for identifier TARA_ASE_MAG_00031" in runtmp.last_result.err + assert ( + "multiple lineages for identifier TARA_ASE_MAG_00031" in runtmp.last_result.err + ) def test_basic_index_too_many_strains_too_few_species(runtmp, lca_db_format): # explicit test for #841, where 'n_species' wasn't getting counted # if lineage was at strain level resolution. - taxcsv = utils.get_test_data('lca/podar-lineage.csv') - input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output(f'out.lca.{lca_db_format}') - - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, - '-C', '3', '--split-identifiers', '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/podar-lineage.csv") + input_sig = utils.get_test_data("47.fa.sig") + lca_db = runtmp.output(f"out.lca.{lca_db_format}") + + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + input_sig, + "-C", + "3", + "--split-identifiers", + "-F", + lca_db_format, + ] runtmp.sourmash(*cmd) - assert not 'error: fewer than 20% of lineages' in runtmp.last_result.err + assert "error: fewer than 20% of lineages" not in runtmp.last_result.err assert runtmp.last_result.status == 0 def test_basic_index_too_few_species(runtmp, lca_db_format): # spreadsheets with too few species should be flagged, unless -f specified - taxcsv = utils.get_test_data('lca/tully-genome-sigs.classify.csv') + taxcsv = utils.get_test_data("lca/tully-genome-sigs.classify.csv") # (these don't really matter, should break on load spreadsheet) - input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output(f'out.lca.{lca_db_format}') + input_sig = utils.get_test_data("47.fa.sig") + lca_db = runtmp.output(f"out.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-C', '3', - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-C", "3", "-F", lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) - assert not '"ERROR: fewer than 20% of lineages have species-level resolution' in runtmp.last_result.err + assert ( + '"ERROR: fewer than 20% of lineages have species-level resolution' + not in runtmp.last_result.err + ) assert runtmp.last_result.status != 0 def test_basic_index_require_taxonomy(runtmp, lca_db_format): # no taxonomy in here - taxcsv = utils.get_test_data('lca/bad-spreadsheet-3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - - cmd = ['lca', 'index', '--require-taxonomy', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/bad-spreadsheet-3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") + + cmd = [ + "lca", + "index", + "--require-taxonomy", + taxcsv, + 
lca_db, + input_sig, + "-F", + lca_db_format, + ] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) assert runtmp.last_result.status != 0 - assert "ERROR: no hash values found - are there any signatures?" in runtmp.last_result.err + assert ( + "ERROR: no hash values found - are there any signatures?" + in runtmp.last_result.err + ) def test_basic_index_column_start(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', '-C', '3', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + cmd = ["lca", "index", "-C", "3", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -911,49 +925,71 @@ def test_basic_index_column_start(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) def test_index_empty_sketch_name(runtmp, lca_db_format): c = runtmp # create two signatures with empty 'name' attributes - cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'), - utils.get_test_data('genome-s11.fa.gz')] + cmd = [ + "sketch", + "dna", + utils.get_test_data("genome-s12.fa.gz"), + utils.get_test_data("genome-s11.fa.gz"), + ] c.run_sourmash(*cmd) - sig1 = c.output('genome-s11.fa.gz.sig') + sig1 = c.output("genome-s11.fa.gz.sig") assert os.path.exists(sig1) - sig2 = c.output('genome-s12.fa.gz.sig') + sig2 = c.output("genome-s12.fa.gz.sig") assert os.path.exists(sig2) - outfile = f'zzz.lca.{lca_db_format}' + outfile = f"zzz.lca.{lca_db_format}" # can we insert them both? 
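    # (These two sketches have empty names and no entries in the
    # delmont-1.csv spreadsheet, so 'lca index' should warn about the
    # missing lineages but still build the database.)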
- taxcsv = utils.get_test_data('lca/delmont-1.csv') - cmd = ['lca', 'index', taxcsv, outfile, sig1, sig2, '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/delmont-1.csv") + cmd = ["lca", "index", taxcsv, outfile, sig1, sig2, "-F", lca_db_format] c.run_sourmash(*cmd) assert os.path.exists(c.output(outfile)) print(c.last_result.out) print(c.last_result.err) - assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err + assert "WARNING: no lineage provided for 2 sig" in c.last_result.err def test_basic_index_and_classify_with_tsv_and_gz(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.tsv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + taxcsv = utils.get_test_data("lca/delmont-1.tsv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") - if lca_db_format == 'json': - lca_db = runtmp.output(f'delmont-1.lca.json.gz') + if lca_db_format == "json": + lca_db = runtmp.output("delmont-1.lca.json.gz") else: - lca_db = runtmp.output(f'delmont-1.lca.sql') - - cmd = ['lca', 'index', '--tabs', '--no-header', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + lca_db = runtmp.output("delmont-1.lca.sql") + + cmd = [ + "lca", + "index", + "--tabs", + "--no-header", + taxcsv, + lca_db, + input_sig, + "-F", + lca_db_format, + ] runtmp.sourmash(*cmd) print(cmd) @@ -962,27 +998,36 @@ def test_basic_index_and_classify_with_tsv_and_gz(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig] + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out - assert 'classified 1 signatures total' in runtmp.last_result.err - assert 'loaded 1 LCA databases' in runtmp.last_result.err + assert ( + "ID,status,superkingdom,phylum,class,order,family,genus,species" + in runtmp.last_result.out + ) + assert ( + "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii" + in runtmp.last_result.out + ) + assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_basic_index_and_classify(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -991,31 +1036,55 @@ def test_basic_index_and_classify(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is 
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig]
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 1 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_basic_index_and_classify_dup_lineage(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00007.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_ANW_MAG_00005.sig')
-    lca_db = runtmp.output(f'delmont-dup.lca.{lca_db_format}')
-
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format, '-f']
+    taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00007.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_ANW_MAG_00005.sig")
+    lca_db = runtmp.output(f"delmont-dup.lca.{lca_db_format}")
+
+    cmd = [
+        "lca",
+        "index",
+        taxcsv,
+        lca_db,
+        input_sig1,
+        input_sig2,
+        "-F",
+        lca_db_format,
+        "-f",
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1024,35 +1093,41 @@ def test_basic_index_and_classify_dup_lineage(runtmp, lca_db_format):

    assert os.path.exists(lca_db)

-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out
+    assert (
+        "TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,"
+        in runtmp.last_result.out
+    )

-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig2]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig2]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out
+    assert (
+        "TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,"
+        in runtmp.last_result.out
+    )


def test_index_traverse(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-1.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-1.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")

-    in_dir = runtmp.output('sigs')
+    in_dir = runtmp.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))

-    cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, in_dir, "-F", lca_db_format]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1061,26 +1136,35 @@ def test_index_traverse(runtmp, lca_db_format):

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-    assert 'WARNING: 1 duplicate signatures.' not in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 1 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+    assert "WARNING: 1 duplicate signatures." not in runtmp.last_result.err


def test_index_traverse_force(runtmp, lca_db_format):
    c = runtmp
    # test the use of --force to load all files, not just .sig
-    taxcsv = utils.get_test_data('lca/delmont-1.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-1.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    in_dir = c.output('sigs')
+    in_dir = c.output("sigs")
    os.mkdir(in_dir)
    # name signature .txt instead of .sig:
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.txt'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.txt"))

    # use --force
-    cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-f', '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, in_dir, "-f", "-F", lca_db_format]

    c.run_sourmash(*cmd)

    out = c.last_result.out
@@ -1092,22 +1176,31 @@
    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err
    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err
-    assert 'WARNING: 1 duplicate signatures.' not in err
+    assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err
+    assert "WARNING: 1 duplicate signatures." not in err


def test_index_from_file_cmdline_sig(runtmp, lca_db_format):
    c = runtmp

-    taxcsv = utils.get_test_data('lca/delmont-1.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-1.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    file_list = c.output('sigs.list')
-    with open(file_list, 'wt') as fp:
+    file_list = c.output("sigs.list")
+    with open(file_list, "w") as fp:
        print(input_sig, file=fp)

-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--from-file', file_list,
-           '-F', lca_db_format]
+    cmd = [
+        "lca",
+        "index",
+        taxcsv,
+        lca_db,
+        input_sig,
+        "--from-file",
+        file_list,
+        "-F",
+        lca_db_format,
+    ]

    c.run_sourmash(*cmd)

    out = c.last_result.out
@@ -1119,23 +1212,31 @@
    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err
    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err
-    assert 'WARNING: 1 duplicate signatures.' in err
+    assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err
+    assert "WARNING: 1 duplicate signatures." in err


def test_index_from_file(runtmp, lca_db_format):
    c = runtmp

-    taxcsv = utils.get_test_data('lca/delmont-1.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-1.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    file_list = c.output('sigs.list')
-    with open(file_list, 'wt') as fp:
+    file_list = c.output("sigs.list")
+    with open(file_list, "w") as fp:
        print(input_sig, file=fp)

-    cmd = ['lca', 'index', taxcsv, lca_db, '--from-file', file_list,
-           '-F', lca_db_format]
+    cmd = [
+        "lca",
+        "index",
+        taxcsv,
+        lca_db,
+        "--from-file",
+        file_list,
+        "-F",
+        lca_db_format,
+    ]

    c.run_sourmash(*cmd)

    out = c.last_result.out
@@ -1147,33 +1248,41 @@
    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err
    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err
+    assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err


def test_index_fail_on_num(runtmp, lca_db_format):
    c = runtmp
    # lca index should yield a decent error message when attempted on 'num'
-    sigfile = utils.get_test_data('num/63.fa.sig')
-    taxcsv = utils.get_test_data('lca/podar-lineage.csv')
+    sigfile = utils.get_test_data("num/63.fa.sig")
+    taxcsv = utils.get_test_data("lca/podar-lineage.csv")

    with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('lca', 'index', taxcsv, f'xxx.lca.{lca_db_format}', sigfile,
-                       '-C', '3', '-F', lca_db_format)
+        c.run_sourmash(
+            "lca",
+            "index",
+            taxcsv,
+            f"xxx.lca.{lca_db_format}",
+            sigfile,
+            "-C",
+            "3",
+            "-F",
+            lca_db_format,
+        )

    err = c.last_result.err
    print(err)

-    assert 'ERROR: cannot insert signature ' in err
-    assert 'ERROR: cannot downsample signature; is it a scaled signature?' in err
+    assert "ERROR: cannot insert signature " in err
+    assert "ERROR: cannot downsample signature; is it a scaled signature?" in err


def test_index_traverse_real_spreadsheet_no_report(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")

-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-f',
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-f", "-F", lca_db_format]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1182,22 +1291,44 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '1 identifiers used out of 957 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-    assert 'WARNING: no signatures for 956 spreadsheet rows.' in runtmp.last_result.err
-    assert 'WARNING: 105 unused lineages.' in runtmp.last_result.err
-    assert '(You can use --report to generate a detailed report.)' in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 957 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+    assert "WARNING: no signatures for 956 spreadsheet rows." in runtmp.last_result.err
+    assert "WARNING: 105 unused lineages." in runtmp.last_result.err
+    assert (
+        "(You can use --report to generate a detailed report.)"
+        in runtmp.last_result.err
+    )


def test_index_traverse_real_spreadsheet_report(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
-    report_loc = runtmp.output('report.txt')
-
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--report',
-           report_loc, '-f', '-F', lca_db_format]
+    taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
+    report_loc = runtmp.output("report.txt")
+
+    cmd = [
+        "lca",
+        "index",
+        taxcsv,
+        lca_db,
+        input_sig,
+        "--report",
+        report_loc,
+        "-f",
+        "-F",
+        lca_db_format,
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1206,148 +1337,191 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '1 identifiers used out of 957 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-    assert 'WARNING: no signatures for 956 spreadsheet rows.' in runtmp.last_result.err
-    assert 'WARNING: 105 unused lineages.' in runtmp.last_result.err
-    assert '(You can use --report to generate a detailed report.)' not in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 957 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+    assert "WARNING: no signatures for 956 spreadsheet rows." in runtmp.last_result.err
+    assert "WARNING: 105 unused lineages." in runtmp.last_result.err
+    assert (
+        "(You can use --report to generate a detailed report.)"
+        not in runtmp.last_result.err
+    )

    assert os.path.exists(report_loc)


def test_single_classify(runtmp):
    # run a basic 'classify', check output.
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_single_classify_zip_query(runtmp):
    # run 'classify' with a query in a zipfile
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

    query_ss = sourmash.load_one_signature(input_sig, ksize=31)

-    query_zipfile = runtmp.output('query.zip')
+    query_zipfile = runtmp.output("query.zip")
    with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig:
        save_sig.add(query_ss)

-    cmd = ['lca', 'classify', '--db', db1, '--query', query_zipfile]
+    cmd = ["lca", "classify", "--db", db1, "--query", query_zipfile]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_single_classify_to_output(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-
-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig,
-           '-o', runtmp.output('outfile.txt')]
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+
+    cmd = [
+        "lca",
+        "classify",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+        "-o",
+        runtmp.output("outfile.txt"),
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    with open(runtmp.output('outfile.txt'), 'rt') as fp:
+    with open(runtmp.output("outfile.txt")) as fp:
        outdata = fp.read()

-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in outdata
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in outdata
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_single_classify_to_output_no_name(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

    ss = sourmash.load_one_signature(input_sig, ksize=31)

-    outsig_filename = runtmp.output('q.sig')
-    with open(outsig_filename, 'wt') as fp:
+    outsig_filename = runtmp.output("q.sig")
+    with open(outsig_filename, "w") as fp:
        # remove name from signature here --
-        new_sig = sourmash.SourmashSignature(ss.minhash, filename='xyz')
+        new_sig = sourmash.SourmashSignature(ss.minhash, filename="xyz")
        sourmash.save_signatures([new_sig], fp)

-    cmd = ['lca', 'classify', '--db', db1, '--query', outsig_filename,
-           '-o', runtmp.output('outfile.txt')]
+    cmd = [
+        "lca",
+        "classify",
+        "--db",
+        db1,
+        "--query",
+        outsig_filename,
+        "-o",
+        runtmp.output("outfile.txt"),
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    with open(runtmp.output('outfile.txt'), 'rt') as fp:
+    with open(runtmp.output("outfile.txt")) as fp:
        outdata = fp.read()

    print((outdata,))
-    assert 'xyz,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in outdata
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "xyz,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in outdata
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_single_classify_empty(runtmp):
-    db1 = utils.get_test_data(f'lca/both.lca.json')
-    input_sig = utils.get_test_data('GCF_000005845.2_ASM584v2_genomic.fna.gz.sig')
+    db1 = utils.get_test_data("lca/both.lca.json")
+    input_sig = utils.get_test_data("GCF_000005845.2_ASM584v2_genomic.fna.gz.sig")

-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'GCF_000005845,nomatch,,,,,,,,' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert "GCF_000005845,nomatch,,,,,,,," in runtmp.last_result.out
assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_single_classify_traverse(runtmp): - db1 = utils.get_test_data(f'lca/delmont-1.lca.json') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - in_dir = runtmp.output('sigs') + db1 = utils.get_test_data("lca/delmont-1.lca.json") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + in_dir = runtmp.output("sigs") os.mkdir(in_dir) - shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig')) + shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig")) - cmd = ['lca', 'classify', '--db', db1, '--query', input_sig] + cmd = ["lca", "classify", "--db", db1, "--query", input_sig] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out - assert 'classified 1 signatures total' in runtmp.last_result.err - assert 'loaded 1 LCA databases' in runtmp.last_result.err + assert ( + "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii" + in runtmp.last_result.out + ) + assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_multi_query_classify_traverse(runtmp): # both.lca.json is built from both dir and dir2 - db1 = utils.get_test_data(f'lca/both.lca.json') - dir1 = utils.get_test_data('lca/dir1') - dir2 = utils.get_test_data('lca/dir2') + db1 = utils.get_test_data("lca/both.lca.json") + dir1 = utils.get_test_data("lca/dir1") + dir2 = utils.get_test_data("lca/dir2") - cmd = ['lca', 'classify', '--db', db1, '--query', dir1, dir2] + cmd = ["lca", "classify", "--db", db1, "--query", dir1, dir2] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(utils.get_test_data('lca/classify-by-both.csv')) as fp: + with open(utils.get_test_data("lca/classify-by-both.csv")) as fp: fp_lines = fp.readlines() out_lines = runtmp.last_result.out.splitlines() @@ -1362,22 +1536,22 @@ def test_multi_query_classify_traverse(runtmp): @utils.in_tempdir def test_multi_query_classify_query_from_file(c): # both.lca.json is built from both dir and dir2 - db1 = utils.get_test_data('lca/both.lca.json') - dir1_glob = utils.get_test_data('lca/dir1/*.sig') + db1 = utils.get_test_data("lca/both.lca.json") + dir1_glob = utils.get_test_data("lca/dir1/*.sig") dir1_files = glob.glob(dir1_glob) - dir2_glob = utils.get_test_data('lca/dir2/*.sig') + dir2_glob = utils.get_test_data("lca/dir2/*.sig") dir2_files = glob.glob(dir2_glob) - file_list = c.output('file.list') - with open(file_list, 'wt') as fp: + file_list = c.output("file.list") + with open(file_list, "w") as fp: print("\n".join(dir1_files), file=fp) print("\n".join(dir2_files), file=fp) - cmd = ['lca', 'classify', '--db', db1, '--query-from-file', file_list] + cmd = ["lca", "classify", "--db", db1, "--query-from-file", file_list] c.run_sourmash(*cmd) out = c.last_result.out - with open(utils.get_test_data('lca/classify-by-both.csv')) as fp: + with open(utils.get_test_data("lca/classify-by-both.csv")) as fp: fp_lines = fp.readlines() out_lines = out.splitlines() @@ -1392,23 +1566,31 @@ def test_multi_query_classify_query_from_file(c): @utils.in_tempdir def test_multi_query_classify_query_from_file_and_query(c): # 
both.lca.json is built from both dir and dir2 - db1 = utils.get_test_data(f'lca/both.lca.json') - dir1_glob = utils.get_test_data('lca/dir1/*.sig') + db1 = utils.get_test_data("lca/both.lca.json") + dir1_glob = utils.get_test_data("lca/dir1/*.sig") dir1_files = glob.glob(dir1_glob) - dir2_glob = utils.get_test_data('lca/dir2/*.sig') + dir2_glob = utils.get_test_data("lca/dir2/*.sig") dir2_files = glob.glob(dir2_glob) - file_list = c.output('file.list') - with open(file_list, 'wt') as fp: - print("\n".join(dir1_files[1:]), file=fp) # leave off first one + file_list = c.output("file.list") + with open(file_list, "w") as fp: + print("\n".join(dir1_files[1:]), file=fp) # leave off first one print("\n".join(dir2_files), file=fp) - cmd = ['lca', 'classify', '--db', db1, '--query', dir1_files[0], - '--query-from-file', file_list] + cmd = [ + "lca", + "classify", + "--db", + db1, + "--query", + dir1_files[0], + "--query-from-file", + file_list, + ] c.run_sourmash(*cmd) out = c.last_result.out - with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp: + with open(utils.get_test_data("lca/classify-by-both.csv")) as fp: fp_lines = fp.readlines() out_lines = out.splitlines() @@ -1422,19 +1604,19 @@ def test_multi_query_classify_query_from_file_and_query(c): def test_multi_db_multi_query_classify_traverse(runtmp): # two halves of both.lca.json, see above test. - db1 = utils.get_test_data(f'lca/dir1.lca.json') - db2 = utils.get_test_data(f'lca/dir2.lca.json') - dir1 = utils.get_test_data('lca/dir1') - dir2 = utils.get_test_data('lca/dir2') + db1 = utils.get_test_data("lca/dir1.lca.json") + db2 = utils.get_test_data("lca/dir2.lca.json") + dir1 = utils.get_test_data("lca/dir1") + dir2 = utils.get_test_data("lca/dir2") - cmd = ['lca', 'classify', '--db', db1, db2, '--query', dir1, dir2] + cmd = ["lca", "classify", "--db", db1, db2, "--query", dir1, dir2] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp: + with open(utils.get_test_data("lca/classify-by-both.csv")) as fp: fp_lines = fp.readlines() out_lines = runtmp.last_result.out.splitlines() @@ -1447,11 +1629,11 @@ def test_multi_db_multi_query_classify_traverse(runtmp): def test_unassigned_internal_index_and_classify(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-4.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-4.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1460,29 +1642,44 @@ def test_unassigned_internal_index_and_classify(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' 
in runtmp.last_result.err - - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig] + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,unassigned,Alteromonadaceae,unassigned,Alteromonas_macleodii' in runtmp.last_result.out - assert 'classified 1 signatures total' in runtmp.last_result.err - assert 'loaded 1 LCA databases' in runtmp.last_result.err + assert ( + "ID,status,superkingdom,phylum,class,order,family,genus,species" + in runtmp.last_result.out + ) + assert ( + "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,unassigned,Alteromonadaceae,unassigned,Alteromonas_macleodii" + in runtmp.last_result.out + ) + assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_unassigned_last_index_and_classify(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-5.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-5.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1491,31 +1688,45 @@ def test_unassigned_last_index_and_classify(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig] + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." 
+        in runtmp.last_result.err
+    )
+
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_index_and_classify_internal_unassigned_multi(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")

-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1524,35 +1735,56 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )

    # classify input_sig1
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err

    # classify input_sig2
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig2]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig2]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err


def test_classify_majority_vote_1(runtmp, lca_db_format):
@@ -1560,13 +1792,14 @@
    c = runtmp

    # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )

    print(c.last_command)
    print(c.last_result.out)
@@ -1574,26 +1807,46 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )

    # merge input_sig1 and input_sig2
-    c.run_sourmash('signature', 'merge', input_sig1, input_sig2, '-k', '31', '--flatten', '-o', 'sig1and2.sig')
-    sig1and2 = c.output('sig1and2.sig')
+    c.run_sourmash(
+        "signature",
+        "merge",
+        input_sig1,
+        input_sig2,
+        "-k",
+        "31",
+        "--flatten",
+        "-o",
+        "sig1and2.sig",
+    )
+    sig1and2 = c.output("sig1and2.sig")

    # lca classify should yield no results
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', sig1and2)
+    c.run_sourmash("lca", "classify", "--db", lca_db, "--query", sig1and2)

    print(c.last_command)
    print(c.last_result.out)
    print(c.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'disagree,,,,,,,,' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
-
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert "disagree,,,,,,,," in c.last_result.out
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err


def test_classify_majority_vote_2(runtmp, lca_db_format):
@@ -1603,13 +1856,14 @@
    c = runtmp

    # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )

    print(c.last_command)
    print(c.last_result.out)
@@ -1617,25 +1871,49 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )

    # merge input_sig1 and input_sig2
-    c.run_sourmash('signature', 'merge', input_sig1, input_sig2, '-k', '31', '--flatten', '-o', 'sig1and2.sig')
-    sig1and2 = c.output('sig1and2.sig')
+    c.run_sourmash(
+        "signature",
+        "merge",
+        input_sig1,
+        input_sig2,
+        "-k",
+        "31",
+        "--flatten",
+        "-o",
+        "sig1and2.sig",
+    )
+    sig1and2 = c.output("sig1and2.sig")

    # majority vote classify
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', sig1and2, '--majority')
+    c.run_sourmash("lca", "classify", "--db", lca_db, "--query", sig1and2, "--majority")

    print(c.last_command)
    print(c.last_result.out)
    print(c.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert (
+        "found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus"
+        in c.last_result.out
+    )
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err


def test_classify_majority_vote_3(runtmp, lca_db_format):
@@ -1643,13 +1921,14 @@
    c = runtmp

    # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")

-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )

    print(c.last_command)
    print(c.last_result.out)
@@ -1657,51 +1936,70 @@

    assert os.path.exists(lca_db)

-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )

    # obtain testdata '47.fa.sig'
-    testdata1 = utils.get_test_data('47.fa.sig')
+    testdata1 = utils.get_test_data("47.fa.sig")

    # majority vote classify
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', testdata1, '--majority')
+    c.run_sourmash(
+        "lca", "classify", "--db", lca_db, "--query", testdata1, "--majority"
+    )

    print(c.last_command)
    print(c.last_result.out)
    print(c.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'nomatch,,,,,,,,' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert "nomatch,,,,,,,," in c.last_result.out
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err


def test_multi_db_classify(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    db2 = utils.get_test_data('lca/delmont-2.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    db2 = utils.get_test_data("lca/delmont-2.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

-    cmd = ['lca', 'classify', '--db', db1, db2, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, db2, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 2 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 2 LCA databases" in runtmp.last_result.err


def test_classify_unknown_hashes(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")

-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1710,59 +2008,81 @@

    assert os.path.exists(lca_db)

-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )

-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert '(root)' not in runtmp.last_result.out
-    assert 'TARA_MED_MAG_00029,found,Archaea,Euryarcheoata,unassigned,unassigned,novelFamily_I' in runtmp.last_result.out
+    assert "(root)" not in runtmp.last_result.out
+    assert (
+        "TARA_MED_MAG_00029,found,Archaea,Euryarcheoata,unassigned,unassigned,novelFamily_I"
+        in runtmp.last_result.out
+    )


def test_single_summarize(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "summarize", "--db", db1, "--query", input_sig]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in runtmp.last_result.out
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in runtmp.last_result.out
+    )


def test_single_summarize_singleton(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig,]
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in runtmp.last_result.out
-    assert 'test-data/lca/TARA_ASE_MAG_00031.sig:5b438c6c TARA_ASE_MAG_00031' in runtmp.last_result.out
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in runtmp.last_result.out
+    )
+    assert (
+        "test-data/lca/TARA_ASE_MAG_00031.sig:5b438c6c TARA_ASE_MAG_00031"
+        in runtmp.last_result.out
+    )


@utils.in_tempdir
def test_single_summarize_traverse(c):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = c.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = c.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))

-    cmd = ['lca', 'summarize', '--db', db1, '--query', in_dir]
+    cmd = ["lca", "summarize", "--db", db1, "--query", in_dir]
    c.run_sourmash(*cmd)

    out = c.last_result.out
@@ -1770,18 +2090,22 @@
    err = c.last_result.err
    print(err)

-    assert 'loaded 1 signatures from 1 files total.' in err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in out
+    assert "loaded 1 signatures from 1 files total." in err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in out
+    )
+

@utils.in_tempdir
def test_single_summarize_singleton_traverse(c):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = c.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = c.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))

-    cmd = ['lca', 'summarize', '--db', db1, '--query', in_dir]
+    cmd = ["lca", "summarize", "--db", db1, "--query", in_dir]
    c.run_sourmash(*cmd)

    out = c.last_result.out
@@ -1789,63 +2113,89 @@
    err = c.last_result.err
    print(err)

-    assert 'loaded 1 signatures from 1 files total.' in err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in out
-    assert 'q.sig:5b438c6c TARA_ASE_MAG_00031' in out
+    assert "loaded 1 signatures from 1 files total." in err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in out
+    )
+    assert "q.sig:5b438c6c TARA_ASE_MAG_00031" in out


def test_single_summarize_to_output(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig,
-           '-o', runtmp.output('output.txt')]
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+        "-o",
+        runtmp.output("output.txt"),
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    with open(runtmp.output('output.txt'), 'rt') as fp:
+    with open(runtmp.output("output.txt")) as fp:
        outdata = fp.read()

-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '200,Bacteria,Proteobacteria,Gammaproteobacteria' in outdata
-
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert "200,Bacteria,Proteobacteria,Gammaproteobacteria" in outdata


def test_single_summarize_to_output_check_filename(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', os.path.join(in_dir, 'q.sig'),
-           '-o', runtmp.output('output.txt')]
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        os.path.join(in_dir, "q.sig"),
+        "-o",
+        runtmp.output("output.txt"),
+    ]

    runtmp.sourmash(*cmd)

    print(cmd)
    print(runtmp.last_result.out)
    print(runtmp.last_result.err)

-    outdata = Path(runtmp.output('output.txt')).read_text()
-
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert 'count,superkingdom,phylum,class,order,family,genus,species,strain,filename,sig_name,sig_md5,total_counts\n' in outdata
-    assert '200,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,,'+os.path.join(in_dir, 'q.sig')+',TARA_ASE_MAG_00031,5b438c6c858cdaf9e9b05a207fa3f9f0,200.0\n' in outdata
+    outdata = Path(runtmp.output("output.txt")).read_text()
+
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "count,superkingdom,phylum,class,order,family,genus,species,strain,filename,sig_name,sig_md5,total_counts\n"
+        in outdata
+    )
+    assert (
+        "200,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,,"
+        + os.path.join(in_dir, "q.sig")
+        + ",TARA_ASE_MAG_00031,5b438c6c858cdaf9e9b05a207fa3f9f0,200.0\n"
+        in outdata
+    )
    print(outdata)


def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")

-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]

    runtmp.sourmash(*cmd)

    print(cmd)
@@ -1854,78 +2204,89 @@ def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp, lca_db_fo

    assert os.path.exists(lca_db)

-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+ in runtmp.last_result.err + ) - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1, - '-o', 'out.csv'] + cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1, "-o", "out.csv"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '(root)' not in runtmp.last_result.out - assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out + assert "(root)" not in runtmp.last_result.out + assert ( + "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I" + in runtmp.last_result.out + ) - with open(runtmp.output('out.csv'), newline="") as fp: + with open(runtmp.output("out.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) - pairs = [ (row['count'], row['total_counts']) for row in rows ] - pairs = [ (float(x), float(y)) for x, y in pairs ] + pairs = [(row["count"], row["total_counts"]) for row in rows] + pairs = [(float(x), float(y)) for x, y in pairs] pairs = set(pairs) - assert pairs == { (27.0, 234.0) } + assert pairs == {(27.0, 234.0)} def test_single_summarize_scaled(runtmp): - db1 = utils.get_test_data('lca/delmont-1.lca.json') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - in_dir = runtmp.output('sigs') + db1 = utils.get_test_data("lca/delmont-1.lca.json") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + in_dir = runtmp.output("sigs") os.mkdir(in_dir) - shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig')) + shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig")) - cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig, - '--scaled', '100000'] + cmd = ["lca", "summarize", "--db", db1, "--query", input_sig, "--scaled", "100000"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err - assert '100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' + assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err + assert "100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales" def test_single_summarize_scaled_zip_query(runtmp): # check zipfile as query - db1 = utils.get_test_data('lca/delmont-1.lca.json') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + db1 = utils.get_test_data("lca/delmont-1.lca.json") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") query_ss = sourmash.load_one_signature(input_sig, ksize=31) - query_zipfile = runtmp.output('query.zip') + query_zipfile = runtmp.output("query.zip") with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig: save_sig.add(query_ss) - cmd = ['lca', 'summarize', '--db', db1, '--query', query_zipfile, - '--scaled', '100000'] + cmd = [ + "lca", + "summarize", + "--db", + db1, + "--query", + query_zipfile, + "--scaled", + "100000", + ] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err - assert '100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' + assert "loaded 1 signatures from 1 files total." 
in runtmp.last_result.err + assert "100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales" def test_multi_summarize_with_unassigned_singleton(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-6.csv') - input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-6.csv") + input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1934,21 +2295,39 @@ def test_multi_summarize_with_unassigned_singleton(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1, - input_sig2, '--ignore-abundance'] + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "2 identifiers used out of 2 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + + cmd = [ + "lca", + "summarize", + "--db", + lca_db, + "--query", + input_sig1, + input_sig2, + "--ignore-abundance", + ] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 2 signatures from 2 files total.' in runtmp.last_result.err + assert "loaded 2 signatures from 2 files total." 
in runtmp.last_result.err out_lines = runtmp.last_result.out.splitlines() + def remove_line_startswith(x, check=None): for line in out_lines: if line.startswith(x): @@ -1957,32 +2336,45 @@ def remove_line_startswith(x, check=None): # make sure the check value is in there assert check in line return line - assert 0, "couldn't find {}".format(x) + assert 0, f"couldn't find {x}" # note, proportions/percentages are now per-file - remove_line_startswith('100.0% 200 Bacteria ', 'TARA_ASE_MAG_00031.sig:5b438c6c') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta ') - remove_line_startswith('100.0% 1231 Eukaryota ', 'TARA_PSW_MAG_00136.sig:db50b713') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria ') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus ') + remove_line_startswith( + "100.0% 200 Bacteria ", "TARA_ASE_MAG_00031.sig:5b438c6c" + ) + remove_line_startswith( + "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned " + ) + remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta ") + remove_line_startswith( + "100.0% 1231 Eukaryota ", "TARA_PSW_MAG_00136.sig:db50b713" + ) + remove_line_startswith("100.0% 200 Bacteria;Proteobacteria ") + remove_line_startswith("100.0% 200 Bacteria;Proteobacteria;unassigned ") + remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ") + remove_line_startswith( + "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus " + ) assert not out_lines def test_multi_summarize_with_zip_unassigned_singleton(runtmp, lca_db_format): # test summarize on multiple queries, in a zipfile. 
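    # note: both query signatures are written into a single query.zip below via
    # SaveSignaturesToLocation, so summarize should load 2 signatures from 1 file.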
- taxcsv = utils.get_test_data('lca/delmont-6.csv') - input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-6.csv") + input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1991,31 +2383,48 @@ def test_multi_summarize_with_zip_unassigned_singleton(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - query_zipfile = runtmp.output('query.zip') + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "2 identifiers used out of 2 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + + query_zipfile = runtmp.output("query.zip") with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig: - input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") sig1 = sourmash.load_one_signature(input_sig1, ksize=31) - input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') + input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") sig2 = sourmash.load_one_signature(input_sig2, ksize=31) save_sig.add(sig1) save_sig.add(sig2) - cmd = ['lca', 'summarize', '--db', lca_db, '--query', 'query.zip', - '--ignore-abundance'] + cmd = [ + "lca", + "summarize", + "--db", + lca_db, + "--query", + "query.zip", + "--ignore-abundance", + ] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 2 signatures from 1 files total.' in runtmp.last_result.err + assert "loaded 2 signatures from 1 files total." 
in runtmp.last_result.err out_lines = runtmp.last_result.out.splitlines() + def remove_line_startswith(x, check=None): for line in out_lines: if line.startswith(x): @@ -2024,31 +2433,40 @@ def remove_line_startswith(x, check=None): # make sure the check value is in there assert check in line return line - assert 0, "couldn't find {}".format(x) + assert 0, f"couldn't find {x}" # note, proportions/percentages are now per-file - remove_line_startswith('100.0% 200 Bacteria ', ':5b438c6c') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta ') - remove_line_startswith('100.0% 1231 Eukaryota ', ':db50b713') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria ') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ') - remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned ') - remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus ') + remove_line_startswith("100.0% 200 Bacteria ", ":5b438c6c") + remove_line_startswith( + "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned " + ) + remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta ") + remove_line_startswith("100.0% 1231 Eukaryota ", ":db50b713") + remove_line_startswith("100.0% 200 Bacteria;Proteobacteria ") + remove_line_startswith("100.0% 200 Bacteria;Proteobacteria;unassigned ") + remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ") + remove_line_startswith( + "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned " + ) + remove_line_startswith( + "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus " + ) assert not out_lines def test_summarize_to_root(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca-root/tax.csv') - input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca-root/tax.csv") + input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") + lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -2057,27 +2475,37 @@ def test_summarize_to_root(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig2, - '--ignore-abundance'] + assert ( + "2 identifiers used out of 2 distinct identifiers in spreadsheet." 
+ in runtmp.last_result.err + ) + + cmd = [ + "lca", + "summarize", + "--db", + lca_db, + "--query", + input_sig2, + "--ignore-abundance", + ] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '78.6% 99 Archaea' in runtmp.last_result.out - assert '21.4% 27 (root)' in runtmp.last_result.out + assert "78.6% 99 Archaea" in runtmp.last_result.out + assert "21.4% 27 (root)" in runtmp.last_result.out def test_summarize_unknown_hashes(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca-root/tax.csv') - input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca-root/tax.csv") + input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") + lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -2086,27 +2514,32 @@ def test_summarize_unknown_hashes(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "1 identifiers used out of 2 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1] + cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '(root)' not in runtmp.last_result.out - assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out + assert "(root)" not in runtmp.last_result.out + assert ( + "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I" + in runtmp.last_result.out + ) def test_summarize_to_root_abund(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca-root/tax.csv') - input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca-root/tax.csv") + input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") + lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -2115,26 +2548,29 @@ def test_summarize_to_root_abund(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "2 identifiers used out of 2 distinct identifiers in spreadsheet." 
+ in runtmp.last_result.err + ) - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig2] + cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig2] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '78.9% 101 Archaea' in runtmp.last_result.out - assert '21.1% 27 (root)' in runtmp.last_result.out + assert "78.9% 101 Archaea" in runtmp.last_result.out + assert "21.1% 27 (root)" in runtmp.last_result.out def test_summarize_unknown_hashes_abund(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca-root/tax.csv') - input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca-root/tax.csv") + input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") + lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -2143,61 +2579,73 @@ def test_summarize_unknown_hashes_abund(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "1 identifiers used out of 2 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) - cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1] + cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '(root)' not in runtmp.last_result.out - assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out + assert "(root)" not in runtmp.last_result.out + assert ( + "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I" + in runtmp.last_result.out + ) @utils.in_thisdir def test_summarize_abund_hmp(c): # test lca summarize --with-abundance on some real data - queryfile = utils.get_test_data('hmp-sigs/G36354.sig.gz') - dbname = utils.get_test_data('hmp-sigs/G36354-matches.lca.json.gz') + queryfile = utils.get_test_data("hmp-sigs/G36354.sig.gz") + dbname = utils.get_test_data("hmp-sigs/G36354-matches.lca.json.gz") - c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile) + c.run_sourmash("lca", "summarize", "--db", dbname, "--query", queryfile) - assert '32.1% 1080 p__Firmicutes;c__Bacilli;o__Lactobacillales' in c.last_result.out + assert ( + "32.1% 1080 p__Firmicutes;c__Bacilli;o__Lactobacillales" in c.last_result.out + ) @utils.in_thisdir def test_summarize_abund_fake_no_abund(c): # test lca summarize on some known/fake data; see docs for explanation. 
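    # with --ignore-abundance each hash counts once; compare the Bacteria/Archaea
    # split here against the abundance-weighted version in the next test.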
- queryfile = utils.get_test_data('fake-abund/query.sig.gz') - dbname = utils.get_test_data('fake-abund/matches.lca.json.gz') + queryfile = utils.get_test_data("fake-abund/query.sig.gz") + dbname = utils.get_test_data("fake-abund/matches.lca.json.gz") - c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile, - '--ignore-abundance') + c.run_sourmash( + "lca", "summarize", "--db", dbname, "--query", queryfile, "--ignore-abundance" + ) - assert 'NOTE: discarding abundances in query, since --ignore-abundance' in c.last_result.err - assert '79.6% 550 Bacteria' in c.last_result.out - assert '20.4% 141 Archaea' in c.last_result.out + assert ( + "NOTE: discarding abundances in query, since --ignore-abundance" + in c.last_result.err + ) + assert "79.6% 550 Bacteria" in c.last_result.out + assert "20.4% 141 Archaea" in c.last_result.out @utils.in_thisdir def test_summarize_abund_fake_yes_abund(c): # test lca summarize abundance weighting on some known/fake data - queryfile = utils.get_test_data('fake-abund/query.sig.gz') - dbname = utils.get_test_data('fake-abund/matches.lca.json.gz') + queryfile = utils.get_test_data("fake-abund/query.sig.gz") + dbname = utils.get_test_data("fake-abund/matches.lca.json.gz") - c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile) + c.run_sourmash("lca", "summarize", "--db", dbname, "--query", queryfile) - assert '43.2% 563 Bacteria' in c.last_result.out - assert '56.8% 740 Archaea' in c.last_result.out + assert "43.2% 563 Bacteria" in c.last_result.out + assert "56.8% 740 Archaea" in c.last_result.out def test_rankinfo_on_multi(runtmp): - db1 = utils.get_test_data('lca/dir1.lca.json') - db2 = utils.get_test_data('lca/dir2.lca.json') + db1 = utils.get_test_data("lca/dir1.lca.json") + db2 = utils.get_test_data("lca/dir2.lca.json") - cmd = ['lca', 'rankinfo', db1, db2] + cmd = ["lca", "rankinfo", db1, db2] runtmp.sourmash(*cmd) print(cmd) @@ -2205,22 +2653,22 @@ def test_rankinfo_on_multi(runtmp): print(runtmp.last_result.err) lines = runtmp.last_result.out.splitlines() - lines.remove('superkingdom: 0 (0.0%)') - lines.remove('phylum: 464 (12.8%)') - lines.remove('class: 533 (14.7%)') - lines.remove('order: 1050 (29.0%)') - lines.remove('family: 695 (19.2%)') - lines.remove('genus: 681 (18.8%)') - lines.remove('species: 200 (5.5%)') - lines.remove('strain: 0 (0.0%)') + lines.remove("superkingdom: 0 (0.0%)") + lines.remove("phylum: 464 (12.8%)") + lines.remove("class: 533 (14.7%)") + lines.remove("order: 1050 (29.0%)") + lines.remove("family: 695 (19.2%)") + lines.remove("genus: 681 (18.8%)") + lines.remove("species: 200 (5.5%)") + lines.remove("strain: 0 (0.0%)") assert not lines def test_rankinfo_on_single(runtmp): - db1 = utils.get_test_data('lca/both.lca.json') + db1 = utils.get_test_data("lca/both.lca.json") - cmd = ['lca', 'rankinfo', db1] + cmd = ["lca", "rankinfo", db1] runtmp.sourmash(*cmd) print(cmd) @@ -2228,46 +2676,55 @@ def test_rankinfo_on_single(runtmp): print(runtmp.last_result.err) lines = runtmp.last_result.out.splitlines() - lines.remove('superkingdom: 0 (0.0%)') - lines.remove('phylum: 464 (12.8%)') - lines.remove('class: 533 (14.7%)') - lines.remove('order: 1050 (29.0%)') - lines.remove('family: 695 (19.2%)') - lines.remove('genus: 681 (18.8%)') - lines.remove('species: 200 (5.5%)') - lines.remove('strain: 0 (0.0%)') + lines.remove("superkingdom: 0 (0.0%)") + lines.remove("phylum: 464 (12.8%)") + lines.remove("class: 533 (14.7%)") + lines.remove("order: 1050 (29.0%)") + lines.remove("family: 695 (19.2%)") + 
lines.remove("genus: 681 (18.8%)") + lines.remove("species: 200 (5.5%)") + lines.remove("strain: 0 (0.0%)") assert not lines def test_rankinfo_no_tax(runtmp, lca_db_format): # note: TARA_PSW_MAG_00136 is _not_ in delmont-1.csv. - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) - print('cmd:', cmd) - print('out:', runtmp.last_result.out) - print('err:', runtmp.last_result.err) + print("cmd:", cmd) + print("out:", runtmp.last_result.out) + print("err:", runtmp.last_result.err) assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '0 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - cmd = ['lca', 'rankinfo', lca_db] + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "0 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + + cmd = ["lca", "rankinfo", lca_db] runtmp.sourmash(*cmd) def test_rankinfo_with_min(runtmp): - db1 = utils.get_test_data('lca/dir1.lca.json') - db2 = utils.get_test_data('lca/dir2.lca.json') + db1 = utils.get_test_data("lca/dir1.lca.json") + db2 = utils.get_test_data("lca/dir2.lca.json") - cmd = ['lca', 'rankinfo', db1, db2, '--minimum-num', '1'] + cmd = ["lca", "rankinfo", db1, db2, "--minimum-num", "1"] runtmp.sourmash(*cmd) print(cmd) @@ -2275,23 +2732,23 @@ def test_rankinfo_with_min(runtmp): print(runtmp.last_result.err) lines = runtmp.last_result.out.splitlines() - lines.remove('superkingdom: 0 (0.0%)') - lines.remove('phylum: 464 (12.8%)') - lines.remove('class: 533 (14.7%)') - lines.remove('order: 1050 (29.0%)') - lines.remove('family: 695 (19.2%)') - lines.remove('genus: 681 (18.8%)') - lines.remove('species: 200 (5.5%)') - lines.remove('strain: 0 (0.0%)') + lines.remove("superkingdom: 0 (0.0%)") + lines.remove("phylum: 464 (12.8%)") + lines.remove("class: 533 (14.7%)") + lines.remove("order: 1050 (29.0%)") + lines.remove("family: 695 (19.2%)") + lines.remove("genus: 681 (18.8%)") + lines.remove("species: 200 (5.5%)") + lines.remove("strain: 0 (0.0%)") assert not lines def test_rankinfo_with_min_2(runtmp): - db1 = utils.get_test_data('lca/dir1.lca.json') - db2 = utils.get_test_data('lca/dir2.lca.json') + db1 = utils.get_test_data("lca/dir1.lca.json") + db2 = utils.get_test_data("lca/dir2.lca.json") - cmd = ['lca', 'rankinfo', db1, db2, '--minimum-num', '2'] + cmd = ["lca", "rankinfo", db1, db2, "--minimum-num", "2"] runtmp.sourmash(*cmd) print(cmd) @@ -2302,126 +2759,186 @@ def test_rankinfo_with_min_2(runtmp): def test_compare_csv(runtmp): - a = utils.get_test_data('lca/classify-by-both.csv') - b = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') + a = utils.get_test_data("lca/classify-by-both.csv") + b = utils.get_test_data("lca/tara-delmont-SuppTable3.csv") - cmd = ['lca', 
'compare_csv', a, b, '-f'] + cmd = ["lca", "compare_csv", a, b, "-f"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 106 distinct lineages, 957 rows' in runtmp.last_result.err - assert 'missing 937 assignments in classify spreadsheet.' in runtmp.last_result.err - assert '20 total assignments, 0 differ between spreadsheets.' in runtmp.last_result.err + assert "loaded 106 distinct lineages, 957 rows" in runtmp.last_result.err + assert "missing 937 assignments in classify spreadsheet." in runtmp.last_result.err + assert ( + "20 total assignments, 0 differ between spreadsheets." in runtmp.last_result.err + ) def test_compare_csv_real(runtmp): - a = utils.get_test_data('lca/tully-genome-sigs.classify.csv') - b = utils.get_test_data('lca/tully-query.delmont-db.sigs.classify.csv') + a = utils.get_test_data("lca/tully-genome-sigs.classify.csv") + b = utils.get_test_data("lca/tully-query.delmont-db.sigs.classify.csv") - cmd = ['lca', 'compare_csv', a, b, '--start-column=3', '-f'] + cmd = ["lca", "compare_csv", a, b, "--start-column=3", "-f"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 87 distinct lineages, 2631 rows' in runtmp.last_result.err - assert 'missing 71 assignments in classify spreadsheet.' in runtmp.last_result.err - assert 'missing 1380 assignments in custom spreadsheet.' in runtmp.last_result.err - assert '(these will not be evaluated any further)' in runtmp.last_result.err - assert '987 total assignments, 889 differ between spreadsheets.' in runtmp.last_result.err - assert '296 are compatible (one lineage is ancestor of another.' in runtmp.last_result.err - assert '593 are incompatible (there is a disagreement in the trees).' in runtmp.last_result.err - assert '164 incompatible at rank superkingdom' in runtmp.last_result.err - assert '255 incompatible at rank phylum' in runtmp.last_result.err - assert '107 incompatible at rank class' in runtmp.last_result.err - assert '54 incompatible at rank order' in runtmp.last_result.err - assert '13 incompatible at rank family' in runtmp.last_result.err - assert '0 incompatible at rank genus' in runtmp.last_result.err - assert '0 incompatible at rank species' in runtmp.last_result.err + assert "loaded 87 distinct lineages, 2631 rows" in runtmp.last_result.err + assert "missing 71 assignments in classify spreadsheet." in runtmp.last_result.err + assert "missing 1380 assignments in custom spreadsheet." in runtmp.last_result.err + assert "(these will not be evaluated any further)" in runtmp.last_result.err + assert ( + "987 total assignments, 889 differ between spreadsheets." + in runtmp.last_result.err + ) + assert ( + "296 are compatible (one lineage is ancestor of another." + in runtmp.last_result.err + ) + assert ( + "593 are incompatible (there is a disagreement in the trees)." 
+ in runtmp.last_result.err + ) + assert "164 incompatible at rank superkingdom" in runtmp.last_result.err + assert "255 incompatible at rank phylum" in runtmp.last_result.err + assert "107 incompatible at rank class" in runtmp.last_result.err + assert "54 incompatible at rank order" in runtmp.last_result.err + assert "13 incompatible at rank family" in runtmp.last_result.err + assert "0 incompatible at rank genus" in runtmp.last_result.err + assert "0 incompatible at rank species" in runtmp.last_result.err def test_incompat_lca_db_ksize_2_fail(runtmp, lca_db_format): # test on gather - create a database with ksize of 25 => fail # because of incompatibility. c = runtmp - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, - '-o', 'test_db.sig') + testdata1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.fa.gz") + c.run_sourmash( + "sketch", "dna", "-p", "k=25,scaled=1000", testdata1, "-o", "test_db.sig" + ) print(c) - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - f'test.lca.{lca_db_format}', 'test_db.sig', - '-k', '25', '--scaled', '10000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + utils.get_test_data( + "lca/delmont-1.csv", + ), + f"test.lca.{lca_db_format}", + "test_db.sig", + "-k", + "25", + "--scaled", + "10000", + "-F", + lca_db_format, + ) print(c) # this should fail: the LCA database has ksize 25, and the query sig has # no compatible ksizes. - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "gather", + utils.get_test_data("lca/TARA_ASE_MAG_00031.sig"), + f"test.lca.{lca_db_format}", + ) err = c.last_result.err print(err) - if lca_db_format == 'sql': + if lca_db_format == "sql": assert "no compatible signatures found in 'test.lca.sql'" in err else: assert "ERROR: cannot use 'test.lca.json' for this query." in err - assert "ksize on this database is 25; this is different from requested ksize of 31" + assert ( + "ksize on this database is 25; this is different from requested ksize of 31" + ) def test_incompat_lca_db_ksize_2_nofail(runtmp, lca_db_format): # test on gather - create a database with ksize of 25, no fail # because of --no-fail-on-empty-databases c = runtmp - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, - '-o', 'test_db.sig') + testdata1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.fa.gz") + c.run_sourmash( + "sketch", "dna", "-p", "k=25,scaled=1000", testdata1, "-o", "test_db.sig" + ) print(c) - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - f'test.lca.{lca_db_format}', 'test_db.sig', - '-k', '25', '--scaled', '10000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + utils.get_test_data( + "lca/delmont-1.csv", + ), + f"test.lca.{lca_db_format}", + "test_db.sig", + "-k", + "25", + "--scaled", + "10000", + "-F", + lca_db_format, + ) print(c) # this should not fail despite mismatched ksize, b/c of --no-fail flag. 
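    # the ksize mismatch is still reported on stderr; only the exit status changes.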
- c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}', '--no-fail-on-empty-database') + c.run_sourmash( + "gather", + utils.get_test_data("lca/TARA_ASE_MAG_00031.sig"), + f"test.lca.{lca_db_format}", + "--no-fail-on-empty-database", + ) err = c.last_result.err print(err) - if lca_db_format == 'sql': + if lca_db_format == "sql": assert "no compatible signatures found in 'test.lca.sql'" in err else: assert "ERROR: cannot use 'test.lca.json' for this query." in err - assert "ksize on this database is 25; this is different from requested ksize of 31" + assert ( + "ksize on this database is 25; this is different from requested ksize of 31" + ) def test_lca_index_empty(runtmp, lca_db_format): c = runtmp # test lca index with an empty taxonomy CSV, followed by a load & gather. - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig63 = load_one_signature(sig63file, ksize=31) # create an empty spreadsheet - with open(c.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(c.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) # index! - c.run_sourmash('lca', 'index', 'empty.csv', 'xxx', - sig2file, sig47file, sig63file, '--scaled', '1000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + "empty.csv", + "xxx", + sig2file, + sig47file, + sig63file, + "--scaled", + "1000", + "-F", + lca_db_format, + ) # can we load and search? - lca_db_filename = c.output(f'xxx.lca.{lca_db_format}') + lca_db_filename = c.output(f"xxx.lca.{lca_db_format}") db, ksize, scaled = lca_utils.load_single_database(lca_db_filename) result = db.best_containment(sig63) @@ -2434,9 +2951,9 @@ def test_lca_index_empty(runtmp, lca_db_format): def test_lca_gather_threshold_1(): # test gather() method, in some detail; see same tests for sbt. - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2468,7 +2985,7 @@ def test_lca_gather_threshold_1(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # check with a threshold -> should be no results. with pytest.raises(ValueError): @@ -2485,7 +3002,7 @@ def test_lca_gather_threshold_1(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # check with a too-high threshold -> should be no results. with pytest.raises(ValueError): @@ -2494,9 +3011,9 @@ def test_lca_gather_threshold_1(): def test_lca_gather_threshold_5(): # test gather() method, in some detail; see same tests for sbt. 
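    # best_containment() returns the single best match; a threshold_bp that
    # cannot be met should raise ValueError rather than return a result.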
- sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2528,7 +3045,7 @@ def test_lca_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # now, check with a threshold_bp that should be meet-able. result = db.best_containment(SourmashSignature(new_mh), threshold_bp=5000) @@ -2536,13 +3053,13 @@ def test_lca_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None def test_gather_multiple_return(): - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2564,18 +3081,22 @@ def test_gather_multiple_return(): def test_lca_db_protein_build(): # test programmatic creation of LCA database with protein sigs in it - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2591,28 +3112,32 @@ def test_lca_db_protein_build(): @utils.in_tempdir def test_lca_db_protein_save_load(c): # test save/load of programmatically created db with protein sigs in it - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'protein' + assert db2.moltype == "protein" # 
check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 - print('XXX', mh_list[0].ksize) - print('YYY', sig1.minhash.ksize) + print("XXX", mh_list[0].ksize) + print("YYY", sig1.minhash.ksize) assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2628,26 +3153,45 @@ def test_lca_db_protein_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with protein sigs c = runtmp - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'protein.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--protein', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"protein.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--protein", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'protein' + assert db2.moltype == "protein" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2664,31 +3208,37 @@ def test_lca_db_protein_command_index(runtmp, lca_db_format): def test_lca_db_protein_command_search(c): # test command-line search/gather of LCA database with protein sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_db_hp_build(): # test programmatic creation of LCA database with hp sigs in it - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + 
"prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='hp') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="hp") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2704,25 +3254,29 @@ def test_lca_db_hp_build(): @utils.in_tempdir def test_lca_db_hp_save_load(c): # test save/load of programmatically created db with hp sigs in it - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='hp') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="hp") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'hp' + assert db2.moltype == "hp" # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2739,26 +3293,45 @@ def test_lca_db_hp_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with hp sigs c = runtmp - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'hp.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--hp', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"hp.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--hp", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'hp' + assert db2.moltype == "hp" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2775,31 +3348,37 @@ def test_lca_db_hp_command_index(runtmp, 
lca_db_format): def test_lca_db_hp_command_search(c): # test command-line search/gather of LCA database with hp sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/hp.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/hp.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_db_dayhoff_build(): # test programmatic creation of LCA database with dayhoff sigs in it - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='dayhoff') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="dayhoff") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2815,25 +3394,29 @@ def test_lca_db_dayhoff_build(): @utils.in_tempdir def test_lca_db_dayhoff_save_load(c): # test save/load of programmatically created db with dayhoff sigs in it - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='dayhoff') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="dayhoff") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'dayhoff' + assert db2.moltype == "dayhoff" # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2850,26 +3433,45 @@ def test_lca_db_dayhoff_command_index(runtmp, lca_db_format): # test command-line 
creation of LCA database with dayhoff sigs c = runtmp - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'dayhoff.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--dayhoff', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"dayhoff.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--dayhoff", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'dayhoff' + assert db2.moltype == "dayhoff" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2886,29 +3488,43 @@ def test_lca_db_dayhoff_command_index(runtmp, lca_db_format): def test_lca_db_dayhoff_command_search(c): # test command-line search/gather of LCA database with dayhoff sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/dayhoff.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/dayhoff.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_index_with_picklist(runtmp, lca_db_format): - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') - - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5", - '-F', lca_db_format) + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + 
"accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) + + runtmp.sourmash( + "lca", + "index", + "empty.csv", + outdb, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + "-F", + lca_db_format, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2923,21 +3539,33 @@ def test_lca_index_with_picklist(runtmp, lca_db_format): siglist = list(sourmash.load_file_as_signatures(outdb)) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_lca_index_with_picklist_exclude(runtmp, lca_db_format): - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') - - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude", - '-F', lca_db_format) + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) + + runtmp.sourmash( + "lca", + "index", + "empty.csv", + outdb, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + "-F", + lca_db_format, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2948,21 +3576,24 @@ def test_lca_index_with_picklist_exclude(runtmp, lca_db_format): siglist = list(sourmash.load_file_as_signatures(outdb)) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_lca_index_select_with_picklist(runtmp, lca_db_format): # check what happens with picklists after index - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '-F', lca_db_format) + runtmp.sourmash( + "lca", "index", "empty.csv", outdb, *gcf_sigs, "-k", "21", "-F", lca_db_format + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2979,21 +3610,24 @@ def test_lca_index_select_with_picklist(runtmp, lca_db_format): siglist = list(idx.signatures()) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): # check what happens with picklists after index - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = 
glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '-F', lca_db_format) + runtmp.sourmash( + "lca", "index", "empty.csv", outdb, *gcf_sigs, "-k", "21", "-F", lca_db_format + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3009,7 +3643,7 @@ def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): siglist = list(idx.signatures()) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_lca_jaccard_ordering(): @@ -3028,10 +3662,10 @@ def test_lca_jaccard_ordering(): def _intersect(x, y): return x.intersection_and_union_size(y)[0] - print('a intersect b:', _intersect(a, b)) - print('a intersect c:', _intersect(a, c)) - print('a jaccard b:', a.jaccard(b)) - print('a jaccard c:', a.jaccard(c)) + print("a intersect b:", _intersect(a, b)) + print("a intersect c:", _intersect(a, c)) + print("a jaccard b:", a.jaccard(b)) + print("a jaccard c:", a.jaccard(c)) assert _intersect(a, b) > _intersect(a, c) assert a.jaccard(b) < a.jaccard(c) @@ -3040,9 +3674,9 @@ def _intersect(x, y): assert a.jaccard(c) > 0.15 # now - make signatures, try out :) - ss_a = sourmash.SourmashSignature(a, name='A') - ss_b = sourmash.SourmashSignature(b, name='B') - ss_c = sourmash.SourmashSignature(c, name='C') + ss_a = sourmash.SourmashSignature(a, name="A") + ss_b = sourmash.SourmashSignature(b, name="B") + ss_c = sourmash.SourmashSignature(c, name="C") db = sourmash.lca.LCA_Database(ksize=31, scaled=2) db.insert(ss_a) @@ -3060,17 +3694,21 @@ def _intersect(x, y): def test_lca_db_protein_save_twice(runtmp, lca_db_format): # test save twice - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) - db.save(runtmp.output('xxx'), format=lca_db_format) + db.save(runtmp.output("xxx"), format=lca_db_format) with pytest.raises(ValueError): - db.save(runtmp.output('xxx'), format=lca_db_format) + db.save(runtmp.output("xxx"), format=lca_db_format) diff --git a/tests/test_lca_db_protocol.py b/tests/test_lca_db_protocol.py index a3fc57b085..eb2f76fe07 100644 --- a/tests/test_lca_db_protocol.py +++ b/tests/test_lca_db_protocol.py @@ -7,26 +7,30 @@ import sourmash from sourmash.tax.tax_utils import MultiLineageDB -from sourmash.lca.lca_db import (LCA_Database, load_single_database) +from sourmash.lca.lca_db import LCA_Database, load_single_database def build_inmem_lca_db(runtmp): # test in-memory LCA_Database - sigfile1 = 
utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) ss1 = sourmash.load_one_signature(sigfile1) ss2 = sourmash.load_one_signature(sigfile2) - lineages_file = utils.get_test_data('prot/gtdb-subset-lineages.csv') + lineages_file = utils.get_test_data("prot/gtdb-subset-lineages.csv") lineages = MultiLineageDB.load([lineages_file]) - db = LCA_Database(ksize=19, scaled=100, moltype='protein') + db = LCA_Database(ksize=19, scaled=100, moltype="protein") - ident1 = ss1.name.split(' ')[0].split('.')[0] + ident1 = ss1.name.split(" ")[0].split(".")[0] assert lineages[ident1] db.insert(ss1, ident=ident1, lineage=lineages[ident1]) - ident2 = ss2.name.split(' ')[0].split('.')[0] + ident2 = ss2.name.split(" ")[0].split(".")[0] assert lineages[ident2] db.insert(ss2, ident=ident2, lineage=lineages[ident2]) @@ -36,9 +40,9 @@ def build_inmem_lca_db(runtmp): def build_json_lca_db(runtmp): # test saved/loaded JSON database db = build_inmem_lca_db(runtmp) - db_out = runtmp.output('protein.lca.json') + db_out = runtmp.output("protein.lca.json") - db.save(db_out, format='json') + db.save(db_out, format="json") x = load_single_database(db_out) db_load = x[0] @@ -49,9 +53,9 @@ def build_json_lca_db(runtmp): def build_sql_lca_db(runtmp): # test saved/loaded SQL database db = build_inmem_lca_db(runtmp) - db_out = runtmp.output('protein.lca.json') + db_out = runtmp.output("protein.lca.json") - db.save(db_out, format='sql') + db.save(db_out, format="sql") x = load_single_database(db_out) db_load = x[0] @@ -59,9 +63,7 @@ def build_sql_lca_db(runtmp): return db_load -@pytest.fixture(params=[build_inmem_lca_db, - build_json_lca_db, - build_sql_lca_db]) +@pytest.fixture(params=[build_inmem_lca_db, build_json_lca_db, build_sql_lca_db]) def lca_db_obj(request, runtmp): build_fn = request.param @@ -77,16 +79,18 @@ def test_get_lineage_assignments(lca_db_obj): x = [] for tup in lineage: - if tup[0] != 'strain' or tup[1]: # ignore empty strain + if tup[0] != "strain" or tup[1]: # ignore empty strain x.append((tup[0], tup[1])) - assert x == [('superkingdom', 'd__Archaea'), - ('phylum', 'p__Crenarchaeota'), - ('class', 'c__Bathyarchaeia'), - ('order', 'o__B26-1'), - ('family', 'f__B26-1'), - ('genus', 'g__B26-1'), - ('species', 's__B26-1 sp001593925'),] + assert x == [ + ("superkingdom", "d__Archaea"), + ("phylum", "p__Crenarchaeota"), + ("class", "c__Bathyarchaeia"), + ("order", "o__B26-1"), + ("family", "f__B26-1"), + ("genus", "g__B26-1"), + ("species", "s__B26-1 sp001593925"), + ] def test_hashvals(lca_db_obj): @@ -102,7 +106,7 @@ def test_get_identifiers_for_hashval(lca_db_obj): assert len(idents) == 1 ident = idents[0] - assert ident == 'GCA_001593925' + assert ident == "GCA_001593925" def test_get_identifiers_for_hashval_2(lca_db_obj): @@ -111,15 +115,15 @@ def test_get_identifiers_for_hashval_2(lca_db_obj): for hashval in lca_db_obj.hashvals: idents = lca_db_obj.get_identifiers_for_hashval(hashval) - #idents = list(idents) + # idents = list(idents) all_idents.update(idents) all_idents = list(all_idents) print(all_idents) assert len(all_idents) == 2 - assert 'GCA_001593925' in all_idents - assert 'GCA_001593935' in all_idents + assert "GCA_001593925" in all_idents + assert "GCA_001593935" in 
all_idents def test_downsample_scaled(lca_db_obj): diff --git a/tests/test_lca_functions.py b/tests/test_lca_functions.py index 0674df80df..9add0df47f 100644 --- a/tests/test_lca_functions.py +++ b/tests/test_lca_functions.py @@ -4,14 +4,21 @@ import pytest from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, - taxlist, count_lca_for_assignments, - zip_lineage, display_lineage, - make_lineage, is_lineage_match, - pop_to_rank) - - -class FakeLCA_Database(object): +from sourmash.lca.lca_utils import ( + LineagePair, + build_tree, + find_lca, + taxlist, + count_lca_for_assignments, + zip_lineage, + display_lineage, + make_lineage, + is_lineage_match, + pop_to_rank, +) + + +class FakeLCA_Database: def __init__(self): self._assignments = {} @@ -26,139 +33,194 @@ def get_lineage_assignments(self, hashval): def test_taxlist_1(): - assert list(taxlist()) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain'] + assert list(taxlist()) == [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ] def test_taxlist_2(): - assert list(taxlist(include_strain=False)) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] + assert list(taxlist(include_strain=False)) == [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] def test_zip_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] - assert zip_lineage(x) == ['a', 'b', '', '', '', '', '', ''] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] + assert zip_lineage(x) == ["a", "b", "", "", "", "", "", ""] def test_zip_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] - assert zip_lineage(x, truncate_empty=True) == ['a', 'b'] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] + assert zip_lineage(x, truncate_empty=True) == ["a", "b"] def test_zip_lineage_3(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] - assert zip_lineage(x) == ['a', '', 'c', '', '', '', '', ''] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] + assert zip_lineage(x) == ["a", "", "c", "", "", "", "", ""] def test_zip_lineage_3_truncate(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] - assert zip_lineage(x, truncate_empty=True) == ['a', '', 'c'] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] + assert zip_lineage(x, truncate_empty=True) == ["a", "", "c"] def test_zip_lineage_4(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ] + x = [LineagePair("superkingdom", "a"), LineagePair("class", "c")] with pytest.raises(ValueError) as e: zip_lineage(x) - assert 'incomplete lineage at phylum - is class instead' in str(e.value) + assert "incomplete lineage at phylum - is class instead" in str(e.value) def test_display_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] assert display_lineage(x) == "a;b", display_lineage(x) def test_display_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] assert display_lineage(x) == "a;;c", 
display_lineage(x) def test_build_tree(): - tree = build_tree([[LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2')]]) - assert tree == { LineagePair('rank1', 'name1'): - { LineagePair('rank2', 'name2') : {}} } + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) + assert tree == {LineagePair("rank1", "name1"): {LineagePair("rank2", "name2"): {}}} def test_build_tree_2(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) - assert tree == { LineagePair('rank1', 'name1'): { LineagePair('rank2', 'name2a') : {}, - LineagePair('rank2', 'name2b') : {}} } + assert tree == { + LineagePair("rank1", "name1"): { + LineagePair("rank2", "name2a"): {}, + LineagePair("rank2", "name2b"): {}, + } + } -def test_build_tree_3(): # empty 'rank2' name - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', '')]]) - assert tree == { LineagePair('rank1', 'name1'): {} } +def test_build_tree_3(): # empty 'rank2' name + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "")]]) + assert tree == {LineagePair("rank1", "name1"): {}} def test_build_tree_4(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - ]) - - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ], tree) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + ] + ) + + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ], + tree, + ) + + assert tree == { + LineagePair("rank1", "name1"): { + LineagePair("rank2", "name2a"): {}, + LineagePair("rank2", "name2b"): {}, + } + } - assert tree == { LineagePair('rank1', 'name1'): { LineagePair('rank2', 'name2a') : {}, - LineagePair('rank2', 'name2b') : {}} } def test_build_tree_5(): with pytest.raises(ValueError): - tree = build_tree([]) + build_tree([]) def test_find_lca(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) lca = find_lca(tree) - assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) + assert lca == ( + ( + LineagePair("rank1", "name1"), + LineagePair("rank2", "name2"), + ), + 0, + ) def test_find_lca_2(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) lca = find_lca(tree) - assert lca == ((LineagePair('rank1', 'name1'),), 2) + assert lca == ((LineagePair("rank1", "name1"),), 2) def test_find_lca_3(): - lin1 = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b') + lin1 = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b") tree = build_tree([lin1, lin2]) lca, reason = find_lca(tree) - assert lca == lin1 # find most specific leaf node + assert lca == lin1 # find most specific leaf node def test_gather_assignments_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') + lin = 
lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments([hashval], [db]) print(assignments) - assert assignments[hashval] == set([ lin ]) + assert assignments[hashval] == set([lin]) def test_gather_assignments_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments([hashval], [db]) print(assignments) - assert assignments[hashval] == set([ lin, lin2 ]) + assert assignments[hashval] == set([lin, lin2]) def test_gather_assignments_3(): @@ -166,27 +228,27 @@ def test_gather_assignments_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments([hashval, hashval2], [db]) print(assignments) - assert assignments[hashval] == set([ lin, lin2 ]) - assert assignments[hashval2] == set([ lin ]) + assert assignments[hashval] == set([lin, lin2]) + assert assignments[hashval2] == set([lin]) def test_count_lca_for_assignments_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') + lin = lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments([hashval], [db]) counts = count_lca_for_assignments(assignments) @@ -199,11 +261,11 @@ def test_count_lca_for_assignments_1(): def test_count_lca_for_assignments_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments([hashval], [db]) counts = count_lca_for_assignments(assignments) @@ -213,7 +275,7 @@ def test_count_lca_for_assignments_2(): assert counts[lin2] == 0 assert len(counts) == 1 - lca_lin = lca_utils.make_lineage('a;b') + lca_lin = lca_utils.make_lineage("a;b") assert counts[lca_lin] == 1 @@ -222,12 +284,12 @@ def test_count_lca_for_assignments_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = 
lca_utils.gather_assignments([hashval, hashval2], [db]) counts = count_lca_for_assignments(assignments) @@ -237,20 +299,20 @@ def test_count_lca_for_assignments_3(): assert counts[lin] == 1 assert counts[lin2] == 0 - lca_lin = lca_utils.make_lineage('a;b') + lca_lin = lca_utils.make_lineage("a;b") assert counts[lca_lin] == 1 def test_count_lca_for_assignments_abund_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 3 - lin = lca_utils.make_lineage('a;b;c') + lin = lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts.keys(), [db]) counts = count_lca_for_assignments(assignments, hashval_counts) @@ -263,14 +325,14 @@ def test_count_lca_for_assignments_abund_1(): def test_count_lca_for_assignments_abund_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 3 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) @@ -280,8 +342,8 @@ def test_count_lca_for_assignments_abund_2(): assert counts[lin2] == 0 assert len(counts) == 1 - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 3 # yes! + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 3 # yes! def test_count_lca_for_assignments_abund_3(): @@ -289,27 +351,28 @@ def test_count_lca_for_assignments_abund_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 2 - assert counts[lin] == 5 # makes sense - assert counts[lin2] == 0 # makes sense + assert counts[lin] == 5 # makes sense + assert counts[lin2] == 0 # makes sense + + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 2 # yes! - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 2 # yes! 
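# ---------------------------------------------------------------------
# A minimal, self-contained sketch of the counting rule the tests above
# pin down -- illustrative only, NOT sourmash's implementation. Lineages
# are simplified here to plain tuples of rank names instead of
# LineagePair tuples. Each hashval's abundance is credited to the LCA of
# its assigned lineages, where the LCA is found by descending a trie of
# the lineages while the path is unambiguous -- so "a;b;d" plus
# "a;b;d;e" resolves to the more specific "a;b;d;e", matching
# test_find_lca_3 above.
from collections import Counter


def sketch_lca(lineages):
    # build a trie of all the lineages ...
    tree = {}
    for lin in lineages:
        node = tree
        for rank_name in lin:
            node = node.setdefault(rank_name, {})
    # ... then walk down while exactly one child exists
    node, path = tree, []
    while len(node) == 1:
        rank_name, node = next(iter(node.items()))
        path.append(rank_name)
    return tuple(path)


def sketch_count_lca(assignments, hashval_counts):
    counts = Counter()
    for hashval, lineages in assignments.items():
        counts[sketch_lca(lineages)] += hashval_counts[hashval]
    return counts


# mirrors test_count_lca_for_assignments_abund_3 above:
_counts = sketch_count_lca(
    {12345678: [("a", "b", "c"), ("a", "b", "d")], 87654321: [("a", "b", "c")]},
    {12345678: 2, 87654321: 5},
)
assert _counts[("a", "b", "c")] == 5  # unambiguous hashval gets full credit
assert _counts[("a", "b")] == 2  # LCA of the two conflicting lineages
# ---------------------------------------------------------------------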
def test_count_lca_for_assignments_abund_4(): # test basic mechanics of gather_assignments function with three lineages @@ -317,112 +380,113 @@ def test_count_lca_for_assignments_abund_4(): hashval = 12345678 hashval2 = 87654321 hashval3 = 34567891 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 hashval_counts[hashval3] = 3 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') - lin3 = lca_utils.make_lineage('a;b;d;e') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") + lin3 = lca_utils.make_lineage("a;b;d;e") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) # lca: a;b - db._set_lineage_assignment(hashval2, set([ lin ])) # lca: a;b;c - db._set_lineage_assignment(hashval3, set([ lin2, lin3 ])) # a;b;d;e + db._set_lineage_assignment(hashval, set([lin, lin2])) # lca: a;b + db._set_lineage_assignment(hashval2, set([lin])) # lca: a;b;c + db._set_lineage_assignment(hashval3, set([lin2, lin3])) # a;b;d;e assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 3 - assert counts[lin] == 5 # makes sense b/c hashval2 - assert counts[lin2] == 0 # a;b;d (lin2) + a;b;d;e (lin3) -->a;b;d;e (lin3) only - assert counts[lin3] == 3 # hashval3 + assert counts[lin] == 5 # makes sense b/c hashval2 + assert counts[lin2] == 0 # a;b;d (lin2) + a;b;d;e (lin3) -->a;b;d;e (lin3) only + assert counts[lin3] == 3 # hashval3 + + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 2 # yes, b/c hashval - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 2 # yes, b/c hashval def test_count_lca_for_assignments_abund_5(): # test basic mechanics of gather_assignments function with two lineages # and two hashvals when linages match but one has lower taxo detail hashval = 12345678 hashval2 = 87654321 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 - lin = lca_utils.make_lineage('a;b;d') - lin2 = lca_utils.make_lineage('a;b;d;e') + lin = lca_utils.make_lineage("a;b;d") + lin2 = lca_utils.make_lineage("a;b;d;e") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 2 - assert counts[lin] == 5 # makes sense - assert counts[lin2] == 2 # lin+lin2 yield just lin2 + assert counts[lin] == 5 # makes sense + assert counts[lin2] == 2 # lin+lin2 yield just lin2 def test_is_lineage_match_1(): # basic behavior: match at order and above, but not at family or below. 
- lin1 = make_lineage('d__a;p__b;c__c;o__d;f__e') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d;f__e") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert is_lineage_match(lin1, lin2, 'superkingdom') - assert is_lineage_match(lin1, lin2, 'phylum') - assert is_lineage_match(lin1, lin2, 'class') - assert is_lineage_match(lin1, lin2, 'order') - assert not is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert is_lineage_match(lin1, lin2, "superkingdom") + assert is_lineage_match(lin1, lin2, "phylum") + assert is_lineage_match(lin1, lin2, "class") + assert is_lineage_match(lin1, lin2, "order") + assert not is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_is_lineage_match_2(): # match at family, and above, levels; no genus or species to match - lin1 = make_lineage('d__a;p__b;c__c;o__d;f__f') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d;f__f") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert is_lineage_match(lin1, lin2, 'superkingdom') - assert is_lineage_match(lin1, lin2, 'phylum') - assert is_lineage_match(lin1, lin2, 'class') - assert is_lineage_match(lin1, lin2, 'order') - assert is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert is_lineage_match(lin1, lin2, "superkingdom") + assert is_lineage_match(lin1, lin2, "phylum") + assert is_lineage_match(lin1, lin2, "class") + assert is_lineage_match(lin1, lin2, "order") + assert is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_is_lineage_match_3(): # one lineage is empty - lin1 = make_lineage('') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert not is_lineage_match(lin1, lin2, 'superkingdom') - assert not is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'order') - assert not is_lineage_match(lin1, lin2, 'class') - assert not is_lineage_match(lin1, lin2, 'phylum') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert not is_lineage_match(lin1, lin2, "superkingdom") + assert not is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "order") + assert not is_lineage_match(lin1, lin2, "class") + assert not is_lineage_match(lin1, lin2, "phylum") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_pop_to_rank_1(): # basic behavior - pop to order? - lin1 = make_lineage('d__a;p__b;c__c;o__d') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") print(lin1) - print(pop_to_rank(lin2, 'order')) - assert pop_to_rank(lin2, 'order') == lin1 + print(pop_to_rank(lin2, "order")) + assert pop_to_rank(lin2, "order") == lin1 def test_pop_to_rank_2(): # what if we're already above rank? 
- lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - print(pop_to_rank(lin2, 'species')) - assert pop_to_rank(lin2, 'species') == lin2 + print(pop_to_rank(lin2, "species")) + assert pop_to_rank(lin2, "species") == lin2 diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 074d72d705..138ae0f829 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -12,13 +12,13 @@ def test_generate_manifest(): # test basic manifest-generating functionality. - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -28,9 +28,9 @@ def test_generate_manifest(): assert len(manifest) == len(rows) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list for sig in siglist: assert sig in manifest @@ -38,13 +38,13 @@ def test_generate_manifest(): def test_manifest_operations(): # test basic manifest operations - += - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -53,24 +53,24 @@ def test_manifest_operations(): manifest2 = index.CollectionManifest(rows) manifest += manifest2 - assert len(manifest) == 2*len(rows) + assert len(manifest) == 2 * len(rows) assert len(manifest) == 4 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_manifest_operations_fail(): # should not be able to add a manifest to itself - not only makes # no sense, but it means you're modifying a generator in place, sometimes. 
- protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -82,13 +82,13 @@ def test_manifest_operations_fail(): def test_manifest_to_picklist(): # test manifest/picklist interaction basics - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -103,7 +103,7 @@ def test_manifest_to_picklist(): def test_manifest_compare(): # test saving and loading manifests - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) manifest = loader.manifest @@ -124,7 +124,7 @@ def test_manifest_compare(): # not equal / diff values rows = list(manifest.rows) rows[0] = dict(rows[0]) - rows[0]['internal_location'] += '.foo' + rows[0]["internal_location"] += ".foo" short_mf = index.CollectionManifest(rows) assert short_mf != manifest @@ -132,13 +132,13 @@ def test_manifest_compare(): def test_save_load_manifest(): # test saving and loading manifests - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -179,7 +179,7 @@ def test_save_load_manifest(): # not equal / diff values rows = list(manifest.rows) rows[0] = dict(rows[0]) - rows[0]['internal_location'] += '.foo' + rows[0]["internal_location"] += ".foo" short_mf = index.CollectionManifest(rows) assert short_mf != manifest @@ -189,8 +189,7 @@ def test_manifest_to_picklist_bug(runtmp): # this tests a fun combination of things that led to a bug. # tl;dr we only want to iterate once across a generator... # ref #2762 - c = runtmp - all_zip = utils.get_test_data('prot/all.zip') + all_zip = utils.get_test_data("prot/all.zip") idx = sourmash_args.load_file_as_index(all_zip) assert len(idx) == 8 @@ -201,7 +200,7 @@ def test_manifest_to_picklist_bug(runtmp): def filter_fn(row): # match? 
keep = False - if "09a0869" in row['md5']: + if "09a0869" in row["md5"]: keep = True return keep @@ -219,17 +218,17 @@ def filter_fn(row): def test_generate_manifest_iterate_once(): # we should only iterate across manifest rows once - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): siglist.append(sig) # build generator function => will not allow iteration twice def genfn(): - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) yield row @@ -238,9 +237,9 @@ def genfn(): assert len(manifest) == 2 assert len(manifest._md5_set) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list for sig in siglist: assert sig in manifest diff --git a/tests/test_manifest_protocol.py b/tests/test_manifest_protocol.py index 5b9ea003d5..d36e8a309c 100644 --- a/tests/test_manifest_protocol.py +++ b/tests/test_manifest_protocol.py @@ -13,7 +13,7 @@ def build_simple_manifest(runtmp): # load and return the manifest from prot/all.zip - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(filename) mf = idx.manifest assert len(mf) == 8 @@ -22,29 +22,29 @@ def build_simple_manifest(runtmp): def build_sqlite_manifest(runtmp): # return the manifest from prot/all.zip - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(filename) mf = idx.manifest # build sqlite manifest from this 'un - mfdb = runtmp.output('test.sqlmf') + mfdb = runtmp.output("test.sqlmf") return SqliteCollectionManifest.load_from_manifest(mf, dbfile=mfdb) - + def save_load_manifest(runtmp): # save/load the manifest from a CSV. 
mf = build_simple_manifest(runtmp) - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") mf.write_to_filename(mf_csv) load_mf = CollectionManifest.load_from_filename(mf_csv) return load_mf - -@pytest.fixture(params=[build_simple_manifest, - save_load_manifest, - build_sqlite_manifest]) + +@pytest.fixture( + params=[build_simple_manifest, save_load_manifest, build_sqlite_manifest] +) def manifest_obj(request, runtmp): build_fn = request.param @@ -55,6 +55,7 @@ def manifest_obj(request, runtmp): ### generic CollectionManifeset tests go here ### + def test_manifest_len(manifest_obj): # check that 'len' works assert len(manifest_obj) == 8 @@ -78,39 +79,38 @@ def test_manifest_bool(manifest_obj): def test_make_manifest_row(manifest_obj): # build a manifest row from a signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) - row = manifest_obj.make_manifest_row(ss, 'foo', include_signature=False) - assert not 'signature' in row - assert row['internal_location'] == 'foo' - - assert row['md5'] == ss.md5sum() - assert row['md5short'] == ss.md5sum()[:8] - assert row['ksize'] == 31 - assert row['moltype'] == 'DNA' - assert row['num'] == 0 - assert row['scaled'] == 1000 - assert row['n_hashes'] == len(ss.minhash) - assert not row['with_abundance'] - assert row['name'] == ss.name - assert row['filename'] == ss.filename - - + row = manifest_obj.make_manifest_row(ss, "foo", include_signature=False) + assert "signature" not in row + assert row["internal_location"] == "foo" + + assert row["md5"] == ss.md5sum() + assert row["md5short"] == ss.md5sum()[:8] + assert row["ksize"] == 31 + assert row["moltype"] == "DNA" + assert row["num"] == 0 + assert row["scaled"] == 1000 + assert row["n_hashes"] == len(ss.minhash) + assert not row["with_abundance"] + assert row["name"] == ss.name + assert row["filename"] == ss.filename + + def test_manifest_create_manifest(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_row = list(new_mf.rows)[0] - - row = manifest_obj.make_manifest_row(ss, 'fiz', include_signature=False) + + row = manifest_obj.make_manifest_row(ss, "fiz", include_signature=False) required_keys = BaseCollectionManifest.required_keys for k in required_keys: @@ -119,32 +119,37 @@ def yield_sigs(): def test_manifest_select_to_manifest(manifest_obj): # do some light testing of 'select_to_manifest' - new_mf = manifest_obj.select_to_manifest(moltype='DNA') + new_mf = manifest_obj.select_to_manifest(moltype="DNA") assert len(new_mf) == 2 def test_manifest_locations(manifest_obj): # check the 'locations' method - locs = set(['dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'dna-sig.noext', - 'dna-sig.sig.gz'] - ) + locs = set( + [ + "dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + 
"hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "dna-sig.noext", + "dna-sig.sig.gz", + ] + ) assert set(manifest_obj.locations()) == locs def test_manifest___contains__(manifest_obj): # check the 'in' operator - sigfile = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sigfile = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) ss = sourmash.load_one_signature(sigfile) assert ss in manifest_obj - sigfile2 = utils.get_test_data('2.fa.sig') + sigfile2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sigfile2, ksize=31) assert ss2 not in manifest_obj @@ -159,36 +164,37 @@ def test_manifest_to_picklist(manifest_obj): def test_manifest_filter_rows(manifest_obj): # test filter_rows - filter_fn = lambda x: 'OS223' in x['name'] + def filter_fn(x): + return "OS223" in x["name"] mf = manifest_obj.filter_rows(filter_fn) assert len(mf) == 1 row = list(mf.rows)[0] - assert row['name'] == 'NC_011663.1 Shewanella baltica OS223, complete genome' + assert row["name"] == "NC_011663.1 Shewanella baltica OS223, complete genome" def test_manifest_filter_cols(manifest_obj): # test filter_rows - col_filter_fn = lambda x: 'OS223' in x[0] + def col_filter_fn(x): + return "OS223" in x[0] - mf = manifest_obj.filter_on_columns(col_filter_fn, ['name']) + mf = manifest_obj.filter_on_columns(col_filter_fn, ["name"]) assert len(mf) == 1 row = list(mf.rows)[0] - assert row['name'] == 'NC_011663.1 Shewanella baltica OS223, complete genome' + assert row["name"] == "NC_011663.1 Shewanella baltica OS223, complete genome" def test_manifest_iadd(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_mf += manifest_obj @@ -197,14 +203,13 @@ def yield_sigs(): def test_manifest_add(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_mf2 = new_mf + manifest_obj diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 05802c0bff..474f1e231a 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -49,7 +49,7 @@ hash_murmur, _get_scaled_for_max_hash, _get_max_hash_for_scaled, - translate_codon + translate_codon, ) from sourmash import signature @@ -79,18 +79,18 @@ def _kmers_from_all_coding_frames(sequence, ksize): for frame in (0, 1, 2): # get forward k-mers for start in range(0, len(sequence) - ksize + 1 - frame, 3): - kmer = sequence[start + frame:start + frame + ksize] + kmer = sequence[start + frame : start + frame + ksize] yield kmer # get rc k-mers for start in range(0, len(seqrc) - ksize + 1 - frame, 3): - kmer = seqrc[start + frame:start + frame + ksize] + kmer = seqrc[start + frame : start + frame + ksize] yield kmer 
def _hash_fwd_only(mh_translate, seq): "Return the first hashval only, for coding frame +1." - assert len(seq) == mh_translate.ksize*3 + assert len(seq) == mh_translate.ksize * 3 xx = mh_translate.seq_to_hashes(seq)[0] return xx @@ -98,12 +98,12 @@ def _hash_fwd_only(mh_translate, seq): def test_basic_dna(track_abundance): # verify that MHs of size 1 stay size 1, & act properly as bottom sketches. mh = MinHash(1, 4, track_abundance=track_abundance) - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") a = mh.hashes - mh.add_sequence('GCAT') # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC b = mh.hashes print(a, b) @@ -117,7 +117,7 @@ def test_div_zero(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) mh2 = mh.copy_and_clear() - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") assert mh.similarity(mh2) == 0 assert mh2.similarity(mh) == 0 @@ -127,7 +127,7 @@ def test_div_zero_contained(track_abundance): mh = MinHash(0, 4, scaled=1, track_abundance=track_abundance) mh2 = mh.copy_and_clear() - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") assert mh.contained_by(mh2) == 0 assert mh2.contained_by(mh) == 0 @@ -137,8 +137,8 @@ def test_contained_requires_scaled(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.contained_by(mh1) @@ -152,8 +152,8 @@ def test_contained_requires_scaled_2(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.max_containment(mh1) @@ -167,8 +167,8 @@ def test_contained_requires_scaled_3(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.avg_containment(mh1) @@ -179,36 +179,39 @@ def test_contained_requires_scaled_3(track_abundance): def test_bytes_dna(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) - mh.add_sequence('ATGC') - mh.add_sequence(b'ATGC') - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") + mh.add_sequence(b"ATGC") + mh.add_sequence("ATGC") a = mh.hashes - mh.add_sequence('GCAT') # this will not get added; hash > ATGC - mh.add_sequence(b'GCAT') # this will not get added; hash > ATGC - mh.add_sequence('GCAT') # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC + mh.add_sequence(b"GCAT") # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC b = mh.hashes print(a, b) assert list(a) == list(b) assert len(b) == 1 + def test_add_long_seqs_force(): # Test for (All kmers are invalid) - mh = sourmash.minhash.MinHash(n = 0, ksize=21, scaled =10, seed = 42) + mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=10, seed=42) seq = "ACGTN" * 100000 - hashes = mh.seq_to_hashes(seq, force = True) - assert(len(mh.hashes) == 0) + mh.seq_to_hashes(seq, force=True) + assert len(mh.hashes) == 0 def test_seq_to_hashes(track_abundance): - mh = 
sourmash.minhash.MinHash(n=0, ksize=21, scaled=1, track_abundance=track_abundance) + mh = sourmash.minhash.MinHash( + n=0, ksize=21, scaled=1, track_abundance=track_abundance + ) seq = "ATGAGAGACGATAGACAGATGACC" mh.add_sequence(seq) golden_hashes = mh.hashes - + # New seq to hashes without adding to the sketch new_hashes = mh.seq_to_hashes(seq) @@ -216,7 +219,14 @@ def test_seq_to_hashes(track_abundance): def test_seq_to_hashes_protein_1(track_abundance, dayhoff): - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) prot_seq = "AGYYG" mh.add_protein(prot_seq) @@ -224,16 +234,19 @@ def test_seq_to_hashes_protein_1(track_abundance, dayhoff): golden_hashes = mh.hashes # New seq to hashes without adding to the sketch - new_hashes = mh.seq_to_hashes(prot_seq, is_protein = True) + new_hashes = mh.seq_to_hashes(prot_seq, is_protein=True) assert set(golden_hashes) == set(new_hashes) + def test_seq_to_hashes_protein_2(track_abundance): - mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1, track_abundance=track_abundance) + mh = sourmash.minhash.MinHash( + n=0, ksize=21, scaled=1, track_abundance=track_abundance + ) seq = "ATGAGAGACGATAGACAGATGACC" with pytest.raises(ValueError): - mh.seq_to_hashes(seq, is_protein = True) + mh.seq_to_hashes(seq, is_protein=True) def test_seq_to_hashes_translated(track_abundance): @@ -252,7 +265,7 @@ def test_seq_to_hashes_translated(track_abundance): def test_seq_to_hashes_bad_kmers_as_zeroes_1(): mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1) seq = "ATGAGAGACGATAGACAGATGACN" - + # New seq to hashes without adding to the sketch hashes = mh.seq_to_hashes(seq, force=True, bad_kmers_as_zeroes=True) @@ -262,54 +275,69 @@ def test_seq_to_hashes_bad_kmers_as_zeroes_1(): def test_seq_to_hashes_bad_kmers_as_zeroes_2(): mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1) seq = "ATGAGAGACGATAGACAGATGACN" - + with pytest.raises(ValueError): - hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True) + mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True) def test_seq_to_hashes_translated_short(): - mh = MinHash(0, 2, is_protein=True, dayhoff=True, hp=False, scaled = 1) + mh = MinHash(0, 2, is_protein=True, dayhoff=True, hp=False, scaled=1) hashes = mh.seq_to_hashes("ACTGA") - assert(len(hashes) == 0) + assert len(hashes) == 0 def test_bytes_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, - track_abundance=track_abundance) - - expected_moltype = 'protein' + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) + + expected_moltype = "protein" if dayhoff: - expected_moltype = 'dayhoff' + expected_moltype = "dayhoff" assert mh.moltype == expected_moltype - mh.add_protein('AGYYG') - mh.add_protein('AGYYG') - mh.add_protein(b'AGYYG') + mh.add_protein("AGYYG") + mh.add_protein("AGYYG") + mh.add_protein(b"AGYYG") assert len(mh.hashes) == 4 def test_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) - mh.add_protein('AGYYG') + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) + mh.add_protein("AGYYG") assert len(mh.hashes) == 4 def 
test_bytes_protein_hp(track_abundance, hp): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance) - expected_moltype = 'protein' + mh = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance + ) + expected_moltype = "protein" if hp: - expected_moltype = 'hp' + expected_moltype = "hp" assert mh.moltype == expected_moltype - mh.add_protein('AGYYG') - mh.add_protein(u'AGYYG') - mh.add_protein(b'AGYYG') + mh.add_protein("AGYYG") + mh.add_protein("AGYYG") + mh.add_protein(b"AGYYG") if hp: assert len(mh.hashes) == 1 @@ -319,8 +347,10 @@ def test_bytes_protein_hp(track_abundance, hp): def test_protein_hp(track_abundance, hp): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance) - mh.add_protein('AGYYG') + mh = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance + ) + mh.add_protein("AGYYG") if hp: assert len(mh.hashes) == 1 @@ -330,8 +360,8 @@ def test_protein_hp(track_abundance, hp): def test_module_translate_codon(track_abundance): # Ensure that translation occurs properly - module level function tests - assert "S" == translate_codon('TCT') - assert "S" == translate_codon('TC') + assert "S" == translate_codon("TCT") + assert "S" == translate_codon("TC") assert "X" == translate_codon("T") with pytest.raises(ValueError): @@ -341,14 +371,15 @@ def test_module_translate_codon(track_abundance): def test_dayhoff(track_abundance): # verify that we can hash to dayhoff-encoded protein/aa sequences - mh_dayhoff = MinHash(10, 2, is_protein=True, - dayhoff=True, hp=False, track_abundance=track_abundance) - mh_dayhoff.add_sequence('ACTGAC') + mh_dayhoff = MinHash( + 10, 2, is_protein=True, dayhoff=True, hp=False, track_abundance=track_abundance + ) + mh_dayhoff.add_sequence("ACTGAC") assert len(mh_dayhoff.hashes) == 2 # verify that dayhoff-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 2, is_protein=True, track_abundance=track_abundance) - mh_protein.add_sequence('ACTGAC') + mh_protein.add_sequence("ACTGAC") assert len(mh_protein.hashes) == 2 print(mh_protein.hashes) @@ -360,39 +391,40 @@ def test_dayhoff_2(track_abundance): mh = MinHash(0, 7, scaled=1, dayhoff=True, track_abundance=1) # first, check protein -> dayhoff hashes via minhash - mh.add_protein('CADHIFC') + mh.add_protein("CADHIFC") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('abcdefa') + assert hashval == hash_murmur("abcdefa") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('CADHIFC', is_protein=True)) + hashes = list(mh.seq_to_hashes("CADHIFC", is_protein=True)) assert hashval == hashes[0] # do we handle stop codons properly? 
mh = mh.copy_and_clear() - mh.add_protein('CADHIF*') + mh.add_protein("CADHIF*") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('abcdef*') + assert hashval == hash_murmur("abcdef*") # again, check seq_to_hashes - hashes = list(mh.seq_to_hashes('CADHIF*', is_protein=True)) + hashes = list(mh.seq_to_hashes("CADHIF*", is_protein=True)) assert hashval == hashes[0] def test_hp(track_abundance): # verify that we can hash to hp-encoded protein/aa sequences - mh_hp = MinHash(10, 2, is_protein=True, - dayhoff=False, hp=True, track_abundance=track_abundance) - assert mh_hp.moltype == 'hp' + mh_hp = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=True, track_abundance=track_abundance + ) + assert mh_hp.moltype == "hp" - mh_hp.add_sequence('ACTGAC') + mh_hp.add_sequence("ACTGAC") assert len(mh_hp.hashes) == 2 # verify that hp-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 2, is_protein=True, track_abundance=track_abundance) - mh_protein.add_sequence('ACTGAC') + mh_protein.add_sequence("ACTGAC") assert len(mh_protein.hashes) == 2 assert mh_protein.hashes != mh_hp.hashes @@ -401,30 +433,30 @@ def test_hp(track_abundance): def test_hp_2(track_abundance): mh = MinHash(0, 3, scaled=1, hp=True, track_abundance=track_abundance) - mh.add_protein('ANA') + mh.add_protein("ANA") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('hph') + assert hashval == hash_murmur("hph") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('ANA', is_protein=True)) + hashes = list(mh.seq_to_hashes("ANA", is_protein=True)) assert hashval == hashes[0] mh = mh.copy_and_clear() - mh.add_protein('AN*') + mh.add_protein("AN*") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('hp*') + assert hashval == hash_murmur("hp*") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('AN*', is_protein=True)) + hashes = list(mh.seq_to_hashes("AN*", is_protein=True)) assert hashval == hashes[0] def test_protein_short(track_abundance): # verify that we can hash protein/aa sequences mh = MinHash(10, 9, is_protein=True, track_abundance=track_abundance) - mh.add_protein('AG') + mh.add_protein("AG") assert len(mh.hashes) == 0, mh.hashes @@ -436,14 +468,14 @@ def test_size_limit(track_abundance): mh.add_hash(20) mh.add_hash(30) assert list(sorted(mh.hashes)) == [10, 20, 30] - mh.add_hash(5) # -> should push 30 off end + mh.add_hash(5) # -> should push 30 off end assert list(sorted(mh.hashes)) == [5, 10, 20] def test_scaled(track_abundance): # test behavior with scaled scaled = _get_scaled_for_max_hash(35) - print('XX', scaled, _get_max_hash_for_scaled(scaled)) + print("XX", scaled, _get_max_hash_for_scaled(scaled)) mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled) assert mh._max_hash == 35 @@ -461,11 +493,11 @@ def test_scaled(track_abundance): def test_no_scaled(track_abundance): # no 'scaled', num=0 - should fail with pytest.raises(ValueError): - mh = MinHash(0, 4, track_abundance=track_abundance) + MinHash(0, 4, track_abundance=track_abundance) def test_max_hash_conversion(): - SCALED=100000 + SCALED = 100000 max_hash = _get_max_hash_for_scaled(SCALED) new_scaled = _get_scaled_for_max_hash(max_hash) assert new_scaled == SCALED @@ -481,15 +513,15 @@ def test_max_hash_and_scaled_zero(): def test_max_hash_and_scaled_error(track_abundance): # test behavior when supplying both max_hash and scaled with pytest.raises(ValueError): - mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35, - 
scaled=5) + MinHash(0, 4, track_abundance=track_abundance, max_hash=35, scaled=5) def test_max_hash_cannot_limit(track_abundance): # make sure you can't set both n and scaled. with pytest.raises(ValueError): - mh = MinHash(2, 4, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(1)) + MinHash( + 2, 4, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(1) + ) def test_no_downsample_scaled_if_n(track_abundance): @@ -498,13 +530,13 @@ def test_no_downsample_scaled_if_n(track_abundance): with pytest.raises(ValueError) as excinfo: mh.downsample(scaled=100000000) - assert 'cannot downsample a num MinHash using scaled' in str(excinfo.value) + assert "cannot downsample a num MinHash using scaled" in str(excinfo.value) def test_scaled_num_both(track_abundance): # make sure you can't set both max_n and scaled. with pytest.raises(ValueError): - mh = MinHash(2, 4, track_abundance=track_abundance, scaled=2) + MinHash(2, 4, track_abundance=track_abundance, scaled=2) def test_mh_jaccard_similarity(): @@ -514,7 +546,7 @@ def test_mh_jaccard_similarity(): a.add_many([1, 3, 5, 8]) b.add_many([1, 3, 5, 6, 8, 10]) - assert a.similarity(b) == 4. / 6. + assert a.similarity(b) == 4.0 / 6.0 def test_mh_similarity_downsample_jaccard_value(): @@ -526,10 +558,10 @@ def test_mh_similarity_downsample_jaccard_value(): b = MinHash(0, 20, scaled=scaled100, track_abundance=False) a.add_many([1, 3, 5, 8, 70]) - b.add_many([1, 3, 5, 6, 8, 10, 70 ]) + b.add_many([1, 3, 5, 6, 8, 10, 70]) # the hash=70 will be truncated by downsampling - assert a.similarity(b, downsample=True) == 4. / 6. + assert a.similarity(b, downsample=True) == 4.0 / 6.0 def test_mh_angular_similarity(): @@ -539,11 +571,11 @@ def test_mh_angular_similarity(): # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity) a = MinHash(0, 20, scaled=scaled50, track_abundance=True) b = MinHash(0, 20, scaled=scaled50, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2}) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}) cos_sim = 0.9356 - angular_sim = 1 - 2*math.acos(cos_sim) / math.pi + angular_sim = 1 - 2 * math.acos(cos_sim) / math.pi assert round(angular_sim, 4) == 0.7703 assert round(a.similarity(b), 4) == round(angular_sim, 4) @@ -553,13 +585,13 @@ def test_mh_angular_similarity_2(): # check actual angular similarity for a second non-trivial case a = MinHash(0, 20, scaled=scaled100, track_abundance=True) b = MinHash(0, 20, scaled=scaled100, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) assert round(a.similarity(b), 4) == 0.9728 # ignore_abundance => jaccard - assert a.similarity(b, ignore_abundance=True) == 5. / 7. 
+ assert a.similarity(b, ignore_abundance=True) == 5.0 / 7.0 def test_mh_similarity_downsample_angular_value(): @@ -570,8 +602,8 @@ def test_mh_similarity_downsample_angular_value(): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) # the hash=70 will be truncated by downsampling sim = a.similarity(b, downsample=True) @@ -579,15 +611,16 @@ def test_mh_similarity_downsample_angular_value(): # with ignore_abundance, will be equal to jaccard jaccard = a.similarity(b, downsample=True, ignore_abundance=True) - assert jaccard == 4. / 6. + assert jaccard == 4.0 / 6.0 + def test_mh_angular_similarity_fail(): # raise TypeError if calling angular_similarity directly and # one or both sketches do not have abundance info a = MinHash(0, 20, scaled=scaled50, track_abundance=True) b = MinHash(0, 20, scaled=scaled50, track_abundance=False) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} a.set_abundances(a_values) b.add_many(b_values.keys()) @@ -596,14 +629,20 @@ def test_mh_angular_similarity_fail(): with pytest.raises(TypeError) as exc: a.angular_similarity(b) print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) # both sketches lack track abundance a = MinHash(0, 20, scaled=scaled50, track_abundance=False) a.add_many(a_values.keys()) with pytest.raises(TypeError) as exc: a.angular_similarity(b) print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
+ in str(exc) + ) def test_mh_similarity_downsample_true(track_abundance): @@ -614,8 +653,8 @@ def test_mh_similarity_downsample_true(track_abundance): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) b.set_abundances(b_values) @@ -642,8 +681,8 @@ def test_mh_similarity_downsample_errors(track_abundance): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) b.set_abundances(b_values) @@ -653,20 +692,20 @@ def test_mh_similarity_downsample_errors(track_abundance): # error, incompatible max hash with pytest.raises(ValueError) as e: - a.similarity(b, ignore_abundance=True) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + a.similarity(b, ignore_abundance=True) # downsample=False + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: a.similarity(b, ignore_abundance=False) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: - b.similarity(a, ignore_abundance=True) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + b.similarity(a, ignore_abundance=True) # downsample=False + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: b.similarity(a, ignore_abundance=False) # downsample=false - assert 'mismatch in scaled; comparison fail' in str(e.value) + assert "mismatch in scaled; comparison fail" in str(e.value) def test_basic_dna_bad(track_abundance): @@ -674,10 +713,10 @@ def test_basic_dna_bad(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) with pytest.raises(ValueError) as e: - mh.add_sequence('ATGR') + mh.add_sequence("ATGR") print(e) - assert 'invalid DNA character in input k-mer: ATGR' in str(e.value) + assert "invalid DNA character in input k-mer: ATGR" in str(e.value) def test_basic_dna_bad_2(track_abundance): @@ -685,40 +724,40 @@ def test_basic_dna_bad_2(track_abundance): mh = MinHash(1, 6, track_abundance=track_abundance) with pytest.raises(ValueError): - mh.add_protein('YYYY') + mh.add_protein("YYYY") def test_basic_dna_bad_force(track_abundance): # test behavior on bad DNA; use 100 so multiple hashes get added. mh = MinHash(100, 4, track_abundance=track_abundance) assert len(mh.hashes) == 0 - mh.add_sequence('ATGN', True) # ambiguous kmer skipped. + mh.add_sequence("ATGN", True) # ambiguous kmer skipped. assert len(mh.hashes) == 0 - mh.add_sequence('AATGN', True) # but good k-mers still used. + mh.add_sequence("AATGN", True) # but good k-mers still used. 
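    # (A note on the force flag, the second positional argument to
    # add_sequence here: it appears to downgrade the invalid-DNA ValueError
    # to a skip, so k-mers containing non-ACGT characters are dropped while
    # remaining valid k-mers are still hashed; hence exactly one hash below.)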
assert len(mh.hashes) == 1 - mh.add_sequence('AATG', True) # checking that right kmer was added - assert len(mh.hashes) == 1 # (only 1 hash <- this is a dup) + mh.add_sequence("AATG", True) # checking that right kmer was added + assert len(mh.hashes) == 1 # (only 1 hash <- this is a dup) def test_basic_dna_bad_force_2(track_abundance): # test behavior on bad DNA mh = MinHash(100, 4, track_abundance=track_abundance) assert len(mh.hashes) == 0 - mh.add_sequence('AAGNCGG', True) # ambiguous kmers skipped. + mh.add_sequence("AAGNCGG", True) # ambiguous kmers skipped. assert len(mh.hashes) == 0 - mh.add_sequence('AATGNGCGG', True) # ambiguous kmers skipped. + mh.add_sequence("AATGNGCGG", True) # ambiguous kmers skipped. assert len(mh.hashes) == 2 - mh.add_sequence('AATG', True) # checking that right kmers were added - mh.add_sequence('GCGG', True) - assert len(mh.hashes) == 2 # (only 2 hashes should be there) + mh.add_sequence("AATG", True) # checking that right kmers were added + mh.add_sequence("GCGG", True) + assert len(mh.hashes) == 2 # (only 2 hashes should be there) def test_consume_lowercase(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower()) - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA".lower()) + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -730,8 +769,8 @@ def test_similarity_1(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -739,14 +778,13 @@ def test_similarity_1(track_abundance): assert round(a.similarity(a), 3) == 1.0 # add same sequence again - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 assert round(b.similarity(a), 3) == 1.0 assert round(a.similarity(a), 3) == 1.0 - - b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') + b.add_sequence("GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT") x = a.similarity(b) assert x >= 0.3, x @@ -777,7 +815,7 @@ def test_frozen_copy(track_abundance): def test_mh_copy(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") b = a.__copy__() assert round(b.similarity(a), 3) == 1.0 @@ -786,7 +824,7 @@ def test_mh_len(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) assert len(a) == 0 - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert len(a) == 20 @@ -800,7 +838,7 @@ def test_mh_len_2(track_abundance): def test_mh_unsigned_long_long(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) - 
a.add_hash(9227159859419181011) # too big for a C long int. + a.add_hash(9227159859419181011) # too big for a C long int. assert 9227159859419181011 in a.hashes @@ -826,10 +864,20 @@ def test_mh_count_common_diff_protein(track_abundance): def test_mh_count_common_diff_maxhash(track_abundance): - a = MinHash(0, 5, is_protein=False, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(1)) - b = MinHash(0, 5, is_protein=True, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(2)) + a = MinHash( + 0, + 5, + is_protein=False, + track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(1), + ) + b = MinHash( + 0, + 5, + is_protein=True, + track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(2), + ) with pytest.raises(ValueError): a.count_common(b) @@ -991,6 +1039,7 @@ def test_mh_merge_check_length2(track_abundance): c.merge(b) assert len(c.hashes) == 3 + def test_mh_asymmetric_merge(track_abundance): # test merging two asymmetric (different size) MHs a = MinHash(20, 10, track_abundance=track_abundance) @@ -1055,13 +1104,13 @@ def test_mh_inplace_concat_asymmetric(track_abundance): try: d.similarity(a) except TypeError as exc: - assert 'must have same num' in str(exc) + assert "must have same num" in str(exc) a = a.downsample(num=d.num) if track_abundance: - assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. + assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. else: - assert d.similarity(a) == 1.0 # see: d += a, above. + assert d.similarity(a) == 1.0 # see: d += a, above. c = c.downsample(num=b.num) if track_abundance: @@ -1132,11 +1181,13 @@ def test_mh_similarity_diff_seed(track_abundance): def test_mh_compare_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(5)) + a = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) + ) - b = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(10)) + b = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) + ) with pytest.raises(ValueError): a.similarity(b) @@ -1159,10 +1210,12 @@ def test_mh_concat_diff_ksize(track_abundance): def test_mh_concat_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(5)) - b = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(10)) + a = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) + ) + b = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) + ) with pytest.raises(ValueError): a += b @@ -1178,7 +1231,7 @@ def test_mh_concat_diff_seed(track_abundance): def test_short_sequence(track_abundance): a = MinHash(20, 5, track_abundance=track_abundance) - a.add_sequence('GGGG') + a.add_sequence("GGGG") # adding a short sequence should fail silently assert len(a.hashes) == 0 @@ -1190,7 +1243,7 @@ def test_bytes_murmur(): x = hash_murmur(b"ACG") assert x == 1731421407650554201 - x = hash_murmur(u"ACG") + x = hash_murmur("ACG") assert x == 1731421407650554201 @@ -1214,11 +1267,11 @@ def test_murmur(): def test_abundance_simple(): a = MinHash(20, 5, is_protein=False, track_abundance=True) - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 1} - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert 
a.hashes == {2110480117637990133: 2} @@ -1269,15 +1322,15 @@ def test_abundance_simple_2(): a = MinHash(20, 5, is_protein=False, track_abundance=True) b = MinHash(20, 5, is_protein=False, track_abundance=True) - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 1} - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 2} - b.add_sequence('AAAAA') + b.add_sequence("AAAAA") assert a.count_common(b) == 1 @@ -1285,13 +1338,13 @@ def test_abundance_count_common(): a = MinHash(20, 5, is_protein=False, track_abundance=True) b = MinHash(20, 5, is_protein=False, track_abundance=False) - a.add_sequence('AAAAA') - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 2} - b.add_sequence('AAAAA') - b.add_sequence('GGGGG') + b.add_sequence("AAAAA") + b.add_sequence("GGGGG") assert a.count_common(b) == 1 assert a.count_common(b) == b.count_common(a) @@ -1302,8 +1355,8 @@ def test_abundance_similarity(): a = MinHash(20, 10, track_abundance=True) b = MinHash(20, 10, track_abundance=False) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -1311,13 +1364,13 @@ def test_abundance_similarity(): assert round(a.similarity(a), 3) == 1.0 # add same sequence again - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 assert round(b.similarity(a), 3) == 1.0 assert round(a.similarity(a), 3) == 1.0 - b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') + b.add_sequence("GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT") x = a.similarity(b) assert x >= 0.3, x @@ -1338,9 +1391,7 @@ def test_set_abundance(): def test_set_abundance_2(): datapath = utils.get_test_data("genome-s12.fa.gz.sig") - sig = sourmash.load_one_signature(datapath, - ksize=30, - select_moltype='dna') + sig = sourmash.load_one_signature(datapath, ksize=30, select_moltype="dna") new_mh = sig.minhash.copy_and_clear() mins = sig.minhash.hashes mins = {k: 1 for k in mins} @@ -1377,7 +1428,7 @@ def test_set_abundance_clear_3(): a.add_hash(10) assert a.hashes == {10: 1} - + a.set_abundances({20: 1, 30: 4}, clear=False) assert a.hashes == {10: 1, 20: 1, 30: 4} @@ -1387,32 +1438,34 @@ def test_set_abundance_clear_4(): # the abundances together a = MinHash(20, 5, is_protein=False, track_abundance=True) - a.set_abundances({20: 2, 10: 1}, clear=False) # should also sort the hashes + a.set_abundances({20: 2, 10: 1}, clear=False) # should also sort the hashes assert a.hashes == {10: 1, 20: 2} a.set_abundances({20: 1, 10: 2}, clear=False) assert a.hashes == {10: 3, 20: 3} + def test_clear_abundance_on_zero(): mh = sourmash.minhash.MinHash(n=0, ksize=31, scaled=1, track_abundance=True) - mh.set_abundances({ 1: 5, 2: 3, 3 : 5 }) - mh.set_abundances({ 1: 0 }, clear=False) + mh.set_abundances({1: 5, 2: 3, 3: 5}) + mh.set_abundances({1: 0}, clear=False) assert 1 not in dict(mh.hashes) assert 
dict(mh.hashes)[2] == 3 assert dict(mh.hashes)[3] == 5 assert len(mh) == 2 with pytest.raises(ValueError): - mh.set_abundances({ 2: -1 }) # Test on clear = True + mh.set_abundances({2: -1}) # Test on clear = True with pytest.raises(ValueError): - mh.set_abundances({ 2: -1 }, clear=False) - - assert len(mh) == 2 # Assert that nothing was affected + mh.set_abundances({2: -1}, clear=False) + + assert len(mh) == 2 # Assert that nothing was affected + def test_reset_abundance_initialized(): a = MinHash(1, 4, track_abundance=True) - a.add_sequence('ATGC') + a.add_sequence("ATGC") # If we had a minhash with abundances and drop it, this shouldn't fail. # Convert from Abundance to Regular MinHash @@ -1423,12 +1476,14 @@ def test_reset_abundance_initialized(): def test_set_abundance_initialized(): a = MinHash(1, 4, track_abundance=False) - a.add_sequence('ATGC') + a.add_sequence("ATGC") with pytest.raises(RuntimeError) as e: a.track_abundance = True - assert "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0] + assert ( + "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0] + ) def test_set_abundance_num(): @@ -1459,8 +1514,9 @@ def test_mh_copy_and_clear(track_abundance): def test_mh_copy_and_clear_with_max_hash(track_abundance): # test basic creation of new, empty MinHash w/max_hash param set - a = MinHash(0, 10, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1484,8 +1540,13 @@ def test_scaled_property(track_abundance): def test_pickle_protein(track_abundance): # check that protein/etc ksize is handled properly during serialization. - a = MinHash(0, 10, track_abundance=track_abundance, is_protein=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + is_protein=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1505,8 +1566,13 @@ def test_pickle_protein(track_abundance): def test_pickle_dayhoff(track_abundance): # check that dayhoff ksize is handled properly during serialization. - a = MinHash(0, 10, track_abundance=track_abundance, dayhoff=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + dayhoff=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1526,8 +1592,13 @@ def test_pickle_dayhoff(track_abundance): def test_pickle_hp(track_abundance): # check that hp ksize is handled properly during serialization. 
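    # The round-trip being checked is, in rough outline (illustrative sketch
    # only; it follows the same pattern as the protein/dayhoff tests above):
    #
    #   import pickle
    #   b = pickle.loads(pickle.dumps(a))
    #   assert b.hp and b.ksize == a.ksize and b.hashes == a.hashes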
- a = MinHash(0, 10, track_abundance=track_abundance, hp=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + hp=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1546,8 +1617,9 @@ def test_pickle_hp(track_abundance): def test_pickle_max_hash(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1650,8 +1722,9 @@ def test_minhash_abund_merge_flat_2(): def test_distance_matrix(track_abundance): import numpy - siglist = [next(signature.load_signatures(utils.get_test_data(f))) - for f in utils.SIG_FILES] + siglist = [ + next(signature.load_signatures(utils.get_test_data(f))) for f in utils.SIG_FILES + ] D1 = numpy.zeros([len(siglist), len(siglist)]) D2 = numpy.zeros([len(siglist), len(siglist)]) @@ -1690,14 +1763,15 @@ def test_remove_many(track_abundance): assert len(a) == 33 assert all(c % 6 != 0 for c in a.hashes) + def test_remove_minhash(track_abundance): original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) original_mh.add_many(list(range(101))) - added_mh.add_many(list(range(101,201))) # contains original in it - tested_mh.add_many(list(range(201))) # original + added + added_mh.add_many(list(range(101, 201))) # contains original in it + tested_mh.add_many(list(range(201))) # original + added # Now we should expect tested_minhash == original_minhash # Note we are passing a MinHash object instead of an iterable object @@ -1718,7 +1792,7 @@ def test_add_many(track_abundance): b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) - a.add_many(list(range(0, 100, 2))) # => abundance = 2 + a.add_many(list(range(0, 100, 2))) # => abundance = 2 assert len(a) == 50 assert all(c % 2 == 0 for c in a.hashes) @@ -1733,8 +1807,7 @@ def test_add_many(track_abundance): def test_set_abundances_huge(): max_hash = 4000000 - a = MinHash(0, 10, track_abundance=True, - scaled=_get_scaled_for_max_hash(max_hash)) + a = MinHash(0, 10, track_abundance=True, scaled=_get_scaled_for_max_hash(max_hash)) hashes = list(range(max_hash)) abundances = itertools.repeat(2) @@ -1744,7 +1817,7 @@ def test_set_abundances_huge(): def test_try_change_hashes(track_abundance): a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) @@ -1846,7 +1919,10 @@ def test_inflate_error(): with pytest.raises(ValueError) as exc: mh = mh.inflate(mh2) - assert "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" in str(exc.value) + assert ( + "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" + in str(exc.value) + ) def test_inflate_not_a_subset(): @@ -1878,7 +1954,7 @@ def test_inflate_not_a_subset(): mh3 = mh.inflate(mh2) assert mh3.hashes[10] == 3 - assert 20 not in mh3.hashes # should intersect, in this case. + assert 20 not in mh3.hashes # should intersect, in this case. 
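    # (inflate() effectively intersects here: a hash absent from the
    # abundance sketch, like 20, is dropped rather than kept with a zero count.)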
assert mh3.hashes[30] == 3 @@ -1887,14 +1963,14 @@ def test_add_kmer(track_abundance): mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGCGTGC') + mh1.add_sequence("ATGCGTGC") a = mh1.hashes - mh2.add_kmer('ATGC') - mh2.add_kmer('TGCG') - mh2.add_kmer('GCGT') - mh2.add_kmer('CGTG') - mh2.add_kmer('GTGC') + mh2.add_kmer("ATGC") + mh2.add_kmer("TGCG") + mh2.add_kmer("GCGT") + mh2.add_kmer("CGTG") + mh2.add_kmer("GTGC") b = mh2.hashes assert set(a.items()) == set(b.items()) @@ -1905,7 +1981,7 @@ def test_add_kmer_too_long(track_abundance): mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) with pytest.raises(ValueError): - mh1.add_kmer('ATGCGTGC') + mh1.add_kmer("ATGCGTGC") def test_get_mins_deprecated(track_abundance): @@ -1961,9 +2037,14 @@ def test_downsample_scaled(track_abundance): # test downsample(scaled...) method mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) - mins = (1, 2, 3, - 9223372036854775808 + 1, 9223372036854775808 + 2, - 9223372036854775808 + 3) + mins = ( + 1, + 2, + 3, + 9223372036854775808 + 1, + 9223372036854775808 + 2, + 9223372036854775808 + 3, + ) mh.add_many(mins) assert len(mh) == 6 @@ -1978,7 +2059,7 @@ def test_downsample_scaled(track_abundance): def test_is_molecule_type_1(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance) - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" assert mh.is_dna assert not mh.is_protein assert not mh.hp @@ -1987,7 +2068,7 @@ def test_is_molecule_type_1(track_abundance): def test_is_molecule_type_2(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True) - assert mh.moltype == 'protein' + assert mh.moltype == "protein" assert not mh.is_dna assert mh.is_protein assert not mh.hp @@ -1996,17 +2077,16 @@ def test_is_molecule_type_2(track_abundance): def test_is_molecule_type_3(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, hp=True) - assert mh.moltype == 'hp' + assert mh.moltype == "hp" assert not mh.is_dna assert not mh.is_protein assert mh.hp assert not mh.dayhoff - def test_is_molecule_type_4(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True) - assert mh.moltype == 'dayhoff' + assert mh.moltype == "dayhoff" assert not mh.is_dna assert not mh.is_protein assert not mh.hp @@ -2021,7 +2101,7 @@ def test_addition_num_incompatible(): mh2.add_hash(1) with pytest.raises(TypeError) as exc: - mh3 = mh1 + mh2 + mh1 + mh2 assert "incompatible num values: self=10 other=20" in str(exc.value) @@ -2030,8 +2110,8 @@ def test_addition_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) mh3 = mh1 + mh2 hashcounts = mh3.hashes @@ -2057,8 +2137,8 @@ def test_iaddition_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) mh1 += mh2 hashcounts = mh1.hashes @@ -2093,10 +2173,11 @@ def test_intersection_1_num(): mh2.add_hash(2) mh3 = mh1.intersection(mh2) - print("mh.intersection INTERSECTION HASHES:",set(mh3.hashes)) + print("mh.intersection INTERSECTION HASHES:", set(mh3.hashes)) assert len(mh3) == 1 assert 0 in mh3.hashes + def test_and_operator(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2110,11 
+2191,14 @@ def test_and_operator(): mh3 = mh1.intersection(mh2) mh4 = mh1 & mh2 - print("\n Intersection hashes (mh3): ", mh3.hashes, "\n '&' hashes: (mh4)", mh4.hashes) + print( + "\n Intersection hashes (mh3): ", mh3.hashes, "\n '&' hashes: (mh4)", mh4.hashes + ) assert mh3 assert mh3 == mh4 + def test_intersection_2_scaled(): mh1 = MinHash(0, 21, scaled=1) mh2 = MinHash(0, 21, scaled=1) @@ -2136,7 +2220,7 @@ def test_intersection_3_abundance_error(): mh2 = MinHash(0, 21, scaled=1, track_abundance=True) with pytest.raises(TypeError) as exc: - mh3 = mh1.intersection(mh2) + mh1.intersection(mh2) assert str(exc.value) == "can only intersect flat MinHash objects" @@ -2147,7 +2231,7 @@ def test_intersection_4_incompatible_ksize(): mh2 = MinHash(500, 31) with pytest.raises(ValueError) as exc: - mh3 = mh1.intersection(mh2) + mh1.intersection(mh2) assert str(exc.value) == "different ksizes cannot be compared" @@ -2157,7 +2241,7 @@ def test_intersection_5_incompatible(): mh1 = MinHash(0, 21, scaled=1) with pytest.raises(TypeError) as exc: - mh3 = mh1.intersection(set()) + mh1.intersection(set()) assert str(exc.value) == "can only intersect MinHash objects" @@ -2189,6 +2273,7 @@ def test_intersection_6_full_num(): assert mh1.intersection_and_union_size(mh2) == (10, 20) + def test_intersection_7_full_scaled(): # intersection of two scaled objects is correct mh1 = MinHash(0, 21, scaled=100) @@ -2231,8 +2316,8 @@ def test_merge_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) ret = mh1.merge(mh2) assert ret is None @@ -2315,6 +2400,7 @@ def test_merge_scaled(): for k in mh2.hashes: assert k in mh3.hashes + def test_add_is_symmetric(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2324,10 +2410,11 @@ def test_add_is_symmetric(): mh3 = mh1 + mh2 mh4 = mh2 + mh1 print("\n mh3 EQUALS ", mh3.hashes, "\n mh4 EQUALS", mh4.hashes) - #if mh3 != 0, then it is "true", so it passes + # if mh3 != 0, then it is "true", so it passes assert mh3 assert mh3 == mh4 + def test_or_equals_add(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2340,6 +2427,7 @@ def test_or_equals_add(): assert mh3 assert mh3 == mh4 + def test_max_containment(): mh1 = MinHash(0, 21, scaled=1, track_abundance=False) mh2 = MinHash(0, 21, scaled=1, track_abundance=False) @@ -2347,10 +2435,10 @@ def test_max_containment(): mh1.add_many((1, 2, 3, 4)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 - assert mh1.max_containment(mh2) == 1/2 - assert mh2.max_containment(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 + assert mh1.max_containment(mh2) == 1 / 2 + assert mh2.max_containment(mh1) == 1 / 2 def test_max_containment_empty(): @@ -2385,8 +2473,8 @@ def test_avg_containment(): mh1.add_many((1, 2, 3, 4)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 assert mh1.avg_containment(mh2) == 0.375 assert mh2.avg_containment(mh1) == 0.375 @@ -2454,7 +2542,7 @@ def test_frozen_and_mutable_3(track_abundance): def test_dna_kmers(): # test seq_to_hashes for dna -> dna - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = 
"ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # first calculate seq to hashes @@ -2469,7 +2557,7 @@ def test_dna_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 31 + 1): # calculate each k-mer - kmer = seq[i:i+31] + kmer = seq[i : i + 31] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2488,7 +2576,7 @@ def test_dna_kmers(): def test_dna_kmers_2(): # test kmers_and_hashes for dna -> dna - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # k-mer by k-mer? @@ -2504,7 +2592,7 @@ def test_dna_kmers_2(): def test_dna_kmers_3_bad_dna(): # test kmers_and_hashes for dna -> dna, with some bad k-mers in there - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "NTGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" with pytest.raises(ValueError) as exc: @@ -2515,7 +2603,7 @@ def test_dna_kmers_3_bad_dna(): def test_dna_kmers_4_bad_dna(): # test kmers_and_hashes for bad dna -> dna, using force - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "NTGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # k-mer by k-mer? @@ -2524,8 +2612,8 @@ def test_dna_kmers_4_bad_dna(): # add to minhash obj single_mh = mh.copy_and_clear() - if hashval == None: - assert kmer == seq[:31] # first k-mer is baaaaad. + if hashval is None: + assert kmer == seq[:31] # first k-mer is baaaaad. 
found_bad_kmer = True continue @@ -2555,7 +2643,7 @@ def test_protein_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2605,7 +2693,7 @@ def test_dayhoff_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2655,7 +2743,7 @@ def test_hp_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2789,8 +2877,8 @@ def test_containment(track_abundance): mh2.add_many((1, 5)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 def test_sum_abundances(track_abundance): @@ -2808,8 +2896,8 @@ def test_sum_abundances(track_abundance): assert mh1.sum_abundances == 6 assert mh2.sum_abundances == 6 else: - assert mh1.sum_abundances == None - assert mh2.sum_abundances == None + assert mh1.sum_abundances is None + assert mh2.sum_abundances is None def test_mean_abundance(track_abundance): @@ -2885,32 +2973,44 @@ def test_unique_dataset_hashes(track_abundance): def test_containment_ANI(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash - m1_cont_m2 = mh1.containment_ani(mh2, estimate_ci =True) - m2_cont_m1 = mh2.containment_ani(mh1, estimate_ci =True) + m1_cont_m2 = mh1.containment_ani(mh2, estimate_ci=True) + m2_cont_m1 = mh2.containment_ani(mh1, estimate_ci=True) print("\nmh1 contained by mh2", m1_cont_m2) print("mh2 contained by mh1", m2_cont_m1) - assert (round(m1_cont_m2.ani,3), m1_cont_m2.ani_low, m1_cont_m2.ani_high) == (1.0, 1.0, 1.0) - assert (round(m2_cont_m1.ani,3), round(m2_cont_m1.ani_low,3), round(m2_cont_m1.ani_high,3)) == (0.966, 0.965, 0.967) - - m1_mc_m2 = mh1.max_containment_ani(mh2, estimate_ci =True) - m2_mc_m1 = mh2.max_containment_ani(mh1, estimate_ci =True) + assert (round(m1_cont_m2.ani, 3), m1_cont_m2.ani_low, m1_cont_m2.ani_high) == ( + 1.0, + 1.0, + 1.0, + ) + assert ( + round(m2_cont_m1.ani, 3), + round(m2_cont_m1.ani_low, 3), + round(m2_cont_m1.ani_high, 3), + ) == (0.966, 0.965, 0.967) + + m1_mc_m2 = mh1.max_containment_ani(mh2, estimate_ci=True) + m2_mc_m1 = mh2.max_containment_ani(mh1, estimate_ci=True) print("mh1 max containment", m1_mc_m2) print("mh2 max containment", m2_mc_m1) m1_mc_m2.size_is_inaccurate = False m2_mc_m1.size_is_inaccurate = False assert m1_mc_m2 == m2_mc_m1 - assert (round(m1_mc_m2.ani, 3), round(m1_mc_m2.ani_low, 3), round(m1_mc_m2.ani_high, 3)) == (1.0,1.0,1.0) + assert ( + round(m1_mc_m2.ani, 3), + round(m1_mc_m2.ani_low, 3), + round(m1_mc_m2.ani_high, 3), + ) == (1.0, 1.0, 1.0) def test_containment_ANI_precalc_containment(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash # precalc containments and assert same results @@ -2918,27 +3018,37 @@ def 
test_containment_ANI_precalc_containment(): s2c = mh2.contained_by(mh1) mc = max(s1c, s2c) - assert mh1.containment_ani(mh2, estimate_ci=True) == mh1.containment_ani(mh2, containment=s1c, estimate_ci=True) - assert mh2.containment_ani(mh1) == mh2.containment_ani(mh1, containment=s2c) - assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1) - assert mh1.max_containment_ani(mh2) == mh1.max_containment_ani(mh2, max_containment=mc) - assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1, max_containment=mc) + assert mh1.containment_ani(mh2, estimate_ci=True) == mh1.containment_ani( + mh2, containment=s1c, estimate_ci=True + ) + assert mh2.containment_ani(mh1) == mh2.containment_ani(mh1, containment=s2c) + assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1) + assert mh1.max_containment_ani(mh2) == mh1.max_containment_ani( + mh2, max_containment=mc + ) + assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani( + mh1, max_containment=mc + ) def test_avg_containment_ani(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash # check average_containment_ani ac_m1 = mh1.avg_containment_ani(mh2) ac_m2 = mh2.avg_containment_ani(mh1) - assert ac_m1 == ac_m2 == (mh1.containment_ani(mh2).ani + mh2.containment_ani(mh1).ani)/2 + assert ( + ac_m1 + == ac_m2 + == (mh1.containment_ani(mh2).ani + mh2.containment_ani(mh1).ani) / 2 + ) def test_containment_ANI_downsample(): - f2 = utils.get_test_data('2+63.fa.sig') - f3 = utils.get_test_data('47+63.fa.sig') + f2 = utils.get_test_data("2+63.fa.sig") + f3 = utils.get_test_data("47+63.fa.sig") mh2 = sourmash.load_one_signature(f2, ksize=31).minhash mh3 = sourmash.load_one_signature(f3, ksize=31).minhash # check that downsampling works properly @@ -2947,8 +3057,8 @@ def test_containment_ANI_downsample(): assert mh2.scaled != mh3.scaled ds_s3c = mh2.containment_ani(mh3, downsample=True) ds_s4c = mh3.containment_ani(mh2, downsample=True) - mc_w_ds_1 = mh2.max_containment_ani(mh3, downsample=True) - mc_w_ds_2 = mh3.max_containment_ani(mh2, downsample=True) + mc_w_ds_1 = mh2.max_containment_ani(mh3, downsample=True) + mc_w_ds_2 = mh3.max_containment_ani(mh2, downsample=True) print(ds_s3c) with pytest.raises(ValueError) as e: mh2.containment_ani(mh3) @@ -2962,19 +3072,19 @@ def test_containment_ANI_downsample(): assert mh2.scaled == mh3.scaled ds_s3c_manual = mh2.containment_ani(mh3) ds_s4c_manual = mh3.containment_ani(mh2) - ds_mc_manual = mh2.max_containment_ani(mh3) + ds_mc_manual = mh2.max_containment_ani(mh3) assert ds_s3c == ds_s3c_manual assert ds_s4c == ds_s4c_manual assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual ac_m2 = mh2.avg_containment_ani(mh3) ac_m3 = mh3.avg_containment_ani(mh2) - assert ac_m2 == ac_m3 == (ds_s3c.ani + ds_s4c.ani)/2 + assert ac_m2 == ac_m3 == (ds_s3c.ani + ds_s4c.ani) / 2 def test_jaccard_ANI(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -2984,12 +3094,16 @@ def test_jaccard_ANI(): m2_jani_m1 = mh2.jaccard_ani(mh1) assert m1_jani_m2 == m2_jani_m1 - assert (m1_jani_m2.ani, m1_jani_m2.p_nothing_in_common, m1_jani_m2.jaccard_error) == 
(0.9783711630110239, 0.0, 3.891666770716877e-07) + assert ( + m1_jani_m2.ani, + m1_jani_m2.p_nothing_in_common, + m1_jani_m2.jaccard_error, + ) == (0.9783711630110239, 0.0, 3.891666770716877e-07) def test_jaccard_ANI_untrustworthy(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -3000,28 +3114,32 @@ def test_jaccard_ANI_untrustworthy(): # since size is inaccurate on 2.fa.sig, need to override to be able to get ani m1_jani_m2.size_is_inaccurate = False - assert m1_jani_m2.ani == None - assert m1_jani_m2.je_exceeds_threshold==True + assert m1_jani_m2.ani is None + assert m1_jani_m2.je_exceeds_threshold == True assert m1_jani_m2.je_threshold == 1e-7 def test_jaccard_ANI_precalc_jaccard(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash # precalc jaccard and assert same result jaccard = mh1.jaccard(mh2) - print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) + print("\nJACCARD_ANI", mh1.jaccard_ani(mh2, jaccard=jaccard)) - assert mh1.jaccard_ani(mh2) == mh1.jaccard_ani(mh2, jaccard=jaccard) == mh2.jaccard_ani(mh1, jaccard=jaccard) + assert ( + mh1.jaccard_ani(mh2) + == mh1.jaccard_ani(mh2, jaccard=jaccard) + == mh2.jaccard_ani(mh1, jaccard=jaccard) + ) wrong_jaccard = jaccard - 0.1 assert mh1.jaccard_ani(mh2) != mh1.jaccard_ani(mh2, jaccard=wrong_jaccard) def test_jaccard_ANI_downsample(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -3058,13 +3176,13 @@ def test_containment_ani_ci_tiny_testdata(): # from the formula ANI = c^(1/k) for c=3/4 and k=21 np.testing.assert_almost_equal(m2_cani_m1.ani, 0.986394259982259, decimal=3) m2_cani_m1.size_is_inaccurate = False - assert m2_cani_m1.ani_low == None - assert m2_cani_m1.ani_high == None + assert m2_cani_m1.ani_low is None + assert m2_cani_m1.ani_high is None def test_containment_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') - f2 = utils.get_test_data('num/63.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") + f2 = utils.get_test_data("num/63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash @@ -3081,8 +3199,8 @@ def test_containment_num_fail(): def test_ANI_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') - f2 = utils.get_test_data('num/63.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") + f2 = utils.get_test_data("num/63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash @@ -3091,7 +3209,7 @@ def test_ANI_num_fail(): print(str(exc)) assert "Error: can only calculate ANI for scaled MinHashes" in str(exc) with pytest.raises(TypeError) as exc: - mh2.containment_ani(mh1, estimate_ci =True) + mh2.containment_ani(mh1, estimate_ci=True) assert "Error: can only calculate ANI for scaled MinHashes" in str(exc) with pytest.raises(TypeError) as exc: mh1.max_containment_ani(mh2) @@ -3105,8 +3223,8 @@ def test_ANI_num_fail(): def 
test_minhash_set_size_estimate_is_accurate(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash mh1_ds = mh1.downsample(scaled=100000) @@ -3126,22 +3244,31 @@ def test_minhash_set_size_estimate_is_accurate(): # check that relative error and confidence must be between 0 and 1 with pytest.raises(ValueError) as exc: mh2.size_is_accurate(relative_error=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) with pytest.raises(ValueError) as exc: mh2.size_is_accurate(confidence=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) with pytest.raises(ValueError) as exc: mh2.size_is_accurate(relative_error=-1, confidence=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) def test_minhash_ani_inaccurate_size_est(): # TODO: It's actually really tricky to get the set size to be inaccurate. Eg. For a scale factor of 10000, # you would need - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash # downsample @@ -3160,12 +3287,12 @@ def test_minhash_ani_inaccurate_size_est(): m1_ca_m2_ds = mh1_ds.containment_ani(mh2_ds) print(m1_ca_m2_ds) - assert m1_ca_m2_ds.ani == None #0.987 + assert m1_ca_m2_ds.ani is None # 0.987 assert m1_ca_m2_ds.size_is_inaccurate == True def test_size_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash with pytest.raises(TypeError) as exc: diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py index 68283dd620..bc9e02754b 100644 --- a/tests/test_nodegraph.py +++ b/tests/test_nodegraph.py @@ -2,15 +2,19 @@ import pytest -from sourmash.nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions +from sourmash.nodegraph import ( + Nodegraph, + extract_nodegraph_info, + calc_expected_collisions, +) import sourmash_tst_utils as utils def test_nodegraph_to_khmer_basic(): - pytest.importorskip('khmer') + pytest.importorskip("khmer") - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) khmer_sm_ng = sourmash_ng.to_khmer_nodegraph() @@ -19,7 +23,7 @@ def test_nodegraph_to_khmer_basic(): def test_nodegraph_khmer_compare(): - khmer = pytest.importorskip('khmer') + khmer = pytest.importorskip("khmer") khmer_ng = khmer.Nodegraph(3, 23, 6) khmer_ng.count("ACG") @@ -43,14 +47,14 @@ def test_nodegraph_khmer_compare(): def test_nodegraph_same_file(): - khmer = pytest.importorskip('khmer') + khmer = pytest.importorskip("khmer") try: load_nodegraph = khmer.load_nodegraph except AttributeError: load_nodegraph = khmer.Nodegraph.load - ng_file = utils.get_test_data('.sbt.v3/internal.0') - with open(ng_file, 'rb') as f: + ng_file = 
utils.get_test_data(".sbt.v3/internal.0") + with open(ng_file, "rb") as f: ng_data = f.read() sourmash_ng = Nodegraph.load(ng_file) @@ -85,7 +89,7 @@ def test_nodegraph_same_file(): def test_nodegraph_expected_collisions(): - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) @@ -93,7 +97,7 @@ def test_nodegraph_expected_collisions(): def test_nodegraph_expected_collisions_error(): - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) diff --git a/tests/test_np_utils.py b/tests/test_np_utils.py index 50aaa756f4..e23ca361a0 100644 --- a/tests/test_np_utils.py +++ b/tests/test_np_utils.py @@ -5,7 +5,6 @@ def test_memmap(): - e1 = sourmash.MinHash(n=1, ksize=20) sig1 = SourmashSignature(e1) diff --git a/tests/test_picklist.py b/tests/test_picklist.py index 73c8799689..682d6fb8af 100644 --- a/tests/test_picklist.py +++ b/tests/test_picklist.py @@ -14,23 +14,23 @@ def test_load_empty_picklist_fail(): - empty = utils.get_test_data('picklist/empty.csv') + empty = utils.get_test_data("picklist/empty.csv") - pl = SignaturePicklist('manifest', pickfile=empty) + pl = SignaturePicklist("manifest", pickfile=empty) with pytest.raises(ValueError): pl.load(allow_empty=False) def test_load_empty_picklist_allow(): - empty = utils.get_test_data('picklist/empty.csv') + empty = utils.get_test_data("picklist/empty.csv") - pl = SignaturePicklist('manifest', pickfile=empty) + pl = SignaturePicklist("manifest", pickfile=empty) pl.load(allow_empty=True) def test_dup_md5_picked(runtmp): # load a sig, duplicate, and see if a picklist gets the right one - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] @@ -41,26 +41,26 @@ def test_dup_md5_picked(runtmp): print(ml.manifest.rows) assert len(ml.manifest) == 1 - mf_csv = runtmp.output('select.csv') + mf_csv = runtmp.output("select.csv") ml.manifest.write_to_filename(mf_csv) # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) assert len(ml2) == 2 # create a picklist... 
- pl = SignaturePicklist('manifest', pickfile=mf_csv) + pl = SignaturePicklist("manifest", pickfile=mf_csv) print(pl.load()) - print('loaded:', len(pl.pickset)) + print("loaded:", len(pl.pickset)) # use in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) @@ -68,7 +68,7 @@ def test_dup_md5_picked(runtmp): def test_dup_md5_picked_mf_to_picklist(runtmp): # load a sig, duplicate, and see if a picklist gets the right one # uses an in memory picklist - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] @@ -84,7 +84,7 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) @@ -92,7 +92,7 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): # use picklist in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) @@ -100,12 +100,12 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # load a sig, duplicate, and see if a picklist gets the right one # use a sqlite db with its own to_picklist behavior. - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] # save a manifest with one entry - xl = SqliteIndex.create(':memory:') + xl = SqliteIndex.create(":memory:") xl.insert(sig) print(xl.manifest.rows) @@ -116,7 +116,7 @@ def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) @@ -124,6 +124,6 @@ def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # use picklist in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) diff --git a/tests/test_plugin_framework.py b/tests/test_plugin_framework.py index 06156e4d85..1acb78bd6c 100644 --- a/tests/test_plugin_framework.py +++ b/tests/test_plugin_framework.py @@ -13,22 +13,23 @@ import sourmash_tst_utils as utils from sourmash import plugins from sourmash.index import LinearIndex -from sourmash.save_load import (Base_SaveSignaturesToLocation, - SaveSignaturesToLocation) +from sourmash.save_load import Base_SaveSignaturesToLocation, SaveSignaturesToLocation + + +_Dist = collections.namedtuple("_Dist", ["version"]) -_Dist = collections.namedtuple('_Dist', ['version']) class FakeEntryPoint: """ A class that stores a name and an object to be returned on 'load()'. Mocks the EntryPoint class used by importlib.metadata. """ - module = 'test_plugin_framework' - dist = _Dist('0.1') - group = 'groupfoo' - def __init__(self, name, load_obj, *, - error_on_import=None): + module = "test_plugin_framework" + dist = _Dist("0.1") + group = "groupfoo" + + def __init__(self, name, load_obj, *, error_on_import=None): self.name = name self.load_obj = load_obj self.error_on_import = error_on_import @@ -38,15 +39,17 @@ def load(self): raise self.error_on_import("as requested") return self.load_obj + # # Test basic features of the load_from plugin hook. 
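# In outline, a load_from plugin is just a callable with a numeric
# `priority` attribute (lower runs first) that returns an Index for
# locations it recognizes; returning None or raising defers to the next
# plugin. A sketch, following the fixtures defined below:
#
#   def load_from_example(location, *args, **kwargs):
#       return LinearIndex([...], location)
#   load_from_example.priority = 1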
# + class Test_EntryPointBasics_LoadFrom: def get_some_sigs(self, location, *args, **kwargs): - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -55,12 +58,17 @@ def get_some_sigs(self, location, *args, **kwargs): lidx = LinearIndex([sig2, sig47, sig63], location) return lidx + get_some_sigs.priority = 1 - + def setup_method(self): self.saved_plugins = plugins._plugin_load_from - plugins._plugin_load_from = [FakeEntryPoint('test_load', self.get_some_sigs), - FakeEntryPoint('test_load', self.get_some_sigs, error_on_import=ModuleNotFoundError)] + plugins._plugin_load_from = [ + FakeEntryPoint("test_load", self.get_some_sigs), + FakeEntryPoint( + "test_load", self.get_some_sigs, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_load_from = self.saved_plugins @@ -70,7 +78,7 @@ def test_load_1(self): assert len(ps) == 1 def test_load_2(self, runtmp): - fake_location = runtmp.output('passed-through location') + fake_location = runtmp.output("passed-through location") idx = sourmash.load_file_as_index(fake_location) print(idx, idx.location) @@ -80,9 +88,9 @@ def test_load_2(self, runtmp): class Test_EntryPoint_LoadFrom_Priority: def get_some_sigs(self, location, *args, **kwargs): - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -91,39 +99,43 @@ def get_some_sigs(self, location, *args, **kwargs): lidx = LinearIndex([sig2, sig47, sig63], location) return lidx + get_some_sigs.priority = 5 def set_called_flag_1(self, location, *args, **kwargs): # high priority 1, raise ValueError - print('setting flag 1') + print("setting flag 1") self.was_called_flag_1 = True raise ValueError + set_called_flag_1.priority = 1 def set_called_flag_2(self, location, *args, **kwargs): # high priority 2, return None - print('setting flag 2') + print("setting flag 2") self.was_called_flag_2 = True return None + set_called_flag_2.priority = 2 def set_called_flag_3(self, location, *args, **kwargs): # lower priority 10, should not be called - print('setting flag 3') + print("setting flag 3") self.was_called_flag_3 = True return None + set_called_flag_3.priority = 10 def setup_method(self): self.saved_plugins = plugins._plugin_load_from plugins._plugin_load_from = [ - FakeEntryPoint('test_load', self.get_some_sigs), - FakeEntryPoint('test_load_2', self.set_called_flag_1), - FakeEntryPoint('test_load_3', self.set_called_flag_2), - FakeEntryPoint('test_load_4', self.set_called_flag_3) - ] + FakeEntryPoint("test_load", self.get_some_sigs), + FakeEntryPoint("test_load_2", self.set_called_flag_1), + FakeEntryPoint("test_load_3", self.set_called_flag_2), + FakeEntryPoint("test_load_4", self.set_called_flag_3), + ] self.was_called_flag_1 = False self.was_called_flag_2 = False self.was_called_flag_3 = False @@ -140,7 +152,7 @@ def test_load_1(self): assert not self.was_called_flag_3 def test_load_2(self, runtmp): - fake_location = runtmp.output('passed-through location') + fake_location = 
runtmp.output("passed-through location") idx = sourmash.load_file_as_index(fake_location) print(idx, idx.location) @@ -156,10 +168,12 @@ def test_load_2(self, runtmp): # Test basic features of the save_to plugin hook. # + class FakeSaveClass(Base_SaveSignaturesToLocation): """ A fake save class that just records what was sent to it. """ + priority = 50 def __init__(self, location): @@ -169,7 +183,7 @@ def __init__(self, location): @classmethod def matches(cls, location): if location: - return location.endswith('.this-is-a-test') + return location.endswith(".this-is-a-test") def add(self, ss): super().add(ss) @@ -184,8 +198,12 @@ class Test_EntryPointBasics_SaveTo: # test the basics def setup_method(self): self.saved_plugins = plugins._plugin_save_to - plugins._plugin_save_to = [FakeEntryPoint('test_save', FakeSaveClass), - FakeEntryPoint('test_save', FakeSaveClass, error_on_import=ModuleNotFoundError)] + plugins._plugin_save_to = [ + FakeEntryPoint("test_save", FakeSaveClass), + FakeEntryPoint( + "test_save", FakeSaveClass, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_save_to = self.saved_plugins @@ -197,9 +215,9 @@ def test_save_1(self): def test_save_2(self, runtmp): # load some signatures to save - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -207,7 +225,7 @@ def test_save_2(self, runtmp): # build a fake location that matches the FakeSaveClass # extension - fake_location = runtmp.output('out.this-is-a-test') + fake_location = runtmp.output("out.this-is-a-test") # this should use the plugin architecture to return an object # of type FakeSaveClass, with the three signatures in it. @@ -230,8 +248,8 @@ class Test_EntryPointPriority_SaveTo: def setup_method(self): self.saved_plugins = plugins._plugin_save_to plugins._plugin_save_to = [ - FakeEntryPoint('test_save', FakeSaveClass), - FakeEntryPoint('test_save2', FakeSaveClass_HighPriority), + FakeEntryPoint("test_save", FakeSaveClass), + FakeEntryPoint("test_save2", FakeSaveClass_HighPriority), ] def teardown_method(self): @@ -244,9 +262,9 @@ def test_save_1(self): def test_save_2(self, runtmp): # load some signatures to save - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -254,7 +272,7 @@ def test_save_2(self, runtmp): # build a fake location that matches the FakeSaveClass # extension - fake_location = runtmp.output('out.this-is-a-test') + fake_location = runtmp.output("out.this-is-a-test") # this should use the plugin architecture to return an object # of type FakeSaveClass, with the three signatures in it. @@ -276,18 +294,20 @@ def test_save_2(self, runtmp): # Test basic features of the save_to plugin hook. # + class FakeCommandClass(plugins.CommandLinePlugin): """ A fake CLI class. 
""" - command = 'nifty' + + command = "nifty" description = "do somethin' nifty" def __init__(self, parser): super().__init__(parser) - parser.add_argument('arg1') - parser.add_argument('--other', action='store_true') - parser.add_argument('--do-fail', action='store_true') + parser.add_argument("arg1") + parser.add_argument("--other", action="store_true") + parser.add_argument("--do-fail", action="store_true") def main(self, args): super().main(args) @@ -305,8 +325,7 @@ def setup_method(self): _ = plugins.get_cli_script_plugins() self.saved_plugins = plugins._plugin_cli plugins._plugin_cli_once = False - plugins._plugin_cli = [FakeEntryPoint('test_command', - FakeCommandClass)] + plugins._plugin_cli = [FakeEntryPoint("test_command", FakeCommandClass)] def teardown_method(self): plugins._plugin_cli = self.saved_plugins @@ -316,17 +335,17 @@ def test_empty(self, runtmp): plugins._plugin_cli = [] with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert '(No script plugins detected!)' in out + assert "(No script plugins detected!)" in out def test_cmd_0(self, runtmp): # test default output with some plugins with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err @@ -354,32 +373,32 @@ def test_cmd_2(self): def test_cmd_3(self, runtmp): # test ability to run 'nifty' ;) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'nifty') + runtmp.sourmash("scripts", "nifty") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert 'nifty: error: the following arguments are required: arg1' in err - assert 'usage: nifty [-h] [-q] [-d] [--other] [--do-fail] arg1' in err + assert "nifty: error: the following arguments are required: arg1" in err + assert "usage: nifty [-h] [-q] [-d] [--other] [--do-fail] arg1" in err def test_cmd_4(self, runtmp): # test basic argument parsing etc - runtmp.sourmash('scripts', 'nifty', '--other', 'some arg') + runtmp.sourmash("scripts", "nifty", "--other", "some arg") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert 'other is True' in out - assert 'hello, world! argument is: some arg' in out + assert "other is True" in out + assert "hello, world! argument is: some arg" in out def test_cmd_5(self, runtmp): # test exit code passthru with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'nifty', '--do-fail', 'some arg') + runtmp.sourmash("scripts", "nifty", "--do-fail", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -388,22 +407,23 @@ def test_cmd_5(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! argument is: some arg" in out class FakeCommandClass_Second(plugins.CommandLinePlugin): """ A fake CLI class. 
""" - command = 'more_nifty' + + command = "more_nifty" description = "do somethin' else nifty" def __init__(self, parser): super().__init__(parser) - parser.add_argument('arg1') - parser.add_argument('--other', action='store_true') - parser.add_argument('--do-fail', action='store_true') + parser.add_argument("arg1") + parser.add_argument("--other", action="store_true") + parser.add_argument("--do-fail", action="store_true") def main(self, args): super().main(args) @@ -419,6 +439,7 @@ class FakeCommandClass_Broken_1: """ A fake CLI class. """ + # command = 'more_nifty' # no command def __init__(self, parser): @@ -432,7 +453,8 @@ class FakeCommandClass_Broken_2: """ A fake CLI class. """ - command = 'broken' + + command = "broken" # no description def __init__(self, parser): @@ -448,18 +470,15 @@ def setup_method(self): _ = plugins.get_cli_script_plugins() self.saved_plugins = plugins._plugin_cli plugins._plugin_cli_once = False - plugins._plugin_cli = [FakeEntryPoint('test_command', - FakeCommandClass), - FakeEntryPoint('test_command2', - FakeCommandClass_Second), - FakeEntryPoint('test_command3', - FakeCommandClass_Broken_1), - FakeEntryPoint('test_command4', - FakeCommandClass_Broken_2), - FakeEntryPoint('error-on-import', - FakeCommandClass, - error_on_import=ModuleNotFoundError) - ] + plugins._plugin_cli = [ + FakeEntryPoint("test_command", FakeCommandClass), + FakeEntryPoint("test_command2", FakeCommandClass_Second), + FakeEntryPoint("test_command3", FakeCommandClass_Broken_1), + FakeEntryPoint("test_command4", FakeCommandClass_Broken_2), + FakeEntryPoint( + "error-on-import", FakeCommandClass, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_cli = self.saved_plugins @@ -467,7 +486,7 @@ def teardown_method(self): def test_cmd_0(self, runtmp): # test default output for a few plugins with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err @@ -481,7 +500,7 @@ def test_cmd_0(self, runtmp): def test_cmd_1(self, runtmp): # test 'nifty' - runtmp.sourmash('scripts', 'nifty', 'some arg') + runtmp.sourmash("scripts", "nifty", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -490,12 +509,12 @@ def test_cmd_1(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! argument is: some arg" in out def test_cmd_2(self, runtmp): # test 'more_nifty' - runtmp.sourmash('scripts', 'more_nifty', 'some arg') + runtmp.sourmash("scripts", "more_nifty", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -504,12 +523,12 @@ def test_cmd_2(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! 
argument is: some arg" in out def test_sourmash_info(self, runtmp): # test 'sourmash info -v' => shows the plugins - runtmp.sourmash('info', '-v') + runtmp.sourmash("info", "-v") out = runtmp.last_result.out err = runtmp.last_result.err diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index 7ab2d2c1dd..44c6b4aac5 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -25,29 +25,40 @@ def test_prefetch_basic(runtmp, linear_gather): c = runtmp # test a basic prefetch - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) err = c.last_result.err assert "loaded 5 total signatures from 3 locations." in err assert "after selecting signatures compatible with search, 3 remain." in err assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -55,15 +66,18 @@ def test_prefetch_select_query_ksize(runtmp, linear_gather): # test prefetch where query and subject db both have multiple ksizes c = runtmp - ss = utils.get_test_data('GCF_000005845.2_ASM584v2_genomic.fna.gz.sig') + ss = utils.get_test_data("GCF_000005845.2_ASM584v2_genomic.fna.gz.sig") - c.run_sourmash('prefetch', ss, ss, linear_gather) + c.run_sourmash("prefetch", ss, ss, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'of 4476 distinct query hashes, 4476 were found in matches above threshold.' in c.last_result.err + assert ( + "of 4476 distinct query hashes, 4476 were found in matches above threshold." 
+ in c.last_result.err + ) def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather): @@ -71,26 +85,39 @@ def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather): c = runtmp # make a query sketch with scaled=1000 - fa = utils.get_test_data('genome-s10.fa.gz') - c.run_sourmash('sketch', 'dna', fa, '-o', 'query.sig') - assert os.path.exists(runtmp.output('query.sig')) + fa = utils.get_test_data("genome-s10.fa.gz") + c.run_sourmash("sketch", "dna", fa, "-o", "query.sig") + assert os.path.exists(runtmp.output("query.sig")) # this has a scaled of 10000, from same genome: - against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - against2 = utils.get_test_data('scaled/all.sbt.zip') - against3 = utils.get_test_data('scaled/all.lca.json') + against1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + against2 = utils.get_test_data("scaled/all.sbt.zip") + against3 = utils.get_test_data("scaled/all.lca.json") # run against large scaled, then small (self) - c.run_sourmash('prefetch', 'query.sig', against1, against2, against3, - 'query.sig', linear_gather) + c.run_sourmash( + "prefetch", + "query.sig", + against1, + against2, + against3, + "query.sig", + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'total of 8 matching signatures.' in c.last_result.err - assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err - assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err + assert "total of 8 matching signatures." in c.last_result.err + assert ( + "of 48 distinct query hashes, 48 were found in matches above threshold." + in c.last_result.err + ) + assert ( + "final scaled value (max across query and all matches) is 10000" + in c.last_result.err + ) def test_prefetch_subject_scaled_is_larger_outsigs(runtmp, linear_gather): @@ -98,30 +125,45 @@ def test_prefetch_subject_scaled_is_larger_outsigs(runtmp, linear_gather): c = runtmp # make a query sketch with scaled=1000 - fa = utils.get_test_data('genome-s10.fa.gz') - c.run_sourmash('sketch', 'dna', fa, '-o', 'query.sig') - assert os.path.exists(runtmp.output('query.sig')) + fa = utils.get_test_data("genome-s10.fa.gz") + c.run_sourmash("sketch", "dna", fa, "-o", "query.sig") + assert os.path.exists(runtmp.output("query.sig")) # this has a scaled of 10000, from same genome: - against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - against2 = utils.get_test_data('scaled/all.sbt.zip') - against3 = utils.get_test_data('scaled/all.lca.json') + against1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + against2 = utils.get_test_data("scaled/all.sbt.zip") + against3 = utils.get_test_data("scaled/all.lca.json") # run against large scaled, then small (self) - c.run_sourmash('prefetch', 'query.sig', against1, against2, against3, - 'query.sig', linear_gather, '--save-matches', 'matches.sig') + c.run_sourmash( + "prefetch", + "query.sig", + against1, + against2, + against3, + "query.sig", + linear_gather, + "--save-matches", + "matches.sig", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'total of 8 matching signatures.' in c.last_result.err - assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err - assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err + assert "total of 8 matching signatures." 
in c.last_result.err + assert ( + "of 48 distinct query hashes, 48 were found in matches above threshold." + in c.last_result.err + ) + assert ( + "final scaled value (max across query and all matches) is 10000" + in c.last_result.err + ) # make sure non-downsampled sketches were saved. - matches = sourmash.load_file_as_signatures(runtmp.output('matches.sig')) - scaled_vals = set([ match.minhash.scaled for match in matches ]) + matches = sourmash.load_file_as_signatures(runtmp.output("matches.sig")) + scaled_vals = set([match.minhash.scaled for match in matches]) assert 1000 in scaled_vals assert 10000 in scaled_vals assert len(scaled_vals) == 2 @@ -131,25 +173,36 @@ def test_prefetch_query_abund(runtmp, linear_gather): c = runtmp # test a basic prefetch w/abund query - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -157,25 +210,36 @@ def test_prefetch_subj_abund(runtmp, linear_gather): c = runtmp # test a basic prefetch w/abund signature. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... 
(k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -183,14 +247,15 @@ def test_prefetch_csv_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv') + csvout = c.output("out.csv") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -199,25 +264,26 @@ def test_prefetch_csv_out(runtmp, linear_gather): assert os.path.exists(csvout) expected_intersect_bp = [2529000, 5177000] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_intersect_bp): + for row, expected in zip(r, expected_intersect_bp): print(row) - assert int(row['intersect_bp']) == expected + assert int(row["intersect_bp"]) == expected def test_prefetch_csv_gz_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output to a .gz file - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv.gz') + csvout = c.output("out.csv.gz") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -226,25 +292,35 @@ def test_prefetch_csv_gz_out(runtmp, linear_gather): assert os.path.exists(csvout) expected_intersect_bp = [2529000, 5177000] - with gzip.open(csvout, 'rt', newline="") as fp: + with gzip.open(csvout, "rt", newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_intersect_bp): + for row, expected in zip(r, expected_intersect_bp): print(row) - assert int(row['intersect_bp']) == expected + assert int(row["intersect_bp"]) == expected def test_prefetch_matches(runtmp, linear_gather): c = runtmp # test a basic prefetch, with --save-matches - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - matches_out = c.output('matches.sig') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + sig2 = utils.get_test_data("2.fa.sig") + sig47 = 
utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + matches_out = c.output("matches.sig") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -255,7 +331,7 @@ def test_prefetch_matches(runtmp, linear_gather): sigs = sourmash.load_file_as_index(matches_out) expected_matches = [sig63, sig47] - for (match, expected) in zip(sigs.signatures(), expected_matches): + for match, expected in zip(sigs.signatures(), expected_matches): ss = sourmash.load_one_signature(expected, ksize=31) assert match == ss @@ -264,16 +340,26 @@ def test_prefetch_matches_to_dir(runtmp, linear_gather): c = runtmp # test a basic prefetch, with --save-matches to a directory - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches_dir/') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches_dir/") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -296,16 +382,26 @@ def test_prefetch_matches_to_sig_gz(runtmp, linear_gather): import gzip # test a basic prefetch, with --save-matches to a sig.gz file - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches.sig.gz') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches.sig.gz") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -332,16 +428,26 @@ def test_prefetch_matches_to_zip(runtmp, linear_gather): # test a basic prefetch, with --save-matches to a zipfile import zipfile - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches.zip') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches.zip") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -367,14 +473,22 @@ def test_prefetch_matching_hashes(runtmp, linear_gather): c = runtmp # test a basic prefetch, with 
--save-matches - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - matches_out = c.output('matches.sig') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, - '--save-matching-hashes', matches_out, linear_gather) + utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + matches_out = c.output("matches.sig") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + "--save-matching-hashes", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -390,7 +504,7 @@ def test_prefetch_matching_hashes(runtmp, linear_gather): intersect.add_many(matches) ss = sourmash.load_one_signature(matches_out) - assert ss.name.endswith('-known') + assert ss.name.endswith("-known") assert ss.minhash == intersect @@ -398,14 +512,23 @@ def test_prefetch_nomatch_hashes(runtmp, linear_gather): c = runtmp # test a basic prefetch, with --save-matches - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - nomatch_out = c.output('unmatched_hashes.sig') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, - '--save-unmatched-hashes', nomatch_out, linear_gather) + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + nomatch_out = c.output("unmatched_hashes.sig") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + "--save-unmatched-hashes", + nomatch_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -420,7 +543,7 @@ def test_prefetch_nomatch_hashes(runtmp, linear_gather): remain.remove_many(ss63.minhash.hashes) ss = sourmash.load_one_signature(nomatch_out) - assert ss.name.endswith('-unknown') + assert ss.name.endswith("-unknown") assert ss.minhash == remain @@ -428,12 +551,11 @@ def test_prefetch_no_num_query(runtmp, linear_gather): c = runtmp # can't do prefetch with num signatures for query - sig47 = utils.get_test_data('num/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("num/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) @@ -446,50 +568,66 @@ def test_prefetch_no_num_subj(runtmp, linear_gather): c = runtmp # can't do prefetch with num signatures for query; no matches! - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('num/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("num/63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "ERROR in prefetch: after picklists and patterns, no signatures to search!?" in c.last_result.err + assert ( + "ERROR in prefetch: after picklists and patterns, no signatures to search!?" 
+ in c.last_result.err + ) def test_prefetch_db_fromfile(runtmp, linear_gather): c = runtmp # test a basic prefetch - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - from_file = c.output('from-list.txt') + from_file = c.output("from-list.txt") - with open(from_file, 'wt') as fp: + with open(from_file, "w") as fp: print(sig63, file=fp) print(sig2, file=fp) print(sig47, file=fp) - c.run_sourmash('prefetch', '-k', '31', sig47, linear_gather, - '--db-from-file', from_file) + c.run_sourmash( + "prefetch", "-k", "31", sig47, linear_gather, "--db-from-file", from_file + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." 
in c.last_result.err @@ -497,10 +635,10 @@ def test_prefetch_no_db(runtmp, linear_gather): c = runtmp # test a basic prefetch with no databases/signatures - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('prefetch', '-k', '31', sig47, linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -512,13 +650,23 @@ def test_prefetch_no_db(runtmp, linear_gather): def test_prefetch_check_scaled_bounds_negative(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '-5', linear_gather) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "-5", + linear_gather, + ) assert "ERROR: scaled value must be positive" in str(exc.value) @@ -526,41 +674,75 @@ def test_prefetch_check_scaled_bounds_negative(runtmp, linear_gather): def test_prefetch_check_scaled_bounds_less_than_minimum(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '50', linear_gather) - - assert "WARNING: scaled value should be >= 100. Continuing anyway." in str(exc.value) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "50", + linear_gather, + ) + + assert "WARNING: scaled value should be >= 100. Continuing anyway." in str( + exc.value + ) def test_prefetch_check_scaled_bounds_more_than_maximum(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '1e9', linear_gather) - - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in str(exc.value) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "1e9", + linear_gather, + ) + + assert "WARNING: scaled value should be <= 1e6. Continuing anyway." 
in str( + exc.value + ) def test_prefetch_downsample_scaled(runtmp, linear_gather): c = runtmp # test --scaled - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '1e5', linear_gather) + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "1e5", + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -569,21 +751,19 @@ def test_prefetch_downsample_scaled(runtmp, linear_gather): assert "downsampling query from scaled=1000 to 10000" in c.last_result.err - - def test_prefetch_downsample_multiple(runtmp, linear_gather): # test multiple different downsamplings in prefetch code - query_sig = utils.get_test_data('GCF_000006945.2-s500.sig') + query_sig = utils.get_test_data("GCF_000006945.2-s500.sig") # load in the hashes and do split them into four bins, randomly. ss = sourmash.load_one_signature(query_sig) hashes = list(ss.minhash.hashes) - random.seed(a=1) # fix seed so test is reproducible + random.seed(a=1) # fix seed so test is reproducible random.shuffle(hashes) # split into 4 bins: - mh_bins = [ ss.minhash.copy_and_clear() for i in range(4) ] + mh_bins = [ss.minhash.copy_and_clear() for i in range(4)] for i, hashval in enumerate(hashes): mh_bins[i % 4].add_hash(hashval) @@ -602,25 +782,38 @@ def test_prefetch_downsample_multiple(runtmp, linear_gather): gathersigs.append(f"bin{i}.sig") - runtmp.sourmash('prefetch', linear_gather, query_sig, *gathersigs) + runtmp.sourmash("prefetch", linear_gather, query_sig, *gathersigs) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "final scaled value (max across query and all matches) is 1000" in runtmp.last_result.err + assert ( + "final scaled value (max across query and all matches) is 1000" + in runtmp.last_result.err + ) def test_prefetch_empty(runtmp, linear_gather): c = runtmp # test --scaled - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '1e9', linear_gather) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "1e9", + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -633,13 +826,13 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather): c = runtmp # test what happens with many (and duplicate) signatures - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") manysigs = [sig63, sig2, sig47] * 5 - c.run_sourmash('prefetch', '-k', '31', sig47, *manysigs, linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, *manysigs, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -647,18 +840,22 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather): assert c.last_result.status == 0 
assert "total of 10 matching signatures so far." in c.last_result.err assert "total of 10 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err def test_prefetch_with_picklist(runtmp): # test 'sourmash prefetch' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "prefetch", metag_sig, *gcf_sigs, "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err print(err) @@ -670,18 +867,22 @@ def test_prefetch_with_picklist(runtmp): print(out) assert "total of 3 matching signatures." in err - assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 453 were found in matches above threshold." + in err + ) assert "a total of 1013 query hashes remain unmatched." in err def test_prefetch_with_picklist_exclude(runtmp): # test 'sourmash prefetch' with picklists, exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "prefetch", metag_sig, *gcf_sigs, "--picklist", f"{picklist}:md5:md5:exclude" + ) err = runtmp.last_result.err print(err) @@ -692,17 +893,19 @@ def test_prefetch_with_picklist_exclude(runtmp): print(out) assert "total of 9 matching signatures." in err - assert "of 1466 distinct query hashes, 1013 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1013 were found in matches above threshold." + in err + ) assert "a total of 453 query hashes remain unmatched." in err def test_prefetch_with_pattern_include(runtmp): # test 'sourmash prefetch' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--include', 'thermotoga') + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "--include", "thermotoga") err = runtmp.last_result.err print(err) @@ -711,17 +914,19 @@ def test_prefetch_with_pattern_include(runtmp): print(out) assert "total of 3 matching signatures." in err - assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 453 were found in matches above threshold." + in err + ) assert "a total of 1013 query hashes remain unmatched." 
in err def test_prefetch_with_pattern_exclude(runtmp): # test 'sourmash prefetch' with --exclude-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--exclude', 'thermotoga') + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "--exclude", "thermotoga") err = runtmp.last_result.err print(err) @@ -730,27 +935,37 @@ def test_prefetch_with_pattern_exclude(runtmp): print(out) assert "total of 9 matching signatures." in err - assert "of 1466 distinct query hashes, 1013 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1013 were found in matches above threshold." + in err + ) assert "a total of 453 query hashes remain unmatched." in err def test_prefetch_output_with_abundance(runtmp, prefetch_gather, linear_gather): c = runtmp - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig') - - c.run_sourmash('prefetch', linear_gather, query, against, - '--save-matching-hashes', c.output('match-hash.sig'), - '--save-unmatched-hashes', c.output('nomatch-hash.sig')) + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig") + + c.run_sourmash( + "prefetch", + linear_gather, + query, + against, + "--save-matching-hashes", + c.output("match-hash.sig"), + "--save-unmatched-hashes", + c.output("nomatch-hash.sig"), + ) print(c.last_result.out) - assert os.path.exists(c.output('match-hash.sig')) - ss = list(sourmash.load_file_as_signatures(c.output('match-hash.sig')))[0] + assert os.path.exists(c.output("match-hash.sig")) + ss = list(sourmash.load_file_as_signatures(c.output("match-hash.sig")))[0] assert ss.minhash.track_abundance - assert os.path.exists(c.output('nomatch-hash.sig')) - ss = list(sourmash.load_file_as_signatures(c.output('nomatch-hash.sig')))[0] + assert os.path.exists(c.output("nomatch-hash.sig")) + ss = list(sourmash.load_file_as_signatures(c.output("nomatch-hash.sig")))[0] assert ss.minhash.track_abundance @@ -758,14 +973,15 @@ def test_prefetch_ani_csv_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv') + csvout = c.output("out.csv") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -774,37 +990,56 @@ def test_prefetch_ani_csv_out(runtmp, linear_gather): assert os.path.exists(csvout) prefetch_result_names = PrefetchResult.prefetch_write_cols - exp1 = {'q_ani': '0.9771552502238963','m_ani': '0.9767860811200507', - 'ac_ani': '0.9769706656719734','mc_ani': '0.9771552502238963', - 'pfn': 'False'} - exp2 = {'q_ani': '1.0','m_ani': '1.0', - 'ac_ani': '1.0','mc_ani': '1.0', - 'pfn': 'False'} + exp1 = { + "q_ani": "0.9771552502238963", + "m_ani": "0.9767860811200507", + "ac_ani": "0.9769706656719734", + "mc_ani": 
"0.9771552502238963", + "pfn": "False", + } + exp2 = { + "q_ani": "1.0", + "m_ani": "1.0", + "ac_ani": "1.0", + "mc_ani": "1.0", + "pfn": "False", + } expected_ani_vals = [exp1, exp2] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_ani_vals): + for row, expected in zip(r, expected_ani_vals): print(row) assert prefetch_result_names == list(row.keys()) - assert approx_eq(row['query_containment_ani'], expected['q_ani']) - assert approx_eq(row['match_containment_ani'], expected['m_ani']) - assert approx_eq(row['max_containment_ani'], expected['mc_ani']) - assert approx_eq(row['average_containment_ani'], expected['ac_ani']) - assert row['potential_false_negative'] == expected['pfn'] + assert approx_eq(row["query_containment_ani"], expected["q_ani"]) + assert approx_eq(row["match_containment_ani"], expected["m_ani"]) + assert approx_eq(row["max_containment_ani"], expected["mc_ani"]) + assert approx_eq(row["average_containment_ani"], expected["ac_ani"]) + assert row["potential_false_negative"] == expected["pfn"] def test_prefetch_ani_csv_out_estimate_ci(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - csvout = c.output('out.csv') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather, '--estimate-ani-ci') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + csvout = c.output("out.csv") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "-o", + csvout, + linear_gather, + "--estimate-ani-ci", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -813,54 +1048,64 @@ def test_prefetch_ani_csv_out_estimate_ci(runtmp, linear_gather): assert os.path.exists(csvout) prefetch_result_names_ci = PrefetchResult.prefetch_write_cols_ci - exp1 = {'q_ani': '0.9771552502238963','m_ani': '0.9767860811200507', - 'q_ani_low': "0.9762537506990911", 'q_ani_high': "0.9780336875157754", - 'm_ani_low': "0.9758801604653301", "m_ani_high": "0.9776692390768575", - 'ac_ani': '0.9769706656719734','mc_ani': '0.9771552502238963', - 'pfn': 'False'} - exp2 = {'q_ani': '1.0','m_ani': '1.0', - 'q_ani_low': "1.0", 'q_ani_high': "1.0", - 'm_ani_low': "1.0", "m_ani_high": "1.0", - 'ac_ani': '1.0','mc_ani': '1.0', - 'pfn': 'False'} + exp1 = { + "q_ani": "0.9771552502238963", + "m_ani": "0.9767860811200507", + "q_ani_low": "0.9762537506990911", + "q_ani_high": "0.9780336875157754", + "m_ani_low": "0.9758801604653301", + "m_ani_high": "0.9776692390768575", + "ac_ani": "0.9769706656719734", + "mc_ani": "0.9771552502238963", + "pfn": "False", + } + exp2 = { + "q_ani": "1.0", + "m_ani": "1.0", + "q_ani_low": "1.0", + "q_ani_high": "1.0", + "m_ani_low": "1.0", + "m_ani_high": "1.0", + "ac_ani": "1.0", + "mc_ani": "1.0", + "pfn": "False", + } expected_ani_vals = [exp1, exp2] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_ani_vals): + for row, expected in zip(r, expected_ani_vals): print(row) assert prefetch_result_names_ci == list(row.keys()) - assert approx_eq(row['query_containment_ani'],expected['q_ani']) - assert approx_eq(row['query_containment_ani_low'], expected['q_ani_low']) - assert 
approx_eq(row['query_containment_ani_high'], expected['q_ani_high']) - assert approx_eq(row['match_containment_ani'], expected['m_ani']) - assert approx_eq(row['match_containment_ani_low'], expected['m_ani_low']) - assert approx_eq(row['match_containment_ani_high'], expected['m_ani_high']) - assert approx_eq(row['max_containment_ani'], expected['mc_ani']) - assert approx_eq(row['average_containment_ani'], expected['ac_ani']) - assert row['potential_false_negative'] == expected['pfn'] + assert approx_eq(row["query_containment_ani"], expected["q_ani"]) + assert approx_eq(row["query_containment_ani_low"], expected["q_ani_low"]) + assert approx_eq(row["query_containment_ani_high"], expected["q_ani_high"]) + assert approx_eq(row["match_containment_ani"], expected["m_ani"]) + assert approx_eq(row["match_containment_ani_low"], expected["m_ani_low"]) + assert approx_eq(row["match_containment_ani_high"], expected["m_ani_high"]) + assert approx_eq(row["max_containment_ani"], expected["mc_ani"]) + assert approx_eq(row["average_containment_ani"], expected["ac_ani"]) + assert row["potential_false_negative"] == expected["pfn"] def test_prefetch_ani_containment_asymmetry(runtmp): # test contained_by asymmetries, viz #2215 - query_sig = utils.get_test_data('47.fa.sig') - merged_sig = utils.get_test_data('47-63-merge.sig') + query_sig = utils.get_test_data("47.fa.sig") + merged_sig = utils.get_test_data("47-63-merge.sig") - runtmp.sourmash('prefetch', query_sig, merged_sig, '-o', - 'query-in-merged.csv') - runtmp.sourmash('prefetch', merged_sig, query_sig, '-o', - 'merged-in-query.csv') + runtmp.sourmash("prefetch", query_sig, merged_sig, "-o", "query-in-merged.csv") + runtmp.sourmash("prefetch", merged_sig, query_sig, "-o", "merged-in-query.csv") - with sourmash_args.FileInputCSV(runtmp.output('query-in-merged.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("query-in-merged.csv")) as r: query_in_merged = list(r)[0] - with sourmash_args.FileInputCSV(runtmp.output('merged-in-query.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("merged-in-query.csv")) as r: merged_in_query = list(r)[0] - assert query_in_merged['query_containment_ani'] == '1.0' - assert query_in_merged['match_containment_ani'] == '0.9865155060423993' - assert query_in_merged['average_containment_ani'] == '0.9932577530211997' + assert query_in_merged["query_containment_ani"] == "1.0" + assert query_in_merged["match_containment_ani"] == "0.9865155060423993" + assert query_in_merged["average_containment_ani"] == "0.9932577530211997" - assert merged_in_query['match_containment_ani'] == '1.0' - assert merged_in_query['query_containment_ani'] == '0.9865155060423993' - assert merged_in_query['average_containment_ani'] == '0.9932577530211997' + assert merged_in_query["match_containment_ani"] == "1.0" + assert merged_in_query["query_containment_ani"] == "0.9865155060423993" + assert merged_in_query["average_containment_ani"] == "0.9932577530211997" diff --git a/tests/test_sbt.py b/tests/test_sbt.py index a66d0c634e..cfc71d43dd 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -6,13 +6,11 @@ import pytest import sourmash -from sourmash import (load_one_signature, SourmashSignature, - load_file_as_signatures) +from sourmash import load_one_signature, SourmashSignature, load_file_as_signatures from sourmash.exceptions import IndexNotSupported from sourmash.sbt import SBT, GraphFactory, Leaf, Node -from sourmash.sbtmh import (SigLeaf, load_sbt_index) -from sourmash.sbt_storage import (FSStorage, RedisStorage, - 
IPFSStorage, ZipStorage) +from sourmash.sbtmh import SigLeaf, load_sbt_index +from sourmash.sbt_storage import FSStorage, RedisStorage, IPFSStorage, ZipStorage from sourmash.search import make_jaccard_search_query from sourmash.picklist import SignaturePicklist, PickStyle @@ -24,29 +22,29 @@ def test_simple(runtmp, n_children): root = SBT(factory, d=n_children) leaf1 = Leaf("a", factory()) - leaf1.data.count('AAAAA') - leaf1.data.count('AAAAT') - leaf1.data.count('AAAAC') + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + leaf1.data.count("AAAAC") leaf2 = Leaf("b", factory()) - leaf2.data.count('AAAAA') - leaf2.data.count('AAAAT') - leaf2.data.count('AAAAG') + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") leaf3 = Leaf("c", factory()) - leaf3.data.count('AAAAA') - leaf3.data.count('AAAAT') - leaf3.data.count('CAAAA') + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") leaf4 = Leaf("d", factory()) - leaf4.data.count('AAAAA') - leaf4.data.count('CAAAA') - leaf4.data.count('GAAAA') + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") leaf5 = Leaf("e", factory()) - leaf5.data.count('AAAAA') - leaf5.data.count('AAAAT') - leaf5.data.count('GAAAA') + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") root.add_node(leaf1) root.add_node(leaf2) @@ -58,8 +56,8 @@ def test_simple(runtmp, n_children): def search_kmer(leaf, kmer): return leaf.data.get(kmer) - leaves = [leaf1, leaf2, leaf3, leaf4, leaf5 ] - kmers = [ "AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA" ] + leaves = [leaf1, leaf2, leaf3, leaf4, leaf5] + kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"] # define an exhaustive search function that looks in all the leaf nodes. 
 def search_kmer_in_list(kmer):
@@ -75,20 +73,20 @@ def search_kmer_in_list(kmer):
     for kmer in kmers:
         assert set(root._find_nodes(search_kmer, kmer)) == search_kmer_in_list(kmer)

-    print('-----')
-    print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAA") ])
-    print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAT") ])
-    print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAG") ])
-    print([ x.metadata for x in root._find_nodes(search_kmer, "CAAAA") ])
-    print([ x.metadata for x in root._find_nodes(search_kmer, "GAAAA") ])
+    print("-----")
+    print([x.metadata for x in root._find_nodes(search_kmer, "AAAAA")])
+    print([x.metadata for x in root._find_nodes(search_kmer, "AAAAT")])
+    print([x.metadata for x in root._find_nodes(search_kmer, "AAAAG")])
+    print([x.metadata for x in root._find_nodes(search_kmer, "CAAAA")])
+    print([x.metadata for x in root._find_nodes(search_kmer, "GAAAA")])

     # save SBT to a directory and then reload
-    root.save(runtmp.output('demo'))
-    root = SBT.load(runtmp.output('demo'))
+    root.save(runtmp.output("demo"))
+    root = SBT.load(runtmp.output("demo"))

     for kmer in kmers:
         new_result = {str(r) for r in root._find_nodes(search_kmer, kmer)}
-        print(*new_result, sep='\n')
+        print(*new_result, sep="\n")

         assert new_result == {str(r) for r in search_kmer_in_list(kmer)}

@@ -99,29 +97,29 @@ def test_longer_search(n_children):
     root = SBT(factory, d=n_children)

     leaf1 = Leaf("a", factory())
-    leaf1.data.count('AAAAA')
-    leaf1.data.count('AAAAT')
-    leaf1.data.count('AAAAC')
+    leaf1.data.count("AAAAA")
+    leaf1.data.count("AAAAT")
+    leaf1.data.count("AAAAC")

     leaf2 = Leaf("b", factory())
-    leaf2.data.count('AAAAA')
-    leaf2.data.count('AAAAT')
-    leaf2.data.count('AAAAG')
+    leaf2.data.count("AAAAA")
+    leaf2.data.count("AAAAT")
+    leaf2.data.count("AAAAG")

     leaf3 = Leaf("c", factory())
-    leaf3.data.count('AAAAA')
-    leaf3.data.count('AAAAT')
-    leaf3.data.count('CAAAA')
+    leaf3.data.count("AAAAA")
+    leaf3.data.count("AAAAT")
+    leaf3.data.count("CAAAA")

     leaf4 = Leaf("d", factory())
-    leaf4.data.count('AAAAA')
-    leaf4.data.count('CAAAA')
-    leaf4.data.count('GAAAA')
+    leaf4.data.count("AAAAA")
+    leaf4.data.count("CAAAA")
+    leaf4.data.count("GAAAA")

     leaf5 = Leaf("e", factory())
-    leaf5.data.count('AAAAA')
-    leaf5.data.count('AAAAT')
-    leaf5.data.count('GAAAA')
+    leaf5.data.count("AAAAA")
+    leaf5.data.count("AAAAT")
+    leaf5.data.count("GAAAA")

     root.add_node(leaf1)
     root.add_node(leaf2)
@@ -131,32 +129,32 @@ def test_longer_search(n_children):

     def kmers(k, seq):
         for start in range(len(seq) - k + 1):
-            yield seq[start:start + k]
+            yield seq[start : start + k]

     def search_transcript(node, seq, threshold):
-        presence = [ node.data.get(kmer) for kmer in kmers(ksize, seq) ]
+        presence = [node.data.get(kmer) for kmer in kmers(ksize, seq)]
         if sum(presence) >= int(threshold * (len(seq) - ksize + 1)):
             return 1
         return 0

-    try1 = [ x.metadata for x in root._find_nodes(search_transcript, "AAAAT", 1.0) ]
-    assert set(try1) == set([ 'a', 'b', 'c', 'e' ]), try1  # no 'd'
+    try1 = [x.metadata for x in root._find_nodes(search_transcript, "AAAAT", 1.0)]
+    assert set(try1) == set(["a", "b", "c", "e"]), try1  # no 'd'

-    try2 = [ x.metadata for x in root._find_nodes(search_transcript, "GAAAAAT", 0.6) ]
-    assert set(try2) == set([ 'a', 'b', 'c', 'd', 'e' ])
+    try2 = [x.metadata for x in root._find_nodes(search_transcript, "GAAAAAT", 0.6)]
+    assert set(try2) == set(["a", "b", "c", "d", "e"])

-    try3 = [ x.metadata for x in root._find_nodes(search_transcript, "GAAAA", 1.0) ]
-    assert set(try3) == set([ 'd', 'e' ]), try3
+    try3 = [x.metadata for x in root._find_nodes(search_transcript, "GAAAA", 1.0)]
+    assert set(try3) == set(["d", "e"]), try3


-#@pytest.mark.parametrize("old_version", ["v1", "v2", "v3", "v4", "v5"])
+# @pytest.mark.parametrize("old_version", ["v1", "v2", "v3", "v4", "v5"])
 @pytest.mark.parametrize("old_version", ["v3", "v4", "v5"])
 def test_tree_old_load(old_version):
-    tree_old = SBT.load(utils.get_test_data('{}.sbt.json'.format(old_version)),
-                        leaf_loader=SigLeaf.load)
+    tree_old = SBT.load(
+        utils.get_test_data(f"{old_version}.sbt.json"), leaf_loader=SigLeaf.load
+    )

-    tree_cur = SBT.load(utils.get_test_data('v6.sbt.json'),
-                        leaf_loader=SigLeaf.load)
+    tree_cur = SBT.load(utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load)

     testdata1 = utils.get_test_data(utils.SIG_FILES[0])
     to_search = load_one_signature(testdata1)
@@ -177,8 +175,8 @@ def test_tree_old_load(old_version):


 def test_load_future(tmpdir):
-    with open(str(tmpdir.join("v9999.sbt.json")), 'w') as f:
-        json.dump({'version': 9999}, f)
+    with open(str(tmpdir.join("v9999.sbt.json")), "w") as f:
+        json.dump({"version": 9999}, f)

     with pytest.raises(IndexNotSupported) as excinfo:
         SBT.load(str(tmpdir.join("v9999.sbt.json")))
@@ -196,21 +194,20 @@ def test_tree_save_load(runtmp, n_children):
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

-    tree.save(runtmp.output('demo'))
-    tree = SBT.load(runtmp.output('demo'),
-                    leaf_loader=SigLeaf.load)
+    tree.save(runtmp.output("demo"))
+    tree = SBT.load(runtmp.output("demo"), leaf_loader=SigLeaf.load)

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*new_result, sep='\n')
+    print(*new_result, sep="\n")

     assert old_result == new_result

@@ -219,7 +216,6 @@ def test_search_minhashes():
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory)

-    n_leaves = 0
     for f in utils.SIG_FILES:
         sig = load_one_signature(utils.get_test_data(f))
         leaf = SigLeaf(os.path.basename(f), sig)
@@ -257,12 +253,12 @@ def test_binary_nary_tree():
     assert all([len(list(t.leaves())) == n_leaves for t in trees.values()])

     results = {}
-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     for d, tree in trees.items():
         search_obj = make_jaccard_search_query(threshold=0.1)
         results[d] = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*results[2], sep='\n')
+    print(*results[2], sep="\n")

     assert results[2] == results[5]
     assert results[5] == results[10]
@@ -327,26 +323,26 @@ def test_sbt_fsstorage(runtmp):
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

-    with FSStorage(runtmp.location, '.fstree') as storage:
-        tree.save(runtmp.output('tree.sbt.json'), storage=storage)
+    with FSStorage(runtmp.location, ".fstree") as storage:
+        tree.save(runtmp.output("tree.sbt.json"), storage=storage)

-    tree = SBT.load(runtmp.output('tree.sbt.json'), leaf_loader=SigLeaf.load)
-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    tree = SBT.load(runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load)
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*new_result, sep='\n')
+    print(*new_result, sep="\n")

     assert old_result == new_result

     assert os.path.exists(runtmp.output(tree.storage.subdir))
-    assert os.path.exists(runtmp.output('.fstree'))
+    assert os.path.exists(runtmp.output(".fstree"))


 def test_sbt_zipstorage(tmpdir):
@@ -361,31 +357,31 @@
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

     with ZipStorage(str(tmpdir.join("tree.sbt.zip")), mode="w") as storage:
         tree.save(str(tmpdir.join("tree.sbt.json")), storage=storage)

     with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage:
-        tree = SBT.load(str(tmpdir.join("tree.sbt.json")),
-                        leaf_loader=SigLeaf.load,
-                        storage=storage)
+        tree = SBT.load(
+            str(tmpdir.join("tree.sbt.json")), leaf_loader=SigLeaf.load, storage=storage
+        )

-        print('*' * 60)
-        print("{}:".format(to_search.metadata))
+        print("*" * 60)
+        print(f"{to_search.metadata}:")
         search_obj = make_jaccard_search_query(threshold=0.1)
         new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-        print(*new_result, sep='\n')
+        print(*new_result, sep="\n")

         assert old_result == new_result


 def test_sbt_ipfsstorage(runtmp):
-    ipfshttpclient = pytest.importorskip('ipfshttpclient')
+    ipfshttpclient = pytest.importorskip("ipfshttpclient")

     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory)
@@ -397,34 +393,34 @@ def test_sbt_ipfsstorage(runtmp):
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

     try:
         with IPFSStorage() as storage:
-            tree.save(runtmp.output('tree.sbt.json'), storage=storage)
+            tree.save(runtmp.output("tree.sbt.json"), storage=storage)
     except ipfshttpclient.exceptions.ConnectionError:
         pytest.xfail("ipfs not installed/functioning probably")

     with IPFSStorage() as storage:
-        tree = SBT.load(runtmp.output('tree.sbt.json'),
-                        leaf_loader=SigLeaf.load,
-                        storage=storage)
+        tree = SBT.load(
+            runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load, storage=storage
+        )

-        print('*' * 60)
-        print("{}:".format(to_search.metadata))
+        print("*" * 60)
+        print(f"{to_search.metadata}:")
         search_obj = make_jaccard_search_query(threshold=0.1)
         new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-        print(*new_result, sep='\n')
+        print(*new_result, sep="\n")

         assert old_result == new_result


 def test_sbt_redisstorage(runtmp):
-    redis = pytest.importorskip('redis')
+    redis = pytest.importorskip("redis")
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory)
@@ -435,28 +431,28 @@ def test_sbt_redisstorage(runtmp):
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

     try:
         with RedisStorage() as storage:
-            tree.save(runtmp.output('tree.sbt.json'), storage=storage)
+            tree.save(runtmp.output("tree.sbt.json"), storage=storage)
     except redis.exceptions.ConnectionError:
         pytest.xfail("Couldn't connect to redis server")

     with RedisStorage() as storage:
-        tree = SBT.load(runtmp.output('tree.sbt.json'),
-                        leaf_loader=SigLeaf.load,
-                        storage=storage)
+        tree = SBT.load(
+            runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load, storage=storage
+        )

-        print('*' * 60)
-        print("{}:".format(to_search.metadata))
+        print("*" * 60)
+        print(f"{to_search.metadata}:")
         search_obj = make_jaccard_search_query(threshold=0.1)
         new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-        print(*new_result, sep='\n')
+        print(*new_result, sep="\n")

         assert old_result == new_result
@@ -475,12 +471,12 @@ def test_save_zip(tmpdir):
     new_tree = SBT.load(str(newsbt), leaf_loader=SigLeaf.load)
     assert isinstance(new_tree.storage, ZipStorage)
-    assert new_tree.storage.list_sbts() == ['new.sbt.json']
+    assert new_tree.storage.list_sbts() == ["new.sbt.json"]

     to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))

     print("*" * 60)
-    print("{}:".format(to_search))
+    print(f"{to_search}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search)}
     new_result = {str(s.signature) for s in new_tree.find(search_obj, to_search)}
@@ -502,7 +498,7 @@ def test_load_zip(tmpdir):
     to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))

     print("*" * 60)
-    print("{}:".format(to_search))
+    print(f"{to_search}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     new_result = {str(s.signature) for s in tree.find(search_obj, to_search)}
     print(*new_result, sep="\n")
@@ -516,7 +512,7 @@ def test_load_zip_uncompressed(tmpdir):
     testdata = utils.get_test_data("v6.sbt.zip")
     testsbt = tmpdir.join("v6.sbt.json")

-    with zipfile.ZipFile(testdata, 'r') as z:
+    with zipfile.ZipFile(testdata, "r") as z:
         z.extractall(str(tmpdir))

     tree = SBT.load(str(testsbt), leaf_loader=SigLeaf.load)
@@ -524,7 +520,7 @@ def test_load_zip_uncompressed(tmpdir):
     to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))

     print("*" * 60)
-    print("{}:".format(to_search))
+    print(f"{to_search}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     new_result = {str(s.signature) for s in tree.find(search_obj, to_search)}
     print(*new_result, sep="\n")
@@ -532,11 +528,11 @@ def test_load_zip_uncompressed(tmpdir):

 def test_tree_repair():
-    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
-                           leaf_loader=SigLeaf.load)
+    tree_repair = SBT.load(
+        utils.get_test_data("leaves.sbt.json"), leaf_loader=SigLeaf.load
+    )

-    tree_cur = SBT.load(utils.get_test_data('v3.sbt.json'),
-                        leaf_loader=SigLeaf.load)
+    tree_cur = SBT.load(utils.get_test_data("v3.sbt.json"), leaf_loader=SigLeaf.load)

     testdata1 = utils.get_test_data(utils.SIG_FILES[0])
     to_search = load_one_signature(testdata1)
@@ -550,8 +546,9 @@ def test_tree_repair():

 def test_tree_repair_insert():
-    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
-                           leaf_loader=SigLeaf.load)
+    tree_repair = SBT.load(
+        utils.get_test_data("leaves.sbt.json"), leaf_loader=SigLeaf.load
+    )

     for f in utils.SIG_FILES:
         sig = load_one_signature(utils.get_test_data(f))
@@ -579,23 +576,23 @@ def test_save_sparseness(runtmp, n_children):
         tree.add_node(leaf)
         to_search = leaf

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
     search_obj = make_jaccard_search_query(threshold=0.1)
     old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)}
-    print(*old_result, sep='\n')
+    print(*old_result, sep="\n")

-    tree.save(runtmp.output('demo'), sparseness=1.0)
-    tree_loaded = SBT.load(runtmp.output('demo'),
-                           leaf_loader=SigLeaf.load)
+    tree.save(runtmp.output("demo"), sparseness=1.0)
+    tree_loaded = SBT.load(runtmp.output("demo"), leaf_loader=SigLeaf.load)
     assert all(not isinstance(n, Node) for _, n in tree_loaded)

-    print('*' * 60)
-    print("{}:".format(to_search.metadata))
-    new_result = {str(s.signature) for s in tree_loaded.find(search_obj,
-                                                             to_search.data)}
-    print(*new_result, sep='\n')
+    print("*" * 60)
+    print(f"{to_search.metadata}:")
+    new_result = {
+        str(s.signature) for s in tree_loaded.find(search_obj, to_search.data)
+    }
+    print(*new_result, sep="\n")

     assert old_result == new_result

@@ -615,8 +612,8 @@ def test_sbt_as_index_select():
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)
@@ -624,7 +621,7 @@
     xx = tree.select(ksize=31)
     assert xx == tree

-    xx = tree.select(moltype='DNA')
+    xx = tree.select(moltype="DNA")
     assert xx == tree

     xx = tree.select(abund=False)
@@ -634,7 +631,7 @@
         tree.select(ksize=21)

     with pytest.raises(ValueError):
-        tree.select(moltype='protein')
+        tree.select(moltype="protein")

     with pytest.raises(ValueError):
         tree.select(abund=True)
@@ -646,15 +643,15 @@ def test_sbt_as_index_select_picklist():
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)

     # construct a picklist...
-    picklist = SignaturePicklist('md5prefix8')
-    picklist.init(['09a08691'])
+    picklist = SignaturePicklist("md5prefix8")
+    picklist.init(["09a08691"])

     # select on picklist
     tree = tree.select(picklist=picklist)
@@ -663,7 +660,7 @@

     ss = siglist[0]
     assert ss.minhash.ksize == 31
-    assert ss.md5sum().startswith('09a08691c')
+    assert ss.md5sum().startswith("09a08691c")


 def test_sbt_as_index_select_picklist_exclude():
@@ -672,15 +669,15 @@
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)

     # construct a picklist...
-    picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
-    picklist.init(['09a08691'])
+    picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE)
+    picklist.init(["09a08691"])

     # select on picklist
     tree = tree.select(picklist=picklist)
@@ -689,7 +686,7 @@

     ss = siglist[0]
     assert ss.minhash.ksize == 31
-    assert ss.md5sum().startswith('38729c637')
+    assert ss.md5sum().startswith("38729c637")


 def test_sbt_as_index_find_picklist():
@@ -698,15 +695,15 @@
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)

     # construct a picklist...
-    picklist = SignaturePicklist('md5prefix8')
-    picklist.init(['09a08691'])
+    picklist = SignaturePicklist("md5prefix8")
+    picklist.init(["09a08691"])

     # run a 'find' with sig63, should find 47 and 63 both.
     search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
@@ -723,7 +720,7 @@
     # and check that it is the expected one!
     ss = results[0].signature
     assert ss.minhash.ksize == 31
-    assert ss.md5sum().startswith('09a08691c')
+    assert ss.md5sum().startswith("09a08691c")


 def test_sbt_as_index_find_picklist_exclude():
@@ -732,15 +729,15 @@
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)

     # construct a picklist...
-    picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
-    picklist.init(['09a08691'])
+    picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE)
+    picklist.init(["09a08691"])

     # run a 'find' with sig63, should find 47 and 63 both.
     search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
@@ -757,7 +754,7 @@
     # and check that it is the expected one!
     ss = results[0].signature
     assert ss.minhash.ksize == 31
-    assert ss.md5sum().startswith('38729c637')
+    assert ss.md5sum().startswith("38729c637")


 def test_sbt_as_index_find_picklist_twice():
@@ -766,15 +763,15 @@
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)

     # construct a picklist...
-    picklist = SignaturePicklist('md5prefix8')
-    picklist.init(['09a08691'])
+    picklist = SignaturePicklist("md5prefix8")
+    picklist.init(["09a08691"])

     # run a 'find' with sig63, should find 47 and 63 both.
     search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
@@ -787,7 +784,9 @@ def test_sbt_as_index_find_picklist_twice():
     with pytest.raises(ValueError):
         tree = tree.select(picklist=picklist)

-    assert "we do not (yet) support multiple picklists for SBT databases" in str(exc)
+    assert "we do not (yet) support multiple picklists for SBT databases" in str(
+        exc
+    )


 def test_sbt_as_index_signatures():
@@ -795,8 +794,8 @@
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"))
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"))

     tree.insert(sig47)
     tree.insert(sig63)
@@ -813,9 +812,9 @@ def test_sbt_gather_threshold_1():
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)
+    sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31)
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31)
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31)

     tree.insert(sig47)
     tree.insert(sig63)
@@ -861,7 +860,7 @@ def test_sbt_gather_threshold_1():
     assert name is None

     # check with a too-high threshold -> should be no results.
-    print('len mh', len(new_mh))
+    print("len mh", len(new_mh))
     with pytest.raises(ValueError):
         tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000)
@@ -871,9 +870,9 @@ def test_sbt_gather_threshold_5():
     factory = GraphFactory(31, 1e5, 4)
     tree = SBT(factory, d=2)

-    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
-    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
-    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)
+    sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31)
+    sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31)
+    sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31)

     tree.insert(sig47)
     tree.insert(sig63)
@@ -902,7 +901,7 @@ def test_sbt_gather_threshold_5():
     assert name is None

     # now, check with a threshold_bp that should be meet-able.
-    results = tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000)
+    result = tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000)
     assert result
     containment, match_sig, name = result
     assert containment == 1.0
@@ -913,9 +912,9 @@ def test_sbt_gather_threshold_5():

 @utils.in_tempdir
 def test_gather_single_return(c):
     # test gather() number of returns
-    sig2file = utils.get_test_data('2.fa.sig')
-    sig47file = utils.get_test_data('47.fa.sig')
-    sig63file = utils.get_test_data('63.fa.sig')
+    sig2file = utils.get_test_data("2.fa.sig")
+    sig47file = utils.get_test_data("47.fa.sig")
+    sig63file = utils.get_test_data("63.fa.sig")

     sig2 = load_one_signature(sig2file, ksize=31)
     sig47 = load_one_signature(sig47file, ksize=31)
@@ -953,10 +952,10 @@ def test_sbt_jaccard_ordering(runtmp):
     def _intersect(x, y):
         return x.intersection_and_union_size(y)[0]

-    print('a intersect b:', _intersect(a, b))
-    print('a intersect c:', _intersect(a, c))
-    print('a jaccard b:', a.jaccard(b))
-    print('a jaccard c:', a.jaccard(c))
+    print("a intersect b:", _intersect(a, b))
+    print("a intersect c:", _intersect(a, c))
+    print("a jaccard b:", a.jaccard(b))
+    print("a jaccard c:", a.jaccard(c))

     assert _intersect(a, b) > _intersect(a, c)
     assert a.jaccard(b) < a.jaccard(c)
@@ -965,9 +964,9 @@ def _intersect(x, y):
     assert a.jaccard(c) > 0.15

     # now - make signatures, try out :)
-    ss_a = sourmash.SourmashSignature(a, name='A')
-    ss_b = sourmash.SourmashSignature(b, name='B')
-    ss_c = sourmash.SourmashSignature(c, name='C')
+    ss_a = sourmash.SourmashSignature(a, name="A")
+    ss_b = sourmash.SourmashSignature(b, name="B")
+    ss_c = sourmash.SourmashSignature(c, name="C")

     factory = GraphFactory(31, 1e5, 4)
     db = SBT(factory, d=2)
@@ -988,16 +987,21 @@ def test_sbt_protein_command_index(runtmp):
     c = runtmp
     # test command-line creation of SBT database with protein sigs
-    sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
+    sigfile1 = utils.get_test_data(
+        "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    sigfile2 = utils.get_test_data(
+        "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig"
+    )

-    db_out = c.output('protein.sbt.zip')
+    db_out = c.output("protein.sbt.zip")

-    c.run_sourmash('index', db_out, sigfile1, sigfile2,
-                   '--scaled', '100', '-k', '19', '--protein')
+    c.run_sourmash(
+        "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--protein"
+    )

     # check to make sure .sbt.protein directory doesn't get created
-    assert not os.path.exists(c.output('.sbt.protein'))
+    assert not os.path.exists(c.output(".sbt.protein"))

     db2 = load_sbt_index(db_out)
@@ -1005,14 +1009,19 @@ def test_sbt_protein_command_index(runtmp):
     sig2 = sourmash.load_one_signature(sigfile2)

     # check reconstruction --
-    mh_list = [ x.minhash for x in db2.signatures() ]
+    mh_list = [x.minhash for x in db2.signatures()]
     assert len(mh_list) == 2
     assert sig1.minhash in mh_list
     assert sig2.minhash in mh_list

     # and search, gather
-    results = db2.search(sig1, threshold=0.0, ignore_abundance=True,
-                         do_containment=False, best_only=False)
+    results = db2.search(
+        sig1,
+        threshold=0.0,
+        ignore_abundance=True,
+        do_containment=False,
+        best_only=False,
+    )
     assert len(results) == 2

     result = db2.best_containment(sig2)
@@ -1024,13 +1033,18 @@ def test_sbt_protein_command_index(runtmp):

 @utils.in_tempdir
 def test_sbt_protein_search_no_threshold(c):
     # test the '.search' method on SBTs w/no threshold
w/no threshold - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('protein.sbt.zip') + db_out = c.output("protein.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--protein') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--protein" + ) db2 = load_sbt_index(db_out) @@ -1038,34 +1052,41 @@ def test_sbt_protein_search_no_threshold(c): # and search, gather with pytest.raises(TypeError) as exc: - results = db2.search(sig1) + db2.search(sig1) assert "'search' requires 'threshold'" in str(exc) @utils.in_thisdir def test_sbt_protein_command_search(c): # test command-line search/gather of SBT database with protein sigs - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out @utils.in_tempdir def test_sbt_hp_command_index(c): # test command-line creation of SBT database with hp sigs - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('hp.sbt.zip') + db_out = c.output("hp.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--hp') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--hp" + ) db2 = load_sbt_index(db_out) @@ -1073,14 +1094,19 @@ def test_sbt_hp_command_index(c): sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list # and search, gather - results = db2.search(sig1, threshold=0.0, ignore_abundance=True, - do_containment=False, best_only=False) + results = db2.search( + sig1, + threshold=0.0, + ignore_abundance=True, + do_containment=False, + best_only=False, + ) assert results result = db2.best_containment(sig2) @@ -1092,27 +1118,34 @@ def test_sbt_hp_command_index(c): @utils.in_thisdir def test_sbt_hp_command_search(c): # test command-line search/gather of SBT database with hp sigs - sigfile1 = 
utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/hp.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/hp.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out @utils.in_tempdir def test_sbt_dayhoff_command_index(c): # test command-line creation of SBT database with dayhoff sigs - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('dayhoff.sbt.zip') + db_out = c.output("dayhoff.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--dayhoff') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--dayhoff" + ) db2 = load_sbt_index(db_out) @@ -1120,14 +1153,19 @@ def test_sbt_dayhoff_command_index(c): sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list # and search, gather - results = db2.search(sig1, threshold=0.0, ignore_abundance=True, - do_containment=False, best_only=False) + results = db2.search( + sig1, + threshold=0.0, + ignore_abundance=True, + do_containment=False, + best_only=False, + ) assert len(results) == 2 result = db2.best_containment(sig2) @@ -1139,21 +1177,23 @@ def test_sbt_dayhoff_command_index(c): @utils.in_thisdir def test_sbt_dayhoff_command_search(c): # test command-line search/gather of SBT database with dayhoff sigs - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/dayhoff.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/dayhoff.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_sbt_node_cache(): - tree = SBT.load(utils.get_test_data('v6.sbt.json'), - 
leaf_loader=SigLeaf.load, - cache_size=1) + tree = SBT.load( + utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load, cache_size=1 + ) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) @@ -1172,28 +1212,28 @@ def test_sbt_node_cache(): def test_sbt_no_containment_on_num(): - tree = SBT.load(utils.get_test_data('v6.sbt.json'), - leaf_loader=SigLeaf.load, - cache_size=1) + tree = SBT.load( + utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load, cache_size=1 + ) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) search_obj = make_jaccard_search_query(do_containment=True, threshold=0.05) with pytest.raises(TypeError) as exc: - results = list(tree.find(search_obj, to_search)) + list(tree.find(search_obj, to_search)) assert "this search requires a scaled signature" in str(exc) def test_build_sbt_zip_with_dups(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1202,17 +1242,17 @@ def test_build_sbt_zip_with_dups(runtmp): def test_build_sbt_zip_with_dups_exists(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") # run again, to see what happens :) - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1221,13 +1261,13 @@ def test_build_sbt_zip_with_dups_exists(runtmp): def test_build_sbt_json_with_dups(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) - outfile = runtmp.output('dups.sbt.json') + runtmp.run_sourmash("index", "dups.sbt.json", dups_data) + outfile = runtmp.output("dups.sbt.json") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1236,17 +1276,17 @@ def test_build_sbt_json_with_dups(runtmp): def test_build_sbt_json_with_dups_exists(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) - outfile = runtmp.output('dups.sbt.json') + runtmp.run_sourmash("index", "dups.sbt.json", dups_data) + outfile = runtmp.output("dups.sbt.json") # run again, see what happens! 
- runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) - outfile = runtmp.output('dups.sbt.json') + runtmp.run_sourmash("index", "dups.sbt.json", dups_data) + outfile = runtmp.output("dups.sbt.json") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1258,9 +1298,9 @@ def test_load_fail_on_file_not_dir(runtmp): # make sure the load function raises a ValueError for {filename}/sbt, # rather than a NotADirectoryError - filename = runtmp.output('foo') - with open(filename, 'wt') as fp: - fp.write('something') + filename = runtmp.output("foo") + with open(filename, "w") as fp: + fp.write("something") - with pytest.raises(ValueError) as exc: - x = SBT.load(runtmp.output('foo/bar.sbt.json')) + with pytest.raises(ValueError): + SBT.load(runtmp.output("foo/bar.sbt.json")) diff --git a/tests/test_search.py b/tests/test_search.py index a1b8171cfd..c9c6d601cc 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -5,9 +5,13 @@ import sourmash_tst_utils as utils from sourmash import search, SourmashSignature, MinHash, load_one_signature -from sourmash.search import (make_jaccard_search_query, - make_containment_query, - SearchResult, PrefetchResult, GatherResult) +from sourmash.search import ( + make_jaccard_search_query, + make_containment_query, + SearchResult, + PrefetchResult, + GatherResult, +) from sourmash.index import LinearIndex @@ -20,8 +24,7 @@ def test_make_jaccard_search_query(): def test_make_jaccard_search_query_cont(): - search_obj = make_jaccard_search_query(do_containment=True, - threshold=0) + search_obj = make_jaccard_search_query(do_containment=True, threshold=0) assert search_obj.score_fn == search_obj.score_containment assert search_obj.require_scaled @@ -29,8 +32,7 @@ def test_make_jaccard_search_query_cont(): def test_make_jaccard_search_query_max_cont(): - search_obj = make_jaccard_search_query(do_max_containment=True, - threshold=0) + search_obj = make_jaccard_search_query(do_max_containment=True, threshold=0) assert search_obj.score_fn == search_obj.score_max_containment assert search_obj.require_scaled @@ -55,16 +57,18 @@ def test_make_jaccard_search_query_no_threshold_none(): def test_make_jaccard_search_query_cont_and_max_cont(): with pytest.raises(TypeError) as exc: - search_obj = make_jaccard_search_query(do_containment=True, - do_max_containment=True) + make_jaccard_search_query(do_containment=True, do_max_containment=True) - assert str(exc.value) == "'do_containment' and 'do_max_containment' cannot both be True" + assert ( + str(exc.value) + == "'do_containment' and 'do_max_containment' cannot both be True" + ) def test_cont_requires_scaled(): search_obj = make_jaccard_search_query(do_containment=True) assert search_obj.require_scaled - + mh = MinHash(n=500, ksize=31) with pytest.raises(TypeError) as exc: search_obj.check_is_compatible(SourmashSignature(mh)) @@ -73,7 +77,7 @@ def test_cont_requires_scaled(): def test_search_requires_flat(): search_obj = make_jaccard_search_query() - + mh = MinHash(n=500, ksize=31, track_abundance=True) with pytest.raises(TypeError) as exc: search_obj.check_is_compatible(SourmashSignature(mh)) @@ -164,7 +168,7 @@ def test_make_containment_query_num_minhash(): mh.add_hash(i) with pytest.raises(TypeError) as exc: - search_obj = make_containment_query(mh, 5e4) + make_containment_query(mh, 5e4) assert str(exc.value) == "query signature must be calculated with scaled" @@ -177,7 +181,7 @@ def test_make_containment_query_empty_minhash(): mh.add_hash(i) with pytest.raises(TypeError) as exc: - 
search_obj = make_containment_query(mh, -1) + make_containment_query(mh, -1) assert str(exc.value) == "threshold_bp must be non-negative" @@ -191,7 +195,7 @@ def test_make_containment_query_high_threshold(): # effective threshold > 1; raise ValueError with pytest.raises(ValueError): - search_obj = make_containment_query(mh, 200000) + make_containment_query(mh, 200000) class FakeIndex(LinearIndex): @@ -240,29 +244,31 @@ def test_search_with_abund_query(): query = SourmashSignature(mh) with pytest.raises(TypeError): - search.search_databases_with_abund_query(query, [], - threshold=0, - do_containment=True) + search.search_databases_with_abund_query( + query, [], threshold=0, do_containment=True + ) with pytest.raises(TypeError): - search.search_databases_with_abund_query(query, [], - threshold=0, - do_max_containment=True) + search.search_databases_with_abund_query( + query, [], threshold=0, do_max_containment=True + ) def test_scaledSearchResult(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") ss4763 = ss4763.to_mutable() ss4763.filename = ss4763_file scaled = ss47.minhash.scaled - res = SearchResult(ss47, ss4763, cmp_scaled=scaled, similarity= ss47.contained_by(ss4763)) + res = SearchResult( + ss47, ss4763, cmp_scaled=scaled, similarity=ss47.contained_by(ss4763) + ) assert res.query_name == ss47.name assert res.match_name == ss4763.name @@ -271,16 +277,16 @@ def test_scaledSearchResult(): assert res.cmp_scaled == 1000 assert res.query_abundance == ss47.minhash.track_abundance assert res.match_abundance == ss4763.minhash.track_abundance -# assert res.query_bp == len(ss47.minhash) * scaled -# assert res.match_bp == len(ss4763.minhash) * scaled + # assert res.query_bp == len(ss47.minhash) * scaled + # assert res.match_bp == len(ss4763.minhash) * scaled assert res.ksize == 31 - assert res.moltype == 'DNA' - assert res.query_filename == '47.fa' + assert res.moltype == "DNA" + assert res.query_filename == "47.fa" assert res.match_filename == ss4763_file assert res.query_md5 == ss47.md5sum() assert res.match_md5 == ss4763.md5sum() - # assert res.query_n_hashes == len(ss47.minhash) - # assert res.match_n_hashes == len(ss4763.minhash) + # assert res.query_n_hashes == len(ss47.minhash) + # assert res.match_n_hashes == len(ss4763.minhash) assert res.md5 == ss4763.md5sum() assert res.name == ss4763.name assert res.filename == ss4763.filename @@ -289,18 +295,19 @@ def test_scaledSearchResult(): # check that we _can_ get avg_containment_ani assert res.cmp.avg_containment_ani == np.mean([queryc_ani.ani, matchc_ani.ani]) + def test_numSearchResult(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('num/47.fa.sig') - ss63_file = utils.get_test_data('num/63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss63 = load_one_signature(ss63_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("num/47.fa.sig") + ss63_file = utils.get_test_data("num/63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss63 = 
load_one_signature(ss63_file, ksize=31, select_moltype="dna") ss63 = ss63.to_mutable() ss63.filename = ss63_file assert ss47.minhash.num and ss63.minhash.num - res = SearchResult(ss47, ss63, similarity= ss47.jaccard(ss63)) + res = SearchResult(ss47, ss63, similarity=ss47.jaccard(ss63)) print(res.cmp_num) assert res.mh1.num assert res.cmp.cmp_num == 500 @@ -311,8 +318,8 @@ def test_numSearchResult(): assert res.query_abundance == ss47.minhash.track_abundance assert res.match_abundance == ss63.minhash.track_abundance assert res.ksize == 31 - assert res.moltype == 'DNA' - assert res.query_filename == '47.fa' + assert res.moltype == "DNA" + assert res.query_filename == "47.fa" assert res.match_filename == ss63_file assert res.query_md5 == ss47.md5sum() assert res.match_md5 == ss63.md5sum() @@ -323,7 +330,7 @@ def test_numSearchResult(): # check that we can't get ani with pytest.raises(TypeError) as exc: res.estimate_search_ani() - assert("ANI can only be estimated from scaled signatures.") in str(exc) + assert ("ANI can only be estimated from scaled signatures.") in str(exc) # get result as dictionary (of just items to write) resD = res.resultdict @@ -333,10 +340,10 @@ def test_numSearchResult(): def test_SearchResult_incompatible_sigs(): - ss47_file = utils.get_test_data('num/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("num/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") with pytest.raises(TypeError) as exc: SearchResult(ss47, ss4763, similarity=10) @@ -345,8 +352,8 @@ def test_SearchResult_incompatible_sigs(): def test_SearchResult_notsigs(): - ss47_file = utils.get_test_data('num/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') + ss47_file = utils.get_test_data("num/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") with pytest.raises(AttributeError) as exc: SearchResult(ss47_file, ss4763_file, similarity=10) @@ -356,10 +363,10 @@ def test_SearchResult_notsigs(): def test_SearchResult_no_similarity(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") with pytest.raises(ValueError) as exc: SearchResult(ss47, ss4763) @@ -369,10 +376,10 @@ def test_SearchResult_no_similarity(): def test_PrefetchResult(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, 
select_moltype="dna") ss4763 = ss4763.to_mutable() ss4763.filename = ss4763_file @@ -381,14 +388,14 @@ def test_PrefetchResult(): intersect_mh = ss47.minhash.intersection(ss4763.minhash) intersect_bp = len(intersect_mh) * scaled - jaccard=ss4763.jaccard(ss47) - max_containment=ss4763.max_containment(ss47) - f_match_query=ss47.contained_by(ss4763) - f_query_match=ss4763.contained_by(ss47) + jaccard = ss4763.jaccard(ss47) + max_containment = ss4763.max_containment(ss47) + f_match_query = ss47.contained_by(ss4763) + f_query_match = ss4763.contained_by(ss47) queryc_ani = ss47.containment_ani(ss4763) matchc_ani = ss4763.containment_ani(ss47) - res = PrefetchResult(ss47, ss4763, cmp_scaled = scaled) + res = PrefetchResult(ss47, ss4763, cmp_scaled=scaled) assert res.query_name == ss47.name assert res.match_name == ss4763.name @@ -400,8 +407,8 @@ def test_PrefetchResult(): assert res.query_bp == len(ss47.minhash) * scaled assert res.match_bp == len(ss4763.minhash) * scaled assert res.ksize == 31 - assert res.moltype == 'DNA' - assert res.query_filename == '47.fa' + assert res.moltype == "DNA" + assert res.query_filename == "47.fa" assert res.match_filename == ss4763_file assert res.query_md5 == ss47.md5sum() assert res.match_md5 == ss4763.md5sum() @@ -426,23 +433,26 @@ def test_PrefetchResult(): def test_PrefetchResult_incompatible_sigs(): - ss47_file = utils.get_test_data('num/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("num/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") with pytest.raises(TypeError) as exc: PrefetchResult(ss47, ss4763) print(str(exc)) - assert "Error: prefetch and gather results must be between scaled signatures." in str(exc) + assert ( + "Error: prefetch and gather results must be between scaled signatures." 
+ in str(exc) + ) def test_GatherResult(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('track_abund/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("track_abund/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") ss4763 = ss4763.to_mutable() ss4763.filename = ss4763_file @@ -454,8 +464,8 @@ def test_GatherResult(): remaining_mh.remove_many(intersect_mh) intersect_bp = len(intersect_mh) * scaled - max_containment=ss4763.max_containment(ss47) - f_match_query = ss47.contained_by(ss4763) + max_containment = ss4763.max_containment(ss47) + ss47.contained_by(ss4763) orig_query_abunds = ss47.minhash.hashes queryc_ani = ss47.containment_ani(ss4763) matchc_ani = ss4763.containment_ani(ss47) @@ -464,12 +474,16 @@ def test_GatherResult(): gather_result_rank = 1 sum_abunds = 1000 - res = GatherResult(ss47, ss4763, cmp_scaled=scaled, - gather_querymh=remaining_mh, - gather_result_rank=gather_result_rank, - total_weighted_hashes = sum_abunds, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + res = GatherResult( + ss47, + ss4763, + cmp_scaled=scaled, + gather_querymh=remaining_mh, + gather_result_rank=gather_result_rank, + total_weighted_hashes=sum_abunds, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) assert res.query_name == ss47.name assert res.match_name == ss4763.name @@ -481,8 +495,8 @@ def test_GatherResult(): assert res.query_bp == len(ss47.minhash) * scaled assert res.match_bp == len(ss4763.minhash) * scaled assert res.ksize == 31 - assert res.moltype == 'DNA' - assert res.query_filename == 'podar-ref/47.fa' + assert res.moltype == "DNA" + assert res.query_filename == "podar-ref/47.fa" assert res.match_filename == ss4763_file assert res.query_md5 == ss47.md5sum() assert res.match_md5 == ss4763.md5sum() @@ -516,10 +530,10 @@ def test_GatherResult(): def test_GatherResult_ci(): # check that values get stored/calculated correctly - ss47_file = utils.get_test_data('track_abund/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("track_abund/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") ss4763 = ss4763.to_mutable() ss4763.filename = ss4763_file @@ -531,20 +545,24 @@ def test_GatherResult_ci(): remaining_mh.remove_many(intersect_mh) orig_query_abunds = ss47.minhash.hashes - queryc_ani = ss47.containment_ani(ss4763,estimate_ci=True) + queryc_ani = ss47.containment_ani(ss4763, estimate_ci=True) matchc_ani = ss4763.containment_ani(ss47, estimate_ci=True) # make some fake vals to check gather_result_rank = 1 sum_abunds = 1000 - res = GatherResult(ss47, ss4763, cmp_scaled=scaled, - gather_querymh=remaining_mh, - gather_result_rank=gather_result_rank, - total_weighted_hashes = sum_abunds, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds, - estimate_ani_ci=True) + res = GatherResult( + 
ss47, + ss4763, + cmp_scaled=scaled, + gather_querymh=remaining_mh, + gather_result_rank=gather_result_rank, + total_weighted_hashes=sum_abunds, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + estimate_ani_ci=True, + ) # check that we can write prefetch result directly from gather pf = PrefetchResult(ss47, ss4763, cmp_scaled=scaled, estimate_ani_ci=True) @@ -568,130 +586,183 @@ def test_GatherResult_ci(): def test_GatherResult_incompatible_sigs(): - ss47_file = utils.get_test_data('num/47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("num/47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = ss47.minhash.hashes with pytest.raises(TypeError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: prefetch and gather results must be between scaled signatures." in str(exc) + assert ( + "Error: prefetch and gather results must be between scaled signatures." + in str(exc) + ) def test_GatherResult_incomplete_input_cmpscaled(): - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = ss47.minhash.hashes with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=None, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=None, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult" in str(exc) + assert ( + "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult" + in str(exc) + ) def test_GatherResult_incomplete_input_gathermh(): - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = ss47.minhash.hashes with 
pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=None, - gather_result_rank=1, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=None, + gather_result_rank=1, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide current gather sketch (remaining hashes) for GatherResult" in str(exc) + assert ( + "Error: must provide current gather sketch (remaining hashes) for GatherResult" + in str(exc) + ) def test_GatherResult_incomplete_input_gather_result_rank(): - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = ss47.minhash.hashes with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=ss47.minhash, - gather_result_rank=None, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=ss47.minhash, + gather_result_rank=None, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) assert "Error: must provide 'gather_result_rank' to GatherResult" in str(exc) def test_GatherResult_incomplete_input_total_weighted_hashes(): - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = ss47.minhash.hashes with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = None, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=None, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" in str(exc) + assert ( + "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" + in str(exc) + ) with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = 0, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=0, + orig_query_len=len(ss47.minhash), + 
orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" in str(exc) + assert ( + "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" + in str(exc) + ) def test_GatherResult_incomplete_input_orig_query_abunds(): - ss47_file = utils.get_test_data('47.fa.sig') - ss4763_file = utils.get_test_data('47+63.fa.sig') - ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna') - ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna') + ss47_file = utils.get_test_data("47.fa.sig") + ss4763_file = utils.get_test_data("47+63.fa.sig") + ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna") + ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna") orig_query_abunds = None with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" in str(exc) + assert ( + "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" + in str(exc) + ) orig_query_abunds = {} with pytest.raises(ValueError) as exc: - GatherResult(ss47, ss4763, cmp_scaled=1000, - gather_querymh=ss47.minhash, - gather_result_rank=1, - total_weighted_hashes = 1, - orig_query_len=len(ss47.minhash), - orig_query_abunds=orig_query_abunds) + GatherResult( + ss47, + ss4763, + cmp_scaled=1000, + gather_querymh=ss47.minhash, + gather_result_rank=1, + total_weighted_hashes=1, + orig_query_len=len(ss47.minhash), + orig_query_abunds=orig_query_abunds, + ) print(str(exc)) - assert "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" in str(exc) + assert ( + "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" + in str(exc) + ) diff --git a/tests/test_signature.py b/tests/test_signature.py index 95ea058dc4..b82a02364e 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -3,8 +3,13 @@ import pytest import sourmash -from sourmash.signature import SourmashSignature, save_signatures, \ - load_signatures, load_one_signature, FrozenSourmashSignature +from sourmash.signature import ( + SourmashSignature, + save_signatures, + load_signatures, + load_one_signature, + FrozenSourmashSignature, +) import sourmash_tst_utils as utils from sourmash.minhash import MinHash, FrozenMinHash from sourmash_tst_utils import SourmashCommandFailed @@ -13,7 +18,7 @@ def test_minhash_copy(track_abundance): e = MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_kmer("AT" * 10) - sig = SourmashSignature(e, name='foo') + SourmashSignature(e, name="foo") f = e.copy() assert e == f @@ -21,7 +26,7 @@ def test_minhash_copy(track_abundance): def test_sig_copy(track_abundance): e = MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_kmer("AT" * 10) - sig1 = SourmashSignature(e, name='foo') + sig1 = SourmashSignature(e, name="foo") sig2 = sig1.copy() assert sig1 == sig2 @@ -29,35 +34,35 @@ def test_sig_copy(track_abundance): def test_sig_copy_frozen(track_abundance): e = 
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")
     sig2 = sig1.copy()

     assert sig1 == sig2
     with pytest.raises(TypeError) as e:
         sig2.minhash.add_hash(5)
-    assert 'FrozenMinHash does not support modification' in str(e.value)
+    assert "FrozenMinHash does not support modification" in str(e.value)


 def test_sig_copy_frozen_mutable(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")
     sig1.minhash = sig1.minhash.to_mutable()
     sig2 = sig1.copy()

     assert sig1 == sig2
     with pytest.raises(TypeError) as e:
         sig2.minhash.add_hash(5)
-    assert 'FrozenMinHash does not support modification' in str(e.value)
+    assert "FrozenMinHash does not support modification" in str(e.value)


 def test_compare(track_abundance):
     # same content, same name -> equal
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='foo')
+    SourmashSignature(f, name="foo")

     assert e == f
@@ -66,11 +71,11 @@ def test_compare_ne(track_abundance):
     # same content, different names -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='bar')
+    sig2 = SourmashSignature(f, name="bar")

     assert sig1 != sig2
@@ -79,11 +84,11 @@ def test_compare_ne2(track_abundance):
     # same content, different filename -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo', filename='a')
+    sig1 = SourmashSignature(e, name="foo", filename="a")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='foo', filename='b')
+    sig2 = SourmashSignature(f, name="foo", filename="b")

     assert sig1 != sig2
     assert sig2 != sig1
@@ -93,11 +98,11 @@ def test_compare_ne2_reverse(track_abundance):
     # same content, one has filename, other does not -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, filename='b')
+    sig2 = SourmashSignature(f, filename="b")

     assert sig2 != sig1
     assert sig1 != sig2
@@ -124,8 +129,8 @@ def test_str(track_abundance):
     print(sig)
     assert repr(sig) == "SourmashSignature('', 59502a74)"

-    sig._name = 'fizbar'
-    assert repr(sig) == 'SourmashSignature(\'fizbar\', 59502a74)'
+    sig._name = "fizbar"
+    assert repr(sig) == "SourmashSignature('fizbar', 59502a74)"


 def test_roundtrip(track_abundance):
@@ -135,7 +140,6 @@
     s = save_signatures([sig])
     siglist = list(load_signatures(s))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 1.0
     assert sig2.similarity(sig) == 1.0
@@ -164,9 +168,8 @@ def test_load_signature_ksize_nonint(track_abundance):
     e.add_kmer("AT" * 10)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
-    siglist = list(load_signatures(s, ksize='20'))
+    siglist = list(load_signatures(s, ksize="20"))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 1.0
     assert sig2.similarity(sig) == 1.0
@@ -180,15 +183,13 @@ def test_roundtrip_empty(track_abundance):
     s = save_signatures([sig])
     siglist = list(load_signatures(s))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 0
     assert sig2.similarity(sig) == 0


 def test_roundtrip_scaled(track_abundance):
-    e = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=10)
+    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
     e.add_hash(5)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
@@ -203,8 +204,7 @@
 def test_roundtrip_seed(track_abundance):
-    e = MinHash(n=1, ksize=20, track_abundance=track_abundance,
-                seed=10)
+    e = MinHash(n=1, ksize=20, track_abundance=track_abundance, seed=10)
     e.add_hash(5)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
@@ -219,26 +219,24 @@
 def test_similarity_downsample(track_abundance):
-    e = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=2**63)
-    f = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=2**2)
+    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63)
+    f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2)

     e.add_hash(1)
     e.add_hash(5)
     assert len(e.hashes) == 2

     f.add_hash(1)
-    f.add_hash(5) # should be discarded due to max_hash
+    f.add_hash(5)  # should be discarded due to max_hash
     assert len(f.hashes) == 1

     ee = SourmashSignature(e)
     ff = SourmashSignature(f)

-    with pytest.raises(ValueError) as e: # mismatch in max_hash
+    with pytest.raises(ValueError) as e:  # mismatch in max_hash
         ee.similarity(ff)

-    assert 'mismatch in scaled; comparison fail' in str(e.value)
+    assert "mismatch in scaled; comparison fail" in str(e.value)

     x = ee.similarity(ff, downsample=True)
     assert round(x, 1) == 1.0
@@ -252,33 +250,32 @@ def test_add_sequence_bad_dna(track_abundance):
     with pytest.raises(ValueError) as e:
         sig.add_sequence("N" * 21, force=False)

-    assert 'invalid DNA character in input k-mer: NNNNNNNNNNNNNNNNNNNNN' in str(e.value)
+    assert "invalid DNA character in input k-mer: NNNNNNNNNNNNNNNNNNNNN" in str(e.value)


 def test_md5(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_hash(5)
     sig = SourmashSignature(e)
-    assert sig.md5sum() == 'eae27d77ca20db309e056e3d2dcd7d69', sig.md5sum()
+    assert sig.md5sum() == "eae27d77ca20db309e056e3d2dcd7d69", sig.md5sum()


 def test_str_1(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, name='foo')
-    assert str(sig) == 'foo'
+    sig = SourmashSignature(e, name="foo")
+    assert str(sig) == "foo"


 def test_str_2(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, filename='foo.txt')
-    assert str(sig) == 'foo.txt'
+    sig = SourmashSignature(e, filename="foo.txt")
+    assert str(sig) == "foo.txt"


 def test_str_3(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, name='foo',
-                            filename='foo.txt')
-    assert str(sig) == 'foo'
+    sig = SourmashSignature(e, name="foo", filename="foo.txt")
+    assert str(sig) == "foo"


 def test_name_4(track_abundance):
@@ -300,7 +297,7 @@ def test_save_load_multisig(track_abundance):
     print(x)

     assert len(y) == 2
-    assert sig1 in y # order not guaranteed, note.
+    assert sig1 in y  # order not guaranteed, note.
     assert sig2 in y
     assert sig1 != sig2
@@ -309,7 +306,7 @@ def test_load_one_fail_nosig(track_abundance):
     x = save_signatures([])
     print((x,))
     with pytest.raises(ValueError):
-        y = load_one_signature(x)
+        load_one_signature(x)


 def test_load_one_succeed(track_abundance):
@@ -332,7 +329,7 @@ def test_load_one_fail_multisig(track_abundance):
     x = save_signatures([sig1, sig2])

     with pytest.raises(ValueError):
-        y = load_one_signature(x)
+        load_one_signature(x)


 def test_save_minified(track_abundance):
@@ -343,24 +340,24 @@ def test_save_minified(track_abundance):
     sig2 = SourmashSignature(e2, name="bar baz")

     x = save_signatures([sig1, sig2])
-    assert b'\n' not in x
-    assert len(x.split(b'\n')) == 1
+    assert b"\n" not in x
+    assert len(x.split(b"\n")) == 1

     y = list(load_signatures(x))
     assert len(y) == 2
-    assert any(sig.name == 'foo' for sig in y)
-    assert any(sig.name == 'bar baz' for sig in y)
+    assert any(sig.name == "foo" for sig in y)
+    assert any(sig.name == "bar baz" for sig in y)


 def test_load_minified(track_abundance):
-    sigfile = utils.get_test_data('genome-s10+s11.sig')
+    sigfile = utils.get_test_data("genome-s10+s11.sig")
     sigs = load_signatures(sigfile)

     minified = save_signatures(sigs)
-    with open(sigfile, 'r') as f:
+    with open(sigfile) as f:
         orig_file = f.read()
     assert len(minified) < len(orig_file)
-    assert b'\n' not in minified
+    assert b"\n" not in minified


 def test_load_compressed(track_abundance):
@@ -372,8 +369,8 @@ def test_load_compressed(track_abundance):
     y = load_one_signature(x)
     assert sig1 == y

-    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
-    sigs = load_signatures(sigfile)
+    sigfile = utils.get_test_data("genome-s10+s11.sig.gz")
+    load_signatures(sigfile)


 def test_binary_fp(tmpdir, track_abundance):
@@ -381,9 +378,9 @@ def test_binary_fp(tmpdir, track_abundance):
     e.add_kmer("AT" * 10)

     path = tmpdir.join("1.sig")
-    with open(str(path), 'wb') as fp:
+    with open(str(path), "wb") as fp:
         sig = SourmashSignature(e)
-        s = save_signatures([sig], fp)
+        save_signatures([sig], fp)


 def test_load_signatures_no_file_do_raise(tmpdir):
@@ -409,10 +406,10 @@ def test_max_containment():
     ss1 = SourmashSignature(mh1)
     ss2 = SourmashSignature(mh2)

-    assert ss1.contained_by(ss2) == 1/4
-    assert ss2.contained_by(ss1) == 1/2
-    assert ss1.max_containment(ss2) == 1/2
-    assert ss2.max_containment(ss1) == 1/2
+    assert ss1.contained_by(ss2) == 1 / 4
+    assert ss2.contained_by(ss1) == 1 / 2
+    assert ss1.max_containment(ss2) == 1 / 2
+    assert ss2.max_containment(ss1) == 1 / 2


 def test_max_containment_empty():
@@ -447,32 +444,44 @@ def test_max_containment_equal():


 def test_containment_ANI():
-    f1 = utils.get_test_data('2.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("2.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)

-    s1_cont_s2 = ss1.containment_ani(ss2, estimate_ci =True)
-    s2_cont_s1 = ss2.containment_ani(ss1, estimate_ci =True)
+    s1_cont_s2 = ss1.containment_ani(ss2, estimate_ci=True)
+    s2_cont_s1 = ss2.containment_ani(ss1, estimate_ci=True)
     print("\nss1 contained by ss2", s1_cont_s2)
     print("ss2 contained by ss1", s2_cont_s1)

-    assert (round(s1_cont_s2.ani,3), s1_cont_s2.ani_low, s1_cont_s2.ani_high) == (1.0,1.0,1.0)
-    assert (round(s2_cont_s1.ani,3), round(s2_cont_s1.ani_low,3), round(s2_cont_s1.ani_high,3)) == (0.966, 0.965, 0.967)
-
-    s1_mc_s2 = ss1.max_containment_ani(ss2, estimate_ci =True)
-    s2_mc_s1 = ss2.max_containment_ani(ss1, estimate_ci =True)
+    assert (round(s1_cont_s2.ani, 3), s1_cont_s2.ani_low, s1_cont_s2.ani_high) == (
+        1.0,
+        1.0,
+        1.0,
+    )
+    assert (
+        round(s2_cont_s1.ani, 3),
+        round(s2_cont_s1.ani_low, 3),
+        round(s2_cont_s1.ani_high, 3),
+    ) == (0.966, 0.965, 0.967)
+
+    s1_mc_s2 = ss1.max_containment_ani(ss2, estimate_ci=True)
+    s2_mc_s1 = ss2.max_containment_ani(ss1, estimate_ci=True)
     print("mh1 max containment", s1_mc_s2)
     print("mh2 max containment", s2_mc_s1)
     s1_mc_s2.size_is_inaccurate = False
     s2_mc_s1.size_is_inaccurate = False
     assert s1_mc_s2 == s2_mc_s1
-    assert (round(s1_mc_s2.ani, 3), round(s1_mc_s2.ani_low, 3), round(s1_mc_s2.ani_high, 3)) == (1.0,1.0,1.0)
+    assert (
+        round(s1_mc_s2.ani, 3),
+        round(s1_mc_s2.ani_low, 3),
+        round(s1_mc_s2.ani_high, 3),
+    ) == (1.0, 1.0, 1.0)


 def test_containment_ANI_precalc_containment():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)
     # precalc containments and assert same results
@@ -480,38 +489,53 @@
     s1c = ss1.contained_by(ss2)
     s2c = ss2.contained_by(ss1)
     mc = max(s1c, s2c)

-    assert ss1.containment_ani(ss2, estimate_ci=True) == ss1.containment_ani(ss2, containment=s1c, estimate_ci=True)
-    assert ss2.containment_ani(ss1) == ss2.containment_ani(ss1, containment=s2c)
-    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1)
-    assert ss1.max_containment_ani(ss2) == ss1.max_containment_ani(ss2, max_containment=mc)
-    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1, max_containment=mc)
+    assert ss1.containment_ani(ss2, estimate_ci=True) == ss1.containment_ani(
+        ss2, containment=s1c, estimate_ci=True
+    )
+    assert ss2.containment_ani(ss1) == ss2.containment_ani(ss1, containment=s2c)
+    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1)
+    assert ss1.max_containment_ani(ss2) == ss1.max_containment_ani(
+        ss2, max_containment=mc
+    )
+    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(
+        ss1, max_containment=mc
+    )


 def test_avg_containment():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)
     # check average_containment_ani
     ac_s1 = ss1.avg_containment(ss2)
     ac_s2 = ss2.avg_containment(ss1)
-    assert ac_s1 == ac_s2 == (ss1.contained_by(ss2) + ss2.contained_by(ss1))/2 == 0.6619979467456603
+    assert (
+        ac_s1
+        == ac_s2
+        == (ss1.contained_by(ss2) + ss2.contained_by(ss1)) / 2
+        == 0.6619979467456603
+    )


 def test_avg_containment_ani():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)
     # check average_containment_ani
     ac_s1 = ss1.avg_containment_ani(ss2)
     ac_s2 = ss2.avg_containment_ani(ss1)
-    assert ac_s1 == ac_s2 == (ss1.containment_ani(ss2).ani + ss2.containment_ani(ss1).ani)/2
+    assert (
+        ac_s1
+        == ac_s2
+        == (ss1.containment_ani(ss2).ani + ss2.containment_ani(ss1).ani) / 2
+    )


 def test_containment_ANI_downsample():
utils.get_test_data("2+63.fa.sig") + f3 = utils.get_test_data("47+63.fa.sig") ss2 = sourmash.load_one_signature(f2, ksize=31) ss3 = sourmash.load_one_signature(f3, ksize=31) # check that downsampling works properly @@ -522,8 +546,8 @@ def test_containment_ANI_downsample(): assert ss2.minhash.scaled != ss3.minhash.scaled ds_s3c = ss2.containment_ani(ss3, downsample=True) ds_s4c = ss3.containment_ani(ss2, downsample=True) - mc_w_ds_1 = ss2.max_containment_ani(ss3, downsample=True) - mc_w_ds_2 = ss3.max_containment_ani(ss2, downsample=True) + mc_w_ds_1 = ss2.max_containment_ani(ss3, downsample=True) + mc_w_ds_2 = ss3.max_containment_ani(ss2, downsample=True) with pytest.raises(ValueError) as e: ss2.containment_ani(ss3) @@ -538,15 +562,15 @@ def test_containment_ANI_downsample(): assert ss2.minhash.scaled == ss3.minhash.scaled ds_s3c_manual = ss2.containment_ani(ss3) ds_s4c_manual = ss3.containment_ani(ss2) - ds_mc_manual = ss2.max_containment_ani(ss3) + ds_mc_manual = ss2.max_containment_ani(ss3) assert ds_s3c == ds_s3c_manual assert ds_s4c == ds_s4c_manual assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual def test_jaccard_ANI(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") ss1 = sourmash.load_one_signature(f1, ksize=31) ss2 = sourmash.load_one_signature(f2) @@ -556,12 +580,16 @@ def test_jaccard_ANI(): s2_jani_s1 = ss2.jaccard_ani(ss1) assert s1_jani_s2 == s2_jani_s1 - assert (s1_jani_s2.ani, s1_jani_s2.p_nothing_in_common, s1_jani_s2.jaccard_error) == (0.9783711630110239, 0.0, 3.891666770716877e-07) + assert ( + s1_jani_s2.ani, + s1_jani_s2.p_nothing_in_common, + s1_jani_s2.jaccard_error, + ) == (0.9783711630110239, 0.0, 3.891666770716877e-07) def test_jaccard_ANI_untrustworthy(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") ss1 = sourmash.load_one_signature(f1, ksize=31) ss2 = sourmash.load_one_signature(f2) @@ -572,28 +600,32 @@ def test_jaccard_ANI_untrustworthy(): # since size is inaccurate on 2.fa.sig, need to override to be able to get ani s1_jani_s2.size_is_inaccurate = False - assert s1_jani_s2.ani == None - assert s1_jani_s2.je_exceeds_threshold==True + assert s1_jani_s2.ani is None + assert s1_jani_s2.je_exceeds_threshold == True assert s1_jani_s2.je_threshold == 1e-7 def test_jaccard_ANI_precalc_jaccard(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") ss1 = sourmash.load_one_signature(f1, ksize=31) ss2 = sourmash.load_one_signature(f2) # precalc jaccard and assert same result jaccard = ss1.jaccard(ss2) - print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard)) + print("\nJACCARD_ANI", ss1.jaccard_ani(ss2, jaccard=jaccard)) - assert ss1.jaccard_ani(ss2) == ss1.jaccard_ani(ss2, jaccard=jaccard) == ss2.jaccard_ani(ss1, jaccard=jaccard) + assert ( + ss1.jaccard_ani(ss2) + == ss1.jaccard_ani(ss2, jaccard=jaccard) + == ss2.jaccard_ani(ss1, jaccard=jaccard) + ) wrong_jaccard = jaccard - 0.1 assert ss1.jaccard_ani(ss2) != ss1.jaccard_ani(ss2, jaccard=wrong_jaccard) def test_jaccard_ANI_downsample(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") ss1 = sourmash.load_one_signature(f1, ksize=31) ss2 = 
sourmash.load_one_signature(f2) @@ -619,10 +651,10 @@ def test_frozen_signature_update_1(track_abundance): # setting .name should fail on a FrozenSourmashSignature e = MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_kmer("AT" * 10) - ss = SourmashSignature(e, name='foo').to_frozen() + ss = SourmashSignature(e, name="foo").to_frozen() with pytest.raises(ValueError): - ss.name = 'foo2' + ss.name = "foo2" def test_frozen_signature_update_2(track_abundance): @@ -630,7 +662,7 @@ def test_frozen_signature_update_2(track_abundance): e = MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_kmer("AT" * 10) e2 = e.copy_and_clear() - ss = SourmashSignature(e, name='foo').to_frozen() + ss = SourmashSignature(e, name="foo").to_frozen() with pytest.raises(ValueError): ss.minhash = e2 @@ -640,9 +672,9 @@ def test_frozen_signature_update_3(track_abundance): # setting .minhash should succeed with update() context manager e = MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_kmer("AT" * 10) - ss = SourmashSignature(e, name='foo').to_frozen() + ss = SourmashSignature(e, name="foo").to_frozen() with ss.update() as ss2: - ss2.name = 'foo2' + ss2.name = "foo2" - assert ss2.name == 'foo2' + assert ss2.name == "foo2" diff --git a/tests/test_sketchcomparison.py b/tests/test_sketchcomparison.py index 30282895fc..5b7e78537d 100644 --- a/tests/test_sketchcomparison.py +++ b/tests/test_sketchcomparison.py @@ -11,14 +11,15 @@ import sourmash_tst_utils as utils + # can we parameterize scaled too (so don't need separate downsample tests?) def test_FracMinHashComparison(track_abundance): # build FracMinHash Comparison and check values a = MinHash(0, 21, scaled=1, track_abundance=track_abundance) b = MinHash(0, 21, scaled=1, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -43,23 +44,45 @@ def test_FracMinHashComparison(track_abundance): intersect_mh = a.flatten().intersection(b.flatten()) assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten()) assert cmp.total_unique_intersect_hashes == 4 - assert cmp.pass_threshold # default threshold is 0; this should pass + assert cmp.pass_threshold # default threshold is 0; this should pass if track_abundance: - assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a) - assert cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a) - assert cmp.weighted_intersection(from_mh=cmp.mh1).hashes == intersect_mh.inflate(a).hashes - assert cmp.weighted_intersection(from_mh=cmp.mh2).hashes == intersect_mh.inflate(b).hashes - assert cmp.weighted_intersection(from_abundD=a_values).hashes == intersect_mh.inflate(a).hashes - assert cmp.weighted_intersection(from_abundD=b_values).hashes == intersect_mh.inflate(b).hashes + assert ( + cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a) + ) + assert ( + cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a) + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh1).hashes + == intersect_mh.inflate(a).hashes + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh2).hashes + == intersect_mh.inflate(b).hashes + ) + assert ( + cmp.weighted_intersection(from_abundD=a_values).hashes + == intersect_mh.inflate(a).hashes + ) + assert ( + cmp.weighted_intersection(from_abundD=b_values).hashes + == 
+            == intersect_mh.inflate(b).hashes
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         assert cmp.weighted_intersection(from_mh=cmp.mh1).hashes == intersect_mh.hashes
         assert cmp.weighted_intersection(from_mh=cmp.mh2).hashes == intersect_mh.hashes
@@ -69,8 +92,8 @@ def test_FracMinHashComparison_downsample(track_abundance):
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -84,7 +107,7 @@
     ds_b = b.downsample(scaled=cmp_scaled)

     # build FracMinHashComparison
-    cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled)
+    cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled)
     assert cmp.mh1 == a
     assert cmp.mh2 == b
     assert cmp.mh1_cmp == ds_a
@@ -99,27 +122,59 @@
     assert cmp.max_containment == ds_a.max_containment(ds_b)
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     assert cmp.total_unique_intersect_hashes == 8
-    assert cmp.pass_threshold # default threshold is 0; this should pass
+    assert cmp.pass_threshold  # default threshold is 0; this should pass
     if track_abundance:
-        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.inflate(ds_a).hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.inflate(ds_b).hashes
-        assert cmp.weighted_intersection(from_abundD=cmp.mh1_cmp.hashes).hashes == intersect_mh.inflate(ds_a).hashes
-        assert cmp.weighted_intersection(from_abundD=cmp.mh2_cmp.hashes).hashes == intersect_mh.inflate(ds_b).hashes
+        assert (
+            cmp.angular_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.cosine_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes
+            == intersect_mh.inflate(ds_a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes
+            == intersect_mh.inflate(ds_b).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=cmp.mh1_cmp.hashes).hashes
+            == intersect_mh.inflate(ds_a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=cmp.mh2_cmp.hashes).hashes
+            == intersect_mh.inflate(ds_b).hashes
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes
+        )


 def test_FracMinHashComparison_autodownsample(track_abundance):
@@ -127,8 +182,8 @@
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=2, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -157,27 +212,59 @@
     assert cmp.max_containment == ds_a.max_containment(ds_b)
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     assert cmp.total_unique_intersect_hashes == 8
-    assert cmp.pass_threshold # default threshold is 0; this should pass
+    assert cmp.pass_threshold  # default threshold is 0; this should pass
     if track_abundance:
-        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.inflate(ds_a).hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.inflate(ds_b).hashes
-        assert cmp.weighted_intersection(from_abundD=a_values).hashes == intersect_mh.inflate(a).hashes
-        assert cmp.weighted_intersection(from_abundD=b_values).hashes == intersect_mh.inflate(b).hashes
+        assert (
+            cmp.angular_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.cosine_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes
+            == intersect_mh.inflate(ds_a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes
+            == intersect_mh.inflate(ds_b).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=a_values).hashes
+            == intersect_mh.inflate(a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=b_values).hashes
+            == intersect_mh.inflate(b).hashes
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes
+        )


 def test_FracMinHashComparison_ignore_abundance(track_abundance):
@@ -185,9 +272,8 @@
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
-    intersection_w_abund = {1:8, 3:5, 5:3, 8:3}
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -201,7 +287,7 @@
     ds_b = b.flatten().downsample(scaled=cmp_scaled)

     # build FracMinHashComparison
-    cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, ignore_abundance=True)
+    cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled, ignore_abundance=True)
     assert cmp.mh1 == a
     assert cmp.mh2 == b
     assert cmp.mh1_cmp == ds_a
@@ -216,18 +302,26 @@
     assert cmp.max_containment == ds_a.max_containment(ds_b)
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     assert cmp.total_unique_intersect_hashes == 8
-    assert cmp.pass_threshold # default threshold is 0; this should pass
+    assert cmp.pass_threshold  # default threshold is 0; this should pass
     # with ignore_abundance = True, all of these should not be usable. Do we want errors, or ""/None?
     with pytest.raises(TypeError) as exc:
         cmp.angular_similarity
     print(str(exc))
-    assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+    assert (
+        "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+        in str(exc)
+    )
     with pytest.raises(TypeError) as exc:
         cmp.cosine_similarity
     print(str(exc))
-    assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+    assert (
+        "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+        in str(exc)
+    )
     assert not cmp.mh1_cmp.track_abundance
     assert not cmp.mh2_cmp.track_abundance
     assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes
@@ -239,8 +333,8 @@ def test_FracMinHashComparison_fail_threshold(track_abundance):
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -254,7 +348,7 @@
     ds_b = b.flatten().downsample(scaled=cmp_scaled)

     # build FracMinHashComparison
-    cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, threshold_bp=40)
+    cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled, threshold_bp=40)
     assert cmp.mh1 == a
     assert cmp.mh2 == b
     assert cmp.ignore_abundance == False
@@ -267,15 +361,19 @@
     assert cmp.max_containment == ds_a.max_containment(ds_b)
     assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     assert cmp.total_unique_intersect_hashes == 8
-    assert not cmp.pass_threshold # threshold is 40; this should fail
+    assert not cmp.pass_threshold  # threshold is 40; this should fail


 def test_FracMinHashComparison_potential_false_negative():
-    f1 = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
-    f2 = utils.get_test_data('scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')
-    f3 = utils.get_test_data('scaled100/GCF_000783305.1_ASM78330v1_genomic.fna.gz.sig.gz')
+    f1 = utils.get_test_data("scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz")
+    f2 = utils.get_test_data("scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz")
+    f3 = utils.get_test_data(
+        "scaled100/GCF_000783305.1_ASM78330v1_genomic.fna.gz.sig.gz"
+    )
     a = load_one_signature(f1, ksize=21).minhash
     b = load_one_signature(f2).minhash
     c = load_one_signature(f3).minhash
@@ -289,9 +387,17 @@
     cmp.estimate_jaccard_ani()
     assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani
     print(cmp.jaccard_ani)
-    assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold
+    assert (
+        cmp.potential_false_negative
+        == a.jaccard_ani(b).p_exceeds_threshold
+        == b.jaccard_ani(a).p_exceeds_threshold
+    )
     assert cmp.potential_false_negative == False
-    assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold
+    assert (
+        cmp.jaccard_ani_untrustworthy
+        == a.jaccard_ani(b).je_exceeds_threshold
+        == b.jaccard_ani(a).je_exceeds_threshold
+    )

     cmp.estimate_ani_from_mh1_containment_in_mh2()
     a_cont_ani_manual = a.containment_ani(b)
@@ -308,12 +414,18 @@
     cmp.estimate_max_containment_ani()
     mc_ani_manual = a.max_containment_ani(b)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
-    assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani])
+    assert cmp.avg_containment_ani == np.mean(
+        [a.containment_ani(b).ani, b.containment_ani(a).ani]
+    )
     assert cmp.potential_false_negative == False

-    #downsample to where it becomes a potential false negative
+    # downsample to where it becomes a potential false negative
     cmp = FracMinHashComparison(a, b, cmp_scaled=16000)
     cmp.estimate_ani_from_mh1_containment_in_mh2()
     assert cmp.potential_false_negative == True
@@ -323,8 +435,8 @@ def test_FracMinHashComparison_incompatible_ksize(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=2, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -343,8 +455,8 @@ def test_FracMinHashComparison_incompatible_moltype(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 31, scaled=2, is_protein=True, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -363,8 +475,8 @@ def test_FracMinHashComparison_incompatible_sketchtype(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(10, 31, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -384,8 +496,8 @@ def test_FracMinHashComparison_incompatible_cmp_scaled(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 31, scaled=10, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -395,7 +507,7 @@
     b.add_many(b_values.keys())

     with pytest.raises(ValueError) as exc:
-        FracMinHashComparison(a, b, cmp_scaled = 1)
+        FracMinHashComparison(a, b, cmp_scaled=1)
     print(str(exc))
     assert "new scaled 1 is lower than current sample scaled 10" in str(exc)
@@ -404,8 +516,8 @@ def test_FracMinHashComparison_redownsample_without_scaled(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 31, scaled=10, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -429,8 +541,8 @@ def test_NumMinHashComparison(track_abundance):
     a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
     b = MinHash(10, 21, scaled=0, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -453,17 +565,27 @@ def test_NumMinHashComparison(track_abundance):
     intersect_mh = a.flatten().intersection(b.flatten())
     assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten())
     if track_abundance:
-        assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
-        assert cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a)
+        assert (
+            cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
+        )
+        assert (
+            cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a)
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
        with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )


 def test_NumMinHashComparison_downsample(track_abundance):
@@ -471,8 +593,8 @@
     a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
     b = MinHash(10, 21, scaled=0, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -487,7 +609,7 @@
     ds_a = a.downsample(num=cmp_num)
     ds_b = b.downsample(num=cmp_num)
     # build NumMinHashComparison
-    cmp = NumMinHashComparison(a, b, cmp_num = cmp_num)
+    cmp = NumMinHashComparison(a, b, cmp_num=cmp_num)
     assert cmp.mh1 == a
     assert cmp.mh2 == b
     assert cmp.ignore_abundance == False
@@ -496,19 +618,35 @@
     assert cmp.moltype == "DNA"
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     if track_abundance:
-        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
+        assert (
+            cmp.angular_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.cosine_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )


 def test_NumMinHashComparison_autodownsample(track_abundance):
@@ -516,8 +654,8 @@
     a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
     b = MinHash(5, 21, scaled=0, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -541,27 +679,43 @@
     assert cmp.moltype == "DNA"
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     if track_abundance:
-        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
+        assert (
+            cmp.angular_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.cosine_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )


 def test_NumMinHashComparison_incompatible_ksize(track_abundance):
     a_num = MinHash(20, 31, track_abundance=track_abundance)
     b_num = MinHash(10, 21, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a_num.set_abundances(a_values)
@@ -581,8 +735,8 @@ def test_NumMinHashComparison_incompatible_moltype(track_abundance):
     a_num = MinHash(20, 31, track_abundance=track_abundance)
     b_num = MinHash(10, 31, is_protein=True, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a_num.set_abundances(a_values)
@@ -601,8 +755,8 @@ def test_NumMinHashComparison_incompatible_sketchtype(track_abundance):
     a = MinHash(0, 31, scaled=1, track_abundance=track_abundance)
     b = MinHash(10, 31, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -621,8 +775,8 @@ def test_NumMinHashComparison_redownsample_without_num(track_abundance):
     a = MinHash(10, 31, track_abundance=track_abundance)
     b = MinHash(5, 31, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -645,8 +799,8 @@ def test_NumMinHashComparison_incompatible_cmp_num(track_abundance):
     a = MinHash(200, 31, track_abundance=track_abundance)
     b = MinHash(100, 31, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -656,7 +810,7 @@
     b.add_many(b_values.keys())

     with pytest.raises(ValueError) as exc:
-        NumMinHashComparison(a, b, cmp_num = 150)
+        NumMinHashComparison(a, b, cmp_num=150)
     print(str(exc))
     assert "new sample num is higher than current sample num" in str(exc)
@@ -664,11 +818,11 @@ def test_FracMinHashComparison_ANI(track_abundance):
     # need real mh here, small test data fails
     if track_abundance:
-        f1 = utils.get_test_data('track_abund/47.fa.sig')
-        f2 = utils.get_test_data('track_abund/63.fa.sig')
+        f1 = utils.get_test_data("track_abund/47.fa.sig")
+        f2 = utils.get_test_data("track_abund/63.fa.sig")
     else:
-        f1 = utils.get_test_data('47.fa.sig')
-        f2 = utils.get_test_data('63.fa.sig')
+        f1 = utils.get_test_data("47.fa.sig")
+        f2 = utils.get_test_data("63.fa.sig")

     a = load_one_signature(f1, ksize=31).minhash
     b = load_one_signature(f2, ksize=31).minhash
@@ -677,40 +831,54 @@
     # check jaccard ani
     cmp.estimate_jaccard_ani()
     assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani
-    assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold
-    assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold
+    assert (
+        cmp.potential_false_negative
+        == a.jaccard_ani(b).p_exceeds_threshold
+        == b.jaccard_ani(a).p_exceeds_threshold
+    )
+    assert (
+        cmp.jaccard_ani_untrustworthy
+        == a.jaccard_ani(b).je_exceeds_threshold
+        == b.jaccard_ani(a).je_exceeds_threshold
+    )

     cmp.estimate_ani_from_mh1_containment_in_mh2()
     a_cont_ani_manual = a.containment_ani(b)
     assert cmp.ani_from_mh1_containment_in_mh2 == a_cont_ani_manual.ani
     assert cmp.potential_false_negative == a_cont_ani_manual.p_exceeds_threshold
-#    assert cmp.mh1_containment_ani_low is None
-#    assert cmp.mh1_containment_ani_high is None
+    #    assert cmp.mh1_containment_ani_low is None
+    #    assert cmp.mh1_containment_ani_high is None

     cmp.estimate_ani_from_mh2_containment_in_mh1()
     b_cont_ani_manual = b.containment_ani(a)
     assert cmp.ani_from_mh2_containment_in_mh1 == b_cont_ani_manual.ani
     assert cmp.potential_false_negative == b_cont_ani_manual.p_exceeds_threshold
-#    assert cmp.mh2_containment_ani_low is None
-#    assert cmp.mh2_containment_ani_high is None
+    #    assert cmp.mh2_containment_ani_low is None
+    #    assert cmp.mh2_containment_ani_high is None

     cmp.estimate_max_containment_ani()
     mc_ani_manual = a.max_containment_ani(b)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
-#    assert cmp.max_containment_ani_low is None
-#    assert cmp.max_containment_ani_high is None
-    assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani])
+    #    assert cmp.max_containment_ani_low is None
+    #    assert cmp.max_containment_ani_high is None
+    assert cmp.avg_containment_ani == np.mean(
+        [a.containment_ani(b).ani, b.containment_ani(a).ani]
+    )


 def test_FracMinHashComparison_ANI_provide_similarity(track_abundance):
     # need real mh here, small test data fails
     if track_abundance:
-        f1 = utils.get_test_data('track_abund/47.fa.sig')
-        f2 = utils.get_test_data('track_abund/63.fa.sig')
+        f1 = utils.get_test_data("track_abund/47.fa.sig")
+        f2 = utils.get_test_data("track_abund/63.fa.sig")
     else:
-        f1 = utils.get_test_data('47.fa.sig')
-        f2 = utils.get_test_data('63.fa.sig')
+        f1 = utils.get_test_data("47.fa.sig")
+        f2 = utils.get_test_data("63.fa.sig")

     a = load_one_signature(f1, ksize=31).minhash
     b = load_one_signature(f2, ksize=31).minhash
@@ -720,8 +888,16 @@
     jaccard = a.jaccard(b)
     cmp.estimate_jaccard_ani(jaccard=jaccard)
     assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani
-    assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold
-    assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold
+    assert (
+        cmp.potential_false_negative
+        == a.jaccard_ani(b).p_exceeds_threshold
+        == b.jaccard_ani(a).p_exceeds_threshold
+    )
+    assert (
+        cmp.jaccard_ani_untrustworthy
+        == a.jaccard_ani(b).je_exceeds_threshold
+        == b.jaccard_ani(a).je_exceeds_threshold
+    )

     a_cont = a.contained_by(b)
     b_cont = b.contained_by(a)
@@ -739,19 +915,25 @@
     cmp.estimate_max_containment_ani(max_containment=mc)
     mc_ani_manual = a.max_containment_ani(b)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
-    assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani])
+    assert cmp.avg_containment_ani == np.mean(
+        [a.containment_ani(b).ani, b.containment_ani(a).ani]
+    )


 def test_FracMinHashComparison_ANI_estimate_CI(track_abundance):
     # need real mh here, small test data fails
     if track_abundance:
-        f1 = utils.get_test_data('track_abund/47.fa.sig')
-        f2 = utils.get_test_data('track_abund/63.fa.sig')
+        f1 = utils.get_test_data("track_abund/47.fa.sig")
+        f2 = utils.get_test_data("track_abund/63.fa.sig")
     else:
-        f1 = utils.get_test_data('47.fa.sig')
-        f2 = utils.get_test_data('63.fa.sig')
+        f1 = utils.get_test_data("47.fa.sig")
+        f2 = utils.get_test_data("63.fa.sig")

     a = load_one_signature(f1, ksize=31).minhash
     b = load_one_signature(f2, ksize=31).minhash
@@ -759,8 +941,16 @@
     cmp = FracMinHashComparison(a, b, estimate_ani_ci=True)
     cmp.estimate_jaccard_ani()
     assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani
-    assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold
-    assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold
+    assert (
+        cmp.potential_false_negative
+        == a.jaccard_ani(b).p_exceeds_threshold
+        == b.jaccard_ani(a).p_exceeds_threshold
+    )
+    assert (
+        cmp.jaccard_ani_untrustworthy
+        == a.jaccard_ani(b).je_exceeds_threshold
+        == b.jaccard_ani(a).je_exceeds_threshold
+    )

     cmp.estimate_ani_from_mh1_containment_in_mh2()
     a_cont_ani_manual = a.containment_ani(b, estimate_ci=True)
@@ -778,20 +968,24 @@
     cmp.estimate_max_containment_ani()
     mc_ani_manual = a.max_containment_ani(b, estimate_ci=True)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
     assert cmp.max_containment_ani_low == mc_ani_manual.ani_low
-    assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high
+    assert cmp.max_containment_ani_high == mc_ani_manual.ani_high


 def test_FracMinHashComparison_ANI_estimate_CI_ci99(track_abundance):
     # need real mh here, small test data fails
     if track_abundance:
-        f1 = utils.get_test_data('track_abund/47.fa.sig')
-        f2 = utils.get_test_data('track_abund/63.fa.sig')
+        f1 = utils.get_test_data("track_abund/47.fa.sig")
+        f2 = utils.get_test_data("track_abund/63.fa.sig")
     else:
-        f1 = utils.get_test_data('47.fa.sig')
-        f2 = utils.get_test_data('63.fa.sig')
+        f1 = utils.get_test_data("47.fa.sig")
+        f2 = utils.get_test_data("63.fa.sig")

     a = load_one_signature(f1, ksize=31).minhash
     b = load_one_signature(f2, ksize=31).minhash
@@ -814,20 +1008,24 @@
     cmp.estimate_max_containment_ani()
     mc_ani_manual = a.max_containment_ani(b, estimate_ci=True, confidence=0.99)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
     assert cmp.max_containment_ani_low == mc_ani_manual.ani_low
-    assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high
+    assert cmp.max_containment_ani_high == mc_ani_manual.ani_high


 def test_FracMinHashComparison_ANI_downsample(track_abundance):
     # need real mh here, small test data fails
     if track_abundance:
-        f1 = utils.get_test_data('track_abund/47.fa.sig')
-        f2 = utils.get_test_data('track_abund/63.fa.sig')
+        f1 = utils.get_test_data("track_abund/47.fa.sig")
+        f2 = utils.get_test_data("track_abund/63.fa.sig")
     else:
-        f1 = utils.get_test_data('47.fa.sig')
-        f2 = utils.get_test_data('63.fa.sig')
+        f1 = utils.get_test_data("47.fa.sig")
+        f2 = utils.get_test_data("63.fa.sig")

     a = load_one_signature(f1, ksize=31).minhash
     b = load_one_signature(f2, ksize=31).minhash
@@ -841,8 +1039,16 @@
     # check jaccard ani
     cmp.estimate_jaccard_ani()
     assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani
-    assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold
-    assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold
+    assert (
+        cmp.potential_false_negative
+        == a.jaccard_ani(b).p_exceeds_threshold
+        == b.jaccard_ani(a).p_exceeds_threshold
+    )
+    assert (
+        cmp.jaccard_ani_untrustworthy
+        == a.jaccard_ani(b).je_exceeds_threshold
+        == b.jaccard_ani(a).je_exceeds_threshold
+    )

     cmp.estimate_ani_from_mh1_containment_in_mh2()
     a_cont_ani_manual = a.containment_ani(b, estimate_ci=True)
@@ -860,7 +1066,11 @@
     cmp.estimate_max_containment_ani()
     mc_ani_manual = a.max_containment_ani(b, estimate_ci=True)
-    assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani
+    assert (
+        cmp.max_containment_ani
+        == max(a.containment_ani(b).ani, b.containment_ani(a).ani)
+        == mc_ani_manual.ani
+    )
     assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold
     assert cmp.max_containment_ani_low == mc_ani_manual.ani_low
-    assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high
+    assert cmp.max_containment_ani_high == mc_ani_manual.ani_high
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 9ee703f6f7..7aaac0446e 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -26,7 +26,8 @@
 try:
     import matplotlib
-    matplotlib.use('Agg')
+
+    matplotlib.use("Agg")
 except ImportError:
     pass
@@ -40,30 +41,30 @@ def test_citation_file():
     import yaml

     thisdir = os.path.dirname(__file__)
-    citation_file = os.path.join(thisdir, '../CITATION.cff')
+    citation_file = os.path.join(thisdir, "../CITATION.cff")

     with open(citation_file) as fp:
         x = yaml.safe_load(fp)

-    assert x['title'] == "sourmash: a library for MinHash sketching of DNA", x
+    assert x["title"] == "sourmash: a library for MinHash sketching of DNA", x


 def test_run_sourmash():
-    status, out, err = utils.runscript('sourmash', [], fail_ok=True)
-    assert status != 0 # no args provided, ok ;)
+    status, out, err = utils.runscript("sourmash", [], fail_ok=True)
+    assert status != 0  # no args provided, ok ;)


 def test_run_sourmash_badcmd():
-    status, out, err = utils.runscript('sourmash', ['foobarbaz'], fail_ok=True)
-    assert status != 0 # bad arg!
+    status, out, err = utils.runscript("sourmash", ["foobarbaz"], fail_ok=True)
+    assert status != 0  # bad arg!
     assert "cmd: invalid choice" in err


 def test_run_sourmash_subcmd_help():
-    status, out, err = utils.runscript('sourmash', ['sbt'], fail_ok=True)
+    status, out, err = utils.runscript("sourmash", ["sbt"], fail_ok=True)
     print(out)
     print(err)
-    assert status != 0  # should fail
+    assert status != 0  # should fail
     assert "invalid choice:" in err
     assert "'sbt' (choose from" in err
@@ -73,7 +74,7 @@ def test_run_sourmash_subcmd_help():

 def test_sourmash_info():
-    status, out, err = utils.runscript('sourmash', ['info'], fail_ok=False)
+    status, out, err = utils.runscript("sourmash", ["info"], fail_ok=False)

     # no output to stdout
     assert not out
@@ -83,7 +84,7 @@ def test_sourmash_info():

 def test_sourmash_info_verbose():
-    status, out, err = utils.runscript('sourmash', ['info', '-v'])
+    status, out, err = utils.runscript("sourmash", ["info", "-v"])

     # no output to stdout
     assert not out
@@ -94,6 +95,7 @@ def test_sourmash_info_verbose():

 def test_load_pathlist_from_file_does_not_exist():
     from sourmash.sourmash_args import load_pathlist_from_file
+
     with pytest.raises(ValueError) as e:
         load_pathlist_from_file("")
     assert "file '' does not exist" in str(e.value)
@@ -122,7 +124,7 @@ def test_load_pathlist_from_file_badly_formatted(c):

 @utils.in_tempdir
 def test_load_pathlist_from_file_badly_formatted_2(c):
     file_list = c.output("file_list")
-    sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig')
+    sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig")
     with open(file_list, "w") as fp:
         fp.write(sig1 + "\n")
         fp.write("{'a':1}")
@@ -134,12 +136,12 @@ def test_load_pathlist_from_file_badly_formatted_2(c):

 @utils.in_tempdir
 def test_load_pathlist_from_file_duplicate(c):
     file_list = c.output("file_list")
-    sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig')
+    sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig")
     with open(file_list, "w") as fp:
         fp.write(sig1 + "\n")
         fp.write(sig1 + "\n")
     check = load_pathlist_from_file(file_list)
-    print (check)
+    print(check)
     assert len(check) == 1
@@ -147,19 +149,18 @@ def test_compare_serial(runtmp):
     # try doing a compare serially
     c = runtmp

-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs)
+    c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--dna", *testsigs)

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -168,8 +169,7 @@ def test_compare_serial(runtmp):

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     assert (cmp_out == cmp_calc).all()


 def test_compare_serial_distance(runtmp):
     # try doing a compare serially, with --distance output
     c = runtmp

-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs, '--distance')
+    c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--dna", *testsigs, "--distance")

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -199,8 +197,7 @@ def test_compare_serial_distance(runtmp):

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     assert (cmp_out == cmp_calc).all()


 def test_compare_parallel(runtmp):
     # try doing a compare parallel
     c = runtmp

-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna',
-                   "--processes", "2", *testsigs)
+    c.run_sourmash(
+        "compare", "-o", "cmp", "-k", "21", "--dna", "--processes", "2", *testsigs
+    )

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -230,32 +227,31 @@ def test_compare_parallel(runtmp):

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     assert (cmp_out == cmp_calc).all()


 def test_compare_do_serial_compare_with_from_file(runtmp):
     # try doing a compare serial
     c = runtmp
-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    file_list = c.output('file.list')
-    with open(file_list, 'wt') as fp:
+    file_list = c.output("file.list")
+    with open(file_list, "w") as fp:
         print("\n".join(testsigs), file=fp)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna',
-                   '--from-file', file_list)
+    c.run_sourmash(
+        "compare", "-o", "cmp", "-k", "21", "--dna", "--from-file", file_list
+    )

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -264,8 +260,7 @@ def test_compare_do_serial_compare_with_from_file(runtmp):

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     assert numpy.array_equal(numpy.sort(cmp_out.flat), numpy.sort(cmp_calc.flat))
@@ -274,19 +269,18 @@ def test_compare_do_basic_compare_using_rna_arg(runtmp):
     # try doing a basic compare using --rna instead of --dna
     c = runtmp

-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--rna', *testsigs)
+    c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--rna", *testsigs)

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -299,19 +293,18 @@ def test_compare_do_basic_compare_using_rna_arg(runtmp):

 def test_compare_do_basic_using_nucleotide_arg(runtmp):
     # try doing a basic compare using --nucleotide instead of --dna/--rna
     c = runtmp
-    testsigs = utils.get_test_data('genome-s1*.sig')
+    testsigs = utils.get_test_data("genome-s1*.sig")
     testsigs = glob.glob(testsigs)

-    c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--nucleotide', *testsigs)
+    c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--nucleotide", *testsigs)

-    cmp_outfile = c.output('cmp')
+    cmp_outfile = c.output("cmp")
     assert os.path.exists(cmp_outfile)
     cmp_out = numpy.load(cmp_outfile)

     sigs = []
     for fn in testsigs:
-        sigs.append(sourmash.load_one_signature(fn, ksize=21,
-                                                select_moltype='dna'))
+        sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna"))

     cmp_calc = numpy.zeros([len(sigs), len(sigs)])
     for i, si in enumerate(sigs):
@@ -324,22 +317,24 @@ def test_compare_do_basic_using_nucleotide_arg(runtmp):

 def test_compare_quiet(runtmp):
     # test 'compare -q' has no output
     c = runtmp

-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2)
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('compare', 'short.fa.sig',
-                   'short2.fa.sig', '--csv', 'xxx', '-q')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "-q")
     assert not c.last_result.out
     assert not c.last_result.err


 def test_compare_do_traverse_directory_parse_args(runtmp):
     # test 'compare' on a directory, using sourmash.cli.parse_args.
-    import sourmash.commands, sourmash.cli
-    args = sourmash.cli.parse_args(['compare', '-k', '21', '--dna',
-                                    utils.get_test_data('compare')])
+    import sourmash.commands
+    import sourmash.cli
+
+    args = sourmash.cli.parse_args(
+        ["compare", "-k", "21", "--dna", utils.get_test_data("compare")]
+    )

     sourmash.commands.compare(args)
@@ -347,41 +342,39 @@ def test_compare_do_traverse_directory(runtmp):
     # test 'compare' on a directory
     c = runtmp
-    c.run_sourmash('compare', '-k 21',
-                   '--dna', utils.get_test_data('compare'))
+    c.run_sourmash("compare", "-k 21", "--dna", utils.get_test_data("compare"))

     print(c.last_result.out)
-    assert 'genome-s10.fa.gz' in c.last_result.out
-    assert 'genome-s11.fa.gz' in c.last_result.out
+    assert "genome-s10.fa.gz" in c.last_result.out
+    assert "genome-s11.fa.gz" in c.last_result.out


 def test_compare_do_traverse_directory_compare_force(runtmp):
     # test 'compare' on a directory, with -f
     c = runtmp
-    sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig')
-    sig2 = utils.get_test_data('compare/genome-s11.fa.gz.sig')
-    newdir = c.output('newdir')
+    sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig")
+    sig2 = utils.get_test_data("compare/genome-s11.fa.gz.sig")
+    newdir = c.output("newdir")
     os.mkdir(newdir)

-    shutil.copyfile(sig1, os.path.join(newdir, 'sig1'))
-    shutil.copyfile(sig2, os.path.join(newdir, 'sig2'))
+    shutil.copyfile(sig1, os.path.join(newdir, "sig1"))
+    shutil.copyfile(sig2, os.path.join(newdir, "sig2"))

-    c.run_sourmash('compare', '-k 21',
-                   '--dna', newdir, '-f')
+    c.run_sourmash("compare", "-k 21", "--dna", newdir, "-f")

     print(c.last_result.out)
-    assert 'genome-s10.fa.gz' in c.last_result.out
-    assert 'genome-s11.fa.gz' in c.last_result.out
+    assert "genome-s10.fa.gz" in c.last_result.out
+    assert "genome-s11.fa.gz" in c.last_result.out


 def test_compare_output_csv(runtmp):
     # test 'sourmash compare --csv'
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx')
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx")

-    with open(c.output('xxx')) as fp:
+    with open(c.output("xxx")) as fp:
         r = iter(csv.reader(fp))
         row = next(r)
         print(row)
@@ -394,21 +387,20 @@ def test_compare_output_csv(runtmp):
         assert float(row[1]) == 1.0

         # exactly three lines
-        with pytest.raises(StopIteration) as e:
+        with pytest.raises(StopIteration):
             next(r)


 def test_compare_output_csv_gz(runtmp):
     # test 'sourmash compare --csv' with a .gz file
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
-                   '--csv', 'xxx.gz')
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx.gz")

-    with gzip.open(c.output('xxx.gz'), 'rt', newline='') as fp:
+    with gzip.open(c.output("xxx.gz"), "rt", newline="") as fp:
         r = iter(csv.reader(fp))
         row = next(r)
         print(row)
@@ -421,85 +413,97 @@ def test_compare_output_csv_gz(runtmp):
         assert float(row[1]) == 1.0

         # exactly three lines
-        with pytest.raises(StopIteration) as e:
+        with pytest.raises(StopIteration):
             next(r)


 def test_compare_downsample(runtmp):
     # test 'compare' with implicit downsampling
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1)

-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx")

     print(c.last_result.status, c.last_result.out, c.last_result.err)
-    assert 'downsampling to scaled value of 200' in c.last_result.err
-    with open(c.output('xxx')) as fp:
+    assert "downsampling to scaled value of 200" in c.last_result.err
+    with open(c.output("xxx")) as fp:
         lines = fp.readlines()
         assert len(lines) == 3
-        assert lines[1].startswith('1.0,0.6666')
-        assert lines[2].startswith('0.6666')
+        assert lines[1].startswith("1.0,0.6666")
+        assert lines[2].startswith("0.6666")


 def test_compare_downsample_scaled(runtmp):
     # test 'compare' with explicit --scaled downsampling
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1)

-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
-                   '--scaled', '300')
+    c.run_sourmash(
+        "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "--scaled", "300"
+    )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
-    assert 'downsampling to scaled value of 300' in c.last_result.err
-    with open(c.output('xxx')) as fp:
+    assert "downsampling to scaled value of 300" in c.last_result.err
+    with open(c.output("xxx")) as fp:
         lines = fp.readlines()
         assert len(lines) == 3
-        assert lines[1].startswith('1.0,0.0')
-        assert lines[2].startswith('0.0')
+        assert lines[1].startswith("1.0,0.0")
+        assert lines[2].startswith("0.0")


 def test_compare_downsample_scaled_too_low(runtmp):
     # test 'compare' with explicit --scaled downsampling, but lower than min
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1)

-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
-                   '--scaled', '100')
+    c.run_sourmash(
+        "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "--scaled", "100"
+    )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
-    assert 'downsampling to scaled value of 200' in c.last_result.err
-    assert "WARNING: --scaled specified 100, but max scaled of sketches is 200" in c.last_result.err
-    with open(c.output('xxx')) as fp:
+    assert "downsampling to scaled value of 200" in c.last_result.err
+    assert (
+        "WARNING: --scaled specified 100, but max scaled of sketches is 200"
+        in c.last_result.err
+    )
+    with open(c.output("xxx")) as fp:
         lines = fp.readlines()
         assert len(lines) == 3
-        assert lines[1].startswith('1.0,0.6666')
-        assert lines[2].startswith('0.6666')
+        assert lines[1].startswith("1.0,0.6666")
+        assert lines[2].startswith("0.6666")


 def test_compare_downsample_scaled_fail_num(runtmp):
     # test 'compare' with explicit --scaled downsampling; fail on num sketch
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=20", testdata1)

-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
-                       '--csv', 'xxx', '--scaled', '300')
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare",
+            "short.fa.sig",
+            "short2.fa.sig",
+            "--csv",
+            "xxx",
+            "--scaled",
+            "300",
+        )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert "cannot mix scaled signatures with num signatures" in c.last_result.err
@@ -508,75 +512,88 @@ def test_compare_downsample_scaled_fail_num(runtmp):

 def test_compare_downsample_scaled_fail_all_num(runtmp):
     # test 'compare' with explicit --scaled downsampling; fail on all num sketches
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=20", testdata1)

-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=30', testdata2)
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=30", testdata2)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
-                       '--csv', 'xxx', '--scaled', '300')
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare",
+            "short.fa.sig",
+            "short2.fa.sig",
+            "--csv",
+            "xxx",
+            "--scaled",
+            "300",
+        )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
-    assert "ERROR: cannot specify --scaled with non-scaled signatures." in c.last_result.err
+    assert (
+        "ERROR: cannot specify --scaled with non-scaled signatures."
+        in c.last_result.err
+    )


 def test_compare_output_multiple_k(runtmp):
     # test 'compare' when given multiple k-mer sizes -> should fail
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata1)
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", testdata1)
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata2)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
-                       fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", fail_ok=True
+        )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status == -1
-    assert 'multiple k-mer sizes loaded; please specify one' in c.last_result.err
-    assert '(saw k-mer sizes 21, 31)' in c.last_result.err
+    assert "multiple k-mer sizes loaded; please specify one" in c.last_result.err
+    assert "(saw k-mer sizes 21, 31)" in c.last_result.err


 def test_compare_output_multiple_moltype(runtmp):
     # 'compare' should fail when given multiple moltypes
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=21,num=500', testdata1)
-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=21,num=500", testdata1)
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", testdata2)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
-                       fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", fail_ok=True
+        )

     assert c.last_result.status == -1
     print(c.last_result.err)
-    assert 'multiple molecule types loaded;' in c.last_result.err
+    assert "multiple molecule types loaded;" in c.last_result.err


 def test_compare_dayhoff(runtmp):
     # test 'compare' works with dayhoff moltype
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--dayhoff", testdata1)
     assert c.last_result.status == 0

-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata2)
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--dayhoff", testdata2)
     assert c.last_result.status == 0

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
-                   '--dayhoff', '--csv', 'xxx')
-    true_out = '''[1. 0.94]
+    c.run_sourmash(
+        "compare", "short.fa.sig", "short2.fa.sig", "--dayhoff", "--csv", "xxx"
+    )
+    true_out = """[1. 0.94]
 [0.94 1.  ]
-min similarity in matrix: 0.940'''.splitlines()
+min similarity in matrix: 0.940""".splitlines()
     for line in c.last_result.out:
-        cleaned_line = line.split('...')[-1].strip()
+        cleaned_line = line.split("...")[-1].strip()
         cleaned_line in true_out
     assert c.last_result.status == 0
@@ -584,21 +601,20 @@ def test_compare_dayhoff(runtmp):

 def test_compare_hp(runtmp):
     # test that 'compare' works with --hp moltype
     c = runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--hp", testdata1)
     assert c.last_result.status == 0

-    c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata2)
+    c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--hp", testdata2)
     assert c.last_result.status == 0

-    c.run_sourmash('compare', 'short.fa.sig',
-                   'short2.fa.sig', '--hp', '--csv', 'xxx')
-    true_out = '''[1. 0.94]
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--hp", "--csv", "xxx")
+    true_out = """[1. 0.94]
 [0.94 1.  ]
-min similarity in matrix: 0.940'''.splitlines()
+min similarity in matrix: 0.940""".splitlines()
     for line in c.last_result.out:
-        cleaned_line = line.split('...')[-1].strip()
+        cleaned_line = line.split("...")[-1].strip()
         cleaned_line in true_out
     assert c.last_result.status == 0
@@ -607,7 +623,7 @@ def _load_compare_matrix_and_sigs(compare_csv, sigfiles, *, ksize=31):
     # load in the output of 'compare' together with sigs

     # load compare CSV
-    with open(compare_csv, 'rt', newline="") as fp:
+    with open(compare_csv, newline="") as fp:
         r = iter(csv.reader(fp))
         headers = next(r)
@@ -619,7 +635,7 @@ def _load_compare_matrix_and_sigs(compare_csv, sigfiles, *, ksize=31):
     print(mat)

     # load in all the input signatures
-    idx_to_sig = dict()
+    idx_to_sig = {}
     for idx, filename in enumerate(sigfiles):
         ss = sourmash.load_one_signature(filename, ksize=ksize)
         idx_to_sig[idx] = ss
@@ -631,15 +647,17 @@ def test_compare_containment(runtmp):
     # test compare --containment
     c = runtmp

-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    c.run_sourmash('compare', '--containment', '-k', '31',
-                   '--csv', 'output.csv', *testdata_sigs)
+    c.run_sourmash(
+        "compare", "--containment", "-k", "31", "--csv", "output.csv", *testdata_sigs
+    )

     # load the matrix output
-    mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'),
-                                                    testdata_sigs)
+    mat, idx_to_sig = _load_compare_matrix_and_sigs(
+        c.output("output.csv"), testdata_sigs
+    )

     # check explicit containment against output of compare
     for i in range(len(idx_to_sig)):
@@ -657,15 +675,24 @@ def test_compare_containment(runtmp):

 def test_compare_containment_distance(runtmp):
     # test compare --containment --distance-matrix
     c = runtmp

-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    c.run_sourmash('compare', '--containment', '--distance-matrix', '-k', '31',
-                   '--csv', 'output.csv', *testdata_sigs)
+    c.run_sourmash(
+        "compare",
+        "--containment",
+        "--distance-matrix",
+        "-k",
+        "31",
+        "--csv",
+        "output.csv",
+        *testdata_sigs,
+    )

     # load the matrix output
-    mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'),
-                                                    testdata_sigs)
+    mat, idx_to_sig = _load_compare_matrix_and_sigs(
+        c.output("output.csv"), testdata_sigs
+    )

     # check explicit containment against output of compare
     for i in range(len(idx_to_sig)):
@@ -683,15 +710,23 @@ def test_compare_containment_distance(runtmp):

 def test_compare_max_containment(runtmp):
     # test compare --max-containment
     c = runtmp

-    testdata_glob = utils.get_test_data('scaled/*.sig')
+    testdata_glob = utils.get_test_data("scaled/*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    c.run_sourmash('compare', '--max-containment', '-k', '31',
-                   '--csv', 'output.csv', *testdata_sigs)
+    c.run_sourmash(
+        "compare",
+        "--max-containment",
+        "-k",
+        "31",
+        "--csv",
+        "output.csv",
+        *testdata_sigs,
+    )

     # load the matrix output
-    mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'),
-                                                    testdata_sigs)
+    mat, idx_to_sig = _load_compare_matrix_and_sigs(
+        c.output("output.csv"), testdata_sigs
+    )

     # check explicit containment against output of compare
     for i in range(len(idx_to_sig)):
@@ -709,15 +744,23 @@ def test_compare_max_containment(runtmp):

 def test_compare_avg_containment(runtmp):
     # test compare --avg-containment
     c = runtmp

-    testdata_glob = utils.get_test_data('scaled/*.sig')
+    testdata_glob = utils.get_test_data("scaled/*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    c.run_sourmash('compare', '--avg-containment', '-k', '31',
-                   '--csv', 'output.csv', *testdata_sigs)
+    c.run_sourmash(
+        "compare",
+        "--avg-containment",
+        "-k",
+        "31",
+        "--csv",
+        "output.csv",
+        *testdata_sigs,
+    )

     # load the matrix output
-    mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'),
-                                                    testdata_sigs)
+    mat, idx_to_sig = _load_compare_matrix_and_sigs(
+        c.output("output.csv"), testdata_sigs
+    )

     # check explicit containment against output of compare
     for i in range(len(idx_to_sig)):
@@ -735,93 +778,125 @@ def test_compare_avg_containment(runtmp):

 def test_compare_max_containment_and_containment(runtmp):
     # make sure that can't specify both --max-containment and --containment
     c = runtmp

-    testdata_glob = utils.get_test_data('scaled/*.sig')
+    testdata_glob = utils.get_test_data("scaled/*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', '--max-containment', '-k', '31',
-                       '--containment',
-                       '--csv', 'output.csv', *testdata_sigs)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare",
+            "--max-containment",
+            "-k",
+            "31",
+            "--containment",
+            "--csv",
+            "output.csv",
+            *testdata_sigs,
+        )

     print(c.last_result.err)
-    assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err
+    assert (
+        "ERROR: cannot specify more than one containment argument!" in c.last_result.err
+    )


 def test_compare_avg_containment_and_containment(runtmp):
     # make sure that can't specify both --avg-containment and --containment
     c = runtmp

-    testdata_glob = utils.get_test_data('scaled/*.sig')
+    testdata_glob = utils.get_test_data("scaled/*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', '--avg-containment', '-k', '31',
-                       '--containment',
-                       '--csv', 'output.csv', *testdata_sigs)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare",
+            "--avg-containment",
+            "-k",
+            "31",
+            "--containment",
+            "--csv",
+            "output.csv",
+            *testdata_sigs,
+        )

     print(c.last_result.err)
-    assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err
+    assert (
+        "ERROR: cannot specify more than one containment argument!"
+        in c.last_result.err
+    )


 def test_compare_avg_containment_and_max_containment(runtmp):
     # make sure that can't specify both --avg-containment and --max-containment
     c = runtmp

-    testdata_glob = utils.get_test_data('scaled/*.sig')
+    testdata_glob = utils.get_test_data("scaled/*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', '--avg-containment', '-k', '31',
-                       '--max-containment',
-                       '--csv', 'output.csv', *testdata_sigs)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compare",
+            "--avg-containment",
+            "-k",
+            "31",
+            "--max-containment",
+            "--csv",
+            "output.csv",
+            *testdata_sigs,
+        )

     print(c.last_result.err)
-    assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err
+    assert (
+        "ERROR: cannot specify more than one containment argument!" in c.last_result.err
+    )


 def test_compare_containment_abund_flatten_warning(runtmp):
     # check warning message about ignoring abund signatures
-    c = runtmp
-    s47 = utils.get_test_data('track_abund/47.fa.sig')
-    s63 = utils.get_test_data('track_abund/63.fa.sig')
+    c = runtmp
+    s47 = utils.get_test_data("track_abund/47.fa.sig")
+    s63 = utils.get_test_data("track_abund/63.fa.sig")

-    c.run_sourmash('compare', '--containment', '-k', '31', s47, s63)
+    c.run_sourmash("compare", "--containment", "-k", "31", s47, s63)

     print(c.last_result.out)
     print(c.last_result.err)
-    assert 'NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.' in \
-        c.last_result.err
+    assert (
+        "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances."
+        in c.last_result.err
+    )


 def test_compare_ani_abund_flatten(runtmp):
     # check warning message about ignoring abund signatures
     c = runtmp
-    s47 = utils.get_test_data('track_abund/47.fa.sig')
-    s63 = utils.get_test_data('track_abund/63.fa.sig')
+    s47 = utils.get_test_data("track_abund/47.fa.sig")
+    s63 = utils.get_test_data("track_abund/63.fa.sig")

-    c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63)
+    c.run_sourmash("compare", "--estimate-ani", "-k", "31", s47, s63)

     print(c.last_result.out)
     print(c.last_result.err)
-    assert 'NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.' in \
-        c.last_result.err
+    assert (
+        "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances."
+        in c.last_result.err
+    )


 def test_compare_containment_require_scaled(runtmp):
     # check warning message about scaled signatures & containment
     c = runtmp

-    s47 = utils.get_test_data('num/47.fa.sig')
-    s63 = utils.get_test_data('num/63.fa.sig')
+    s47 = utils.get_test_data("num/47.fa.sig")
+    s63 = utils.get_test_data("num/63.fa.sig")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compare', '--containment', '-k', '31', s47, s63,
-                       fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("compare", "--containment", "-k", "31", s47, s63, fail_ok=True)

-    assert 'must use scaled signatures with --containment, --max-containment, and --avg-containment' in \
-        c.last_result.err
+    assert (
+        "must use scaled signatures with --containment, --max-containment, and --avg-containment"
+        in c.last_result.err
+    )
     assert c.last_result.status != 0
@@ -829,13 +904,13 @@ def test_compare_containment_require_scaled(runtmp):

 def test_do_plot_comparison(runtmp):
     # make sure 'plot' outputs files ;)
     c = runtmp

-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp")

-    c.run_sourmash('plot', 'cmp')
+    c.run_sourmash("plot", "cmp")

     assert os.path.exists(c.output("cmp.dendro.png"))
     assert os.path.exists(c.output("cmp.matrix.png"))
@@ -845,13 +920,13 @@ def test_do_plot_comparison(runtmp):

 def test_do_plot_comparison_2_pdf(runtmp):
     # test plot --pdf
     c = runtmp

-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp")

-    c.run_sourmash('plot', 'cmp', '--pdf')
+    c.run_sourmash("plot", "cmp", "--pdf")
     assert os.path.exists(c.output("cmp.dendro.pdf"))
     assert os.path.exists(c.output("cmp.matrix.pdf"))


 def test_do_plot_comparison_3(runtmp):
     # test plot --labels
     c = runtmp

-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp")

-    c.run_sourmash('plot', 'cmp', '--labels')
+    c.run_sourmash("plot", "cmp", "--labels")

     assert os.path.exists(c.output("cmp.dendro.png"))
     assert os.path.exists(c.output("cmp.matrix.png"))
@@ -876,15 +951,15 @@ def test_do_plot_comparison_3(runtmp):

 def test_do_plot_comparison_4_output_dir(runtmp):
     # test plot --output-dir
     c = runtmp

-    output_dir = c.output('xyz_test')
+    output_dir = c.output("xyz_test")

-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp')
+    c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp")

-    c.run_sourmash('plot', 'cmp', '--labels', '--output-dir', output_dir)
+    c.run_sourmash("plot", "cmp", "--labels", "--output-dir", output_dir)

     assert os.path.exists(os.path.join(output_dir, "cmp.dendro.png"))
     assert os.path.exists(os.path.join(output_dir, "cmp.matrix.png"))
@@ -896,13 +971,13 @@ def test_do_plot_comparison_4_output_dir(runtmp):

 def test_do_plot_comparison_5_force(runtmp):
     c = runtmp
     D = numpy.zeros([2, 2])
     D[0, 0] = 5

-    with open(c.output('cmp'), 'wb') as fp:
+    with open(c.output("cmp"), "wb") as fp:
         numpy.save(fp, D)

-    with open(c.output('cmp.labels.txt'), 'wt') as fp:
+    with open(c.output("cmp.labels.txt"), "w") as fp:
         fp.write("a\nb\n")

-    c.run_sourmash('plot', 'cmp', '--labels', '-f')
+    c.run_sourmash("plot", "cmp", "--labels", "-f")

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status == 0
@@ -913,14 +988,14 @@ def test_do_plot_comparison_5_force(runtmp):

 def test_do_plot_comparison_4_fail_not_distance(runtmp):
     c = runtmp
     D = numpy.zeros([2, 2])
     D[0, 0] = 5

-    with open(c.output('cmp'), 'wb') as fp:
+    with open(c.output("cmp"), "wb") as fp:
         numpy.save(fp, D)

-    with open(c.output('cmp.labels.txt'), 'wt') as fp:
+    with open(c.output("cmp.labels.txt"), "w") as fp:
         fp.write("a\nb\n")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('plot', 'cmp', '--labels', fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("plot", "cmp", "--labels", fail_ok=True)

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status != 0
@@ -928,14 +1003,25 @@ def test_do_plot_comparison_4_fail_not_distance(runtmp):

 def test_plot_6_labels_default(runtmp):
     # plot --labels is default
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--labels')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--labels")

     print(runtmp.last_result.out)
@@ -949,14 +1035,25 @@ def test_plot_6_labels_default(runtmp):

 def test_plot_6_labels(runtmp):
     # specifing --labels gives the right result
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--labels')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--labels")

     print(runtmp.last_result.out)
@@ -970,14 +1067,25 @@ def test_plot_6_labels(runtmp):

 def test_plot_6_indices(runtmp):
     # test plot --indices
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--indices')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--indices")

     print(runtmp.last_result.out)
@@ -991,14 +1099,25 @@ def test_plot_6_indices(runtmp):

 def test_plot_6_no_labels(runtmp):
     # test plot --no-labels
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--no-labels')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--no-labels")

     print(runtmp.last_result.out)
@@ -1012,14 +1131,25 @@ def test_plot_6_no_labels(runtmp):

 def test_plot_6_no_indices(runtmp):
     # test plot --no-labels
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--no-labels')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--no-labels")

     print(runtmp.last_result.out)
@@ -1033,14 +1163,25 @@ def test_plot_6_no_indices(runtmp):

 def test_plot_6_no_labels_no_indices(runtmp):
     # test plot --no-labels --no-indices
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--no-labels', '--no-indices')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--no-labels", "--no-indices")

     print((runtmp.last_result.out,))
@@ -1054,14 +1195,25 @@ def test_plot_6_no_labels_no_indices(runtmp):

 def test_plot_6_indices_labels(runtmp):
     # check that --labels --indices => --labels
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--labels', '--indices')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--labels", "--indices")

     print(runtmp.last_result.out)
@@ -1075,21 +1227,32 @@ def test_plot_6_indices_labels(runtmp):

 def test_plot_override_labeltext(runtmp):
     # test overriding labeltext
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    with open(runtmp.output('new.labels.txt'), 'wt') as fp:
-        fp.write('a\nb\nc\nd\n')
-
-    runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.run_sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    with open(runtmp.output("new.labels.txt"), "w") as fp:
+        fp.write("a\nb\nc\nd\n")
+
+    runtmp.sourmash("plot", "cmp", "--labeltext", "new.labels.txt")

     print(runtmp.last_result.out)

-    assert 'loading labels from new.labels.txt' in runtmp.last_result.err
+    assert "loading labels from new.labels.txt" in runtmp.last_result.err

     expected = """\
 0\ta
@@ -1101,46 +1264,59 @@ def test_plot_override_labeltext(runtmp):

 def test_plot_override_labeltext_fail(runtmp):
     # test failed override of labeltext
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    with open(runtmp.output('new.labels.txt'), 'wt') as fp:
-        fp.write('a\nb\nc\n')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    with open(runtmp.output("new.labels.txt"), "w") as fp:
+        fp.write("a\nb\nc\n")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt')
+        runtmp.sourmash("plot", "cmp", "--labeltext", "new.labels.txt")

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     assert runtmp.last_result.status != 0
-    assert 'loading labels from new.labels.txt' in runtmp.last_result.err
-    assert '3 labels != matrix size, exiting' in runtmp.last_result.err
+    assert "loading labels from new.labels.txt" in runtmp.last_result.err
+    assert "3 labels != matrix size, exiting" in runtmp.last_result.err


 def test_plot_reordered_labels_csv(runtmp):
     # test 'plot --csv' & correct ordering of labels
     c = runtmp

-    ss2 = utils.get_test_data('2.fa.sig')
-    ss47 = utils.get_test_data('47.fa.sig')
-    ss63 = utils.get_test_data('63.fa.sig')
+    ss2 = utils.get_test_data("2.fa.sig")
+    ss47 = utils.get_test_data("47.fa.sig")
+    ss63 = utils.get_test_data("63.fa.sig")

-    c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63)
-    c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv')
+    c.run_sourmash("compare", "-k", "31", "-o", "cmp", ss2, ss47, ss63)
+    c.run_sourmash("plot", "cmp", "--csv", "neworder.csv")

-    with open(c.output('neworder.csv'), newline="") as fp:
+    with open(c.output("neworder.csv"), newline="") as fp:
         r = csv.DictReader(fp)

         akker_vals = set()
         for row in r:
-            akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome'])
+            akker_vals.add(
+                row["CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome"]
+            )

-    assert '1.0' in akker_vals
-    assert '0.0' in akker_vals
+    assert "1.0" in akker_vals
+    assert "0.0" in akker_vals
     assert len(akker_vals) == 2
@@ -1148,35 +1324,48 @@ def test_plot_reordered_labels_csv(runtmp):

 def test_plot_reordered_labels_csv_gz(runtmp):
     # test 'plot --csv' with a .gz output
     c = runtmp

-    ss2 = utils.get_test_data('2.fa.sig')
-    ss47 = utils.get_test_data('47.fa.sig')
-    ss63 = utils.get_test_data('63.fa.sig')
+    ss2 = utils.get_test_data("2.fa.sig")
+    ss47 = utils.get_test_data("47.fa.sig")
+    ss63 = utils.get_test_data("63.fa.sig")

-    c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63)
-    c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv.gz')
+    c.run_sourmash("compare", "-k", "31", "-o", "cmp", ss2, ss47, ss63)
+    c.run_sourmash("plot", "cmp", "--csv", "neworder.csv.gz")

-    with gzip.open(c.output('neworder.csv.gz'), 'rt', newline="") as fp:
+    with gzip.open(c.output("neworder.csv.gz"), "rt", newline="") as fp:
         r = csv.DictReader(fp)

         akker_vals = set()
         for row in r:
-            akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome'])
+            akker_vals.add(
+                row["CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome"]
+            )

-    assert '1.0' in akker_vals
-    assert '0.0' in akker_vals
+    assert "1.0" in akker_vals
+    assert "0.0" in akker_vals
     assert len(akker_vals) == 2


 def test_plot_subsample_1(runtmp):
     # test plotting with --subsample
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--subsample', '3')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--subsample", "3")

     print(runtmp.last_result.out)
@@ -1189,14 +1378,25 @@ def test_plot_subsample_1(runtmp):

 def test_plot_subsample_2(runtmp):
     # test plotting --subsample with --subsample-seed
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
-
-    runtmp.sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna')
-
-    runtmp.sourmash('plot', 'cmp', '--subsample', '3', '--subsample-seed=2')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")
+
+    runtmp.sourmash(
+        "compare",
+        testdata1,
+        testdata2,
+        testdata3,
+        testdata4,
+        "-o",
+        "cmp",
+        "-k",
+        "21",
+        "--dna",
+    )
+
+    runtmp.sourmash("plot", "cmp", "--subsample", "3", "--subsample-seed=2")

     print(runtmp.last_result.out)
     expected = """\
@@ -1208,25 +1408,25 @@ def test_plot_subsample_2(runtmp):

 @utils.in_tempdir
 def test_search_query_sig_does_not_exist(c):
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('search', 'short2.fa.sig', 'short.fa.sig', fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("search", "short2.fa.sig", "short.fa.sig", fail_ok=True)

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status == -1
     assert "Cannot open query file 'short2.fa.sig'" in c.last_result.err
-    assert len(c.last_result.err.split('\n\r')) < 5
+    assert len(c.last_result.err.split("\n\r")) < 5


 @utils.in_tempdir
 def test_search_subject_sig_does_not_exist(c):
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", fail_ok=True)

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status == -1
@@ -1235,12 +1435,13 @@ def test_search_subject_sig_does_not_exist(c):

 @utils.in_tempdir
 def test_search_second_subject_sig_does_not_exist(c):
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1)
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1)

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('search', 'short.fa.sig', 'short.fa.sig',
-                       'short2.fa.sig', fail_ok=True)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "search", "short.fa.sig", "short.fa.sig", "short2.fa.sig", fail_ok=True
+        )

     print(c.last_result.status, c.last_result.out, c.last_result.err)
     assert c.last_result.status == -1
@@ -1249,35 +1450,35 @@ def test_search_second_subject_sig_does_not_exist(c):

 @utils.in_tempdir
 def test_search(c):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig')
+    c.run_sourmash("search", "short.fa.sig", "short2.fa.sig")
     print(c.last_result.status, c.last_result.out, c.last_result.err)
-    assert '1 matches' in c.last_result.out
-    assert '93.0%' in c.last_result.out
+    assert "1 matches" in c.last_result.out
+    assert "93.0%" in c.last_result.out


 def test_search_ignore_abundance(runtmp):
     # note: uses num signatures.
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    runtmp.sourmash('sketch', 'dna', '-p','k=31,num=500,abund', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500,abund", testdata1, testdata2)

     # Make sure there's different percent matches when using or
     # not using abundance
-    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig")
     out1 = runtmp.last_result.out
     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
-    assert '1 matches' in runtmp.last_result.out
-    assert '81.5%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "81.5%" in runtmp.last_result.out

-    runtmp.sourmash('search', '--ignore-abundance', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("search", "--ignore-abundance", "short.fa.sig", "short2.fa.sig")
     out2 = runtmp.last_result.out
     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
-    assert '1 matches' in runtmp.last_result.out
-    assert '93.0%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "93.0%" in runtmp.last_result.out

     # Make sure results are different!
     assert out1 != out2
@@ -1285,102 +1486,104 @@ def test_search_ignore_abundance(runtmp):

 def test_search_abund_subj_flat(runtmp):
     # test Index.search_abund requires an abund subj
-    sig47 = utils.get_test_data('track_abund/47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("track_abund/47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.sourmash('search', sig47, sig63)
+        runtmp.sourmash("search", sig47, sig63)

-    assert "'search_abund' requires subject signatures with abundance information" in str(exc.value)
+    assert (
+        "'search_abund' requires subject signatures with abundance information"
+        in str(exc.value)
+    )


 def test_search_abund_csv(runtmp):
     # test search with abundance signatures, look at CSV output
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=1,abund", testdata1, testdata2)

-    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
-    out1 = runtmp.last_result.out
+    runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv")
     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
-    assert '1 matches' in runtmp.last_result.out
-    assert '82.7%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "82.7%" in runtmp.last_result.out

-    with open(runtmp.output('xxx.csv'), newline="") as fp:
+    with open(runtmp.output("xxx.csv"), newline="") as fp:
         r = csv.DictReader(fp)
         row = next(r)
         print(row)
-        assert float(row['similarity']) == 0.8266277454288367
-        assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
-        assert row['filename'].endswith('short2.fa.sig')
-        assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
-        assert row['query_filename'].endswith('short.fa')
-        assert row['query_name'] == ''
-        assert row['query_md5'] == '9191284a'
-        assert row['filename'] == 'short2.fa.sig', row['filename']
+        assert float(row["similarity"]) == 0.8266277454288367
+        assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db"
+        assert row["filename"].endswith("short2.fa.sig")
+        assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db"
+        assert row["query_filename"].endswith("short.fa")
+        assert row["query_name"] == ""
+        assert row["query_md5"] == "9191284a"
+        assert row["filename"] == "short2.fa.sig", row["filename"]


 @utils.in_tempdir
 def test_search_csv(c):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
+    c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv")
     print(c.last_result.status, c.last_result.out, c.last_result.err)

-    csv_file = c.output('xxx.csv')
+    csv_file = c.output("xxx.csv")

     with open(csv_file) as fp:
         reader = csv.DictReader(fp)
         row = next(reader)
         print(row)
-        assert float(row['similarity']) == 0.93
-        assert row['filename'].endswith('short2.fa.sig')
-        assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d'
-        assert row['query_filename'].endswith('short.fa')
-        assert row['query_name'] == ''
-        assert row['query_md5'] == 'e26a306d'
+        assert float(row["similarity"]) == 0.93
+        assert row["filename"].endswith("short2.fa.sig")
+        assert row["md5"] == "914591cd1130aa915fe0c0c63db8f19d"
+        assert row["query_filename"].endswith("short.fa")
+        assert row["query_name"] == ""
+        assert row["query_md5"] == "e26a306d"


 @utils.in_tempdir
 def test_search_lca_db(c):
     # can we do a 'sourmash search' on an LCA database?
-    query = utils.get_test_data('47.fa.sig')
-    lca_db = utils.get_test_data('lca/47+63.lca.json')
+    query = utils.get_test_data("47.fa.sig")
+    lca_db = utils.get_test_data("lca/47+63.lca.json")

-    c.run_sourmash('search', query, lca_db)
+    c.run_sourmash("search", query, lca_db)
     print(c)
-    assert 'NC_009665.1 Shewanella baltica OS185, complete genome' in str(c)
+    assert "NC_009665.1 Shewanella baltica OS185, complete genome" in str(c)


 def test_search_query_db_md5(runtmp):
     # pull a search query out of a database with an md5sum
-    db = utils.get_test_data('prot/protein.sbt.zip')
-    runtmp.run_sourmash('search', db, db, '--md5', '16869d2c8a1')
+    db = utils.get_test_data("prot/protein.sbt.zip")
+    runtmp.run_sourmash("search", db, db, "--md5", "16869d2c8a1")

-    assert '100.0% GCA_001593925' in str(runtmp)
+    assert "100.0% GCA_001593925" in str(runtmp)


 def test_gather_query_db_md5(runtmp, linear_gather, prefetch_gather):
     # pull a search query out of a database with an md5sum
-    db = utils.get_test_data('prot/protein.sbt.zip')
-    runtmp.run_sourmash('gather', db, db, '--md5', '16869d2c8a1',
-                        linear_gather, prefetch_gather)
+    db = utils.get_test_data("prot/protein.sbt.zip")
+    runtmp.run_sourmash(
+        "gather", db, db, "--md5", "16869d2c8a1", linear_gather, prefetch_gather
+    )

-    assert '340.9 kbp 100.0% 100.0% GCA_001593925' in str(runtmp)
+    assert "340.9 kbp 100.0% 100.0% GCA_001593925" in str(runtmp)


 def test_gather_query_db_md5_ambiguous(runtmp, linear_gather, prefetch_gather):
     c = runtmp
     # what if we give an ambiguous md5 prefix?
-    db = utils.get_test_data('prot/protein.sbt.zip')
+    db = utils.get_test_data("prot/protein.sbt.zip")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('gather', db, db, '--md5', '1', linear_gather,
-                       prefetch_gather)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("gather", db, db, "--md5", "1", linear_gather, prefetch_gather)

     err = c.last_result.err
     assert "Error! Multiple signatures start with md5 '1'" in err
@@ -1388,38 +1591,46 @@ def test_gather_query_db_md5_ambiguous(runtmp, linear_gather, prefetch_gather):

 def test_gather_lca_db(runtmp, linear_gather, prefetch_gather):
     # can we do a 'sourmash gather' on an LCA database?
- query = utils.get_test_data('47+63.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("47+63.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") - runtmp.sourmash('gather', query, lca_db, linear_gather, prefetch_gather) + runtmp.sourmash("gather", query, lca_db, linear_gather, prefetch_gather) print(runtmp) out = runtmp.last_result.out - assert 'NC_009665.1 Shewanella baltica OS185' in out - assert 'WARNING: final scaled was 10000, vs query scaled of 1000' in out + assert "NC_009665.1 Shewanella baltica OS185" in out + assert "WARNING: final scaled was 10000, vs query scaled of 1000" in out def test_gather_csv_output_filename_bug(runtmp, linear_gather, prefetch_gather): c = runtmp # check a bug where the database filename in the output CSV was incorrect - query = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db_1 = utils.get_test_data('lca/delmont-1.lca.json') - lca_db_2 = utils.get_test_data('lca/delmont-2.lca.json') - - c.run_sourmash('gather', query, lca_db_1, lca_db_2, '-o', 'out.csv', - linear_gather, prefetch_gather) - with open(c.output('out.csv'), 'rt') as fp: + query = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db_1 = utils.get_test_data("lca/delmont-1.lca.json") + lca_db_2 = utils.get_test_data("lca/delmont-2.lca.json") + + c.run_sourmash( + "gather", + query, + lca_db_1, + lca_db_2, + "-o", + "out.csv", + linear_gather, + prefetch_gather, + ) + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) row = next(r) - assert row['filename'] == lca_db_1 + assert row["filename"] == lca_db_1 def test_compare_no_such_file(runtmp): # 'compare' fails on nonexistent files c = runtmp - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('compare', 'nosuchfile.sig') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "nosuchfile.sig") assert "Error while reading signatures from 'nosuchfile.sig'." in c.last_result.err @@ -1427,8 +1638,8 @@ def test_compare_no_such_file(runtmp): def test_compare_no_such_file_force(runtmp): # can still run compare on nonexistent with -f c = runtmp - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('compare', 'nosuchfile.sig', '-f') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "nosuchfile.sig", "-f") print(c.last_result.err) assert "Error while reading signatures from 'nosuchfile.sig'." @@ -1437,191 +1648,197 @@ def test_compare_no_such_file_force(runtmp): def test_compare_no_matching_sigs(runtmp): # compare fails when no sketches found with desired ksize c = runtmp - query = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + query = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.last_result.status, c.last_result.out, c.last_result.err = \ - c.run_sourmash('compare', '-k', '100', query, fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.last_result.status, c.last_result.out, c.last_result.err = c.run_sourmash( + "compare", "-k", "100", query, fail_ok=True + ) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status - assert 'warning: no signatures loaded at given ksize/molecule type' in c.last_result.err - assert 'no signatures found! exiting.' in c.last_result.err + assert ( + "warning: no signatures loaded at given ksize/molecule type" + in c.last_result.err + ) + assert "no signatures found! exiting." 
in c.last_result.err def test_compare_deduce_molecule(runtmp): # deduce DNA vs protein from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,num=500', testdata1,testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=10,num=500", testdata1, testdata2) - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.91' in runtmp.last_result.out + assert "min similarity in matrix: 0.91" in runtmp.last_result.out def test_compare_choose_molecule_dna(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('compute', '-k', '30', '--dna', '--protein', testdata1, testdata2) + runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - runtmp.sourmash('compare', '--dna', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "--dna", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.938' in runtmp.last_result.out + assert "min similarity in matrix: 0.938" in runtmp.last_result.out def test_compare_choose_molecule_protein(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('compute', '-k', '30', '--dna', '--protein', testdata1, testdata2) + runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - runtmp.sourmash('compare', '--protein', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "--protein", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.91' in runtmp.last_result.out + assert "min similarity in matrix: 0.91" in runtmp.last_result.out def test_compare_no_choose_molecule_fail(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=30,num=500',testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=30,num=500", testdata1) - runtmp.sourmash('sketch', 'protein', '-p', 'k=30,num=500', testdata2) + runtmp.sourmash("sketch", "protein", "-p", "k=30,num=500", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") - assert 'multiple molecule types loaded; please specify' in runtmp.last_result.err + assert "multiple molecule types loaded; please specify" in runtmp.last_result.err assert runtmp.last_result.status != 0 def test_compare_deduce_ksize(runtmp): # deduce ksize, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = 
utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=29,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=29,num=500", testdata1, testdata2) - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.938' in runtmp.last_result.out + assert "min similarity in matrix: 0.938" in runtmp.last_result.out def test_search_deduce_molecule(runtmp): # deduce DNA vs protein from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=10,num=500", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '(k=10, protein)' in runtmp.last_result.err + assert "1 matches" in runtmp.last_result.out + assert "(k=10, protein)" in runtmp.last_result.err def test_search_deduce_ksize(runtmp): # deduce ksize from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert 'k=23' in runtmp.last_result.err + assert "1 matches" in runtmp.last_result.out + assert "k=23" in runtmp.last_result.err def test_do_sourmash_index_multik_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=32,num=500', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=32,num=500", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 def test_do_sourmash_index_multimol_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', testdata1) + runtmp.sourmash("sketch", "translate", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=30,num=500', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=30,num=500", testdata2) with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 def test_do_sourmash_index_multinum_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=1000', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=1000", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert 'trying to build an SBT with incompatible signatures.' in runtmp.last_result.err + assert ( + "trying to build an SBT with incompatible signatures." in runtmp.last_result.err + ) def test_do_sourmash_index_multiscaled_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1) - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert 'trying to build an SBT with incompatible signatures.' in runtmp.last_result.err + assert ( + "trying to build an SBT with incompatible signatures." 
in runtmp.last_result.err + ) @utils.in_tempdir def test_do_sourmash_index_multiscaled_rescale(c): # test sourmash index --scaled - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + c.run_sourmash("sketch", "dna", "-p", "scaled=10", testdata1) + c.run_sourmash("sketch", "dna", "-p", "scaled=1", testdata2) - c.run_sourmash('index', 'zzz', - 'short.fa.sig', - 'short2.fa.sig', - '-k', '31', - '--scaled', '10') + c.run_sourmash( + "index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31", "--scaled", "10" + ) print(c) assert c.last_result.status == 0 @@ -1630,190 +1847,202 @@ def test_do_sourmash_index_multiscaled_rescale(c): @utils.in_tempdir def test_do_sourmash_index_multiscaled_rescale_fail(c): # test sourmash index --scaled with invalid rescaling (10 -> 5) - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + c.run_sourmash("sketch", "dna", "-p", "scaled=10", testdata1) + c.run_sourmash("sketch", "dna", "-p", "scaled=1", testdata2) # this should fail: cannot go from a scaled value of 10 to 5 with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('index', 'zzz', - 'short.fa.sig', - 'short2.fa.sig', - '-k', '31', - '--scaled', '5') + c.run_sourmash( + "index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31", "--scaled", "5" + ) print(e.value) assert c.last_result.status == -1 - assert 'new scaled 5 is lower than current sample scaled 10' in c.last_result.err + assert "new scaled 5 is lower than current sample scaled 10" in c.last_result.err def test_do_sourmash_sbt_search_output(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1,testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', '-o', 'foo') + runtmp.sourmash("search", "short.fa.sig", "zzz", "-o", "foo") - output = Path(runtmp.output('foo')).read_text() + output = Path(runtmp.output("foo")).read_text() print(output) - assert 'e26a306d26512' in output - assert '914591cd1130aa915' in output + assert "e26a306d26512" in output + assert "914591cd1130aa915" in output # check against a bug in sbt search triggered by incorrect max Jaccard # calculation. 
 def test_do_sourmash_sbt_search_check_bug(runtmp):
     # mins: 431
-    testdata1 = utils.get_test_data('sbt-search-bug/nano.sig')
+    testdata1 = utils.get_test_data("sbt-search-bug/nano.sig")

     # mins: 6264
-    testdata2 = utils.get_test_data('sbt-search-bug/bacteroides.sig')
+    testdata2 = utils.get_test_data("sbt-search-bug/bacteroides.sig")

-    runtmp.sourmash('index', 'zzz', testdata1, testdata2, '-k', '31')
+    runtmp.sourmash("index", "zzz", testdata1, testdata2, "-k", "31")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

-    runtmp.sourmash('search', testdata1, 'zzz')
+    runtmp.sourmash("search", testdata1, "zzz")

-    assert '1 matches' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out

-    tree = load_sbt_index(runtmp.output('zzz.sbt.zip'))
-    assert tree._nodes[0].metadata['min_n_below'] == 431
+    tree = load_sbt_index(runtmp.output("zzz.sbt.zip"))
+    assert tree._nodes[0].metadata["min_n_below"] == 431


 def test_do_sourmash_sbt_search_empty_sig(runtmp):
     # mins: 431
-    testdata1 = utils.get_test_data('sbt-search-bug/nano.sig')
+    testdata1 = utils.get_test_data("sbt-search-bug/nano.sig")

     # mins: 0
-    testdata2 = utils.get_test_data('sbt-search-bug/empty.sig')
+    testdata2 = utils.get_test_data("sbt-search-bug/empty.sig")

-    runtmp.sourmash('index', 'zzz', testdata1, testdata2, '-k', '31')
+    runtmp.sourmash("index", "zzz", testdata1, testdata2, "-k", "31")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

-    runtmp.sourmash('search', testdata1, 'zzz')
+    runtmp.sourmash("search", testdata1, "zzz")

-    assert '1 matches' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out

-    tree = load_sbt_index(runtmp.output('zzz.sbt.zip'))
-    assert tree._nodes[0].metadata['min_n_below'] == 1
+    tree = load_sbt_index(runtmp.output("zzz.sbt.zip"))
+    assert tree._nodes[0].metadata["min_n_below"] == 1


 def test_do_sourmash_sbt_move_and_search_output(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1,testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    runtmp.sourmash('index', 'zzz.sbt.json', 'short.fa.sig', 'short2.fa.sig', '-k', '31')
+    runtmp.sourmash(
+        "index", "zzz.sbt.json", "short.fa.sig", "short2.fa.sig", "-k", "31"
+    )

-    assert os.path.exists(runtmp.output('zzz.sbt.json'))
+    assert os.path.exists(runtmp.output("zzz.sbt.json"))
     print(runtmp.last_result.out)

-    with open(runtmp.output('zzz.sbt.json')) as fp:
+    with open(runtmp.output("zzz.sbt.json")) as fp:
         d = json.load(fp)
-        assert d['storage']['args']['path'] == '.sbt.zzz'
+        assert d["storage"]["args"]["path"] == ".sbt.zzz"

-    newpath = runtmp.output('subdir')
+    newpath = runtmp.output("subdir")
     os.mkdir(newpath)

     # move both JSON file and subdirectory.
-    shutil.move(runtmp.output('zzz.sbt.json'), newpath)
-    shutil.move(runtmp.output('.sbt.zzz'), newpath)
+    shutil.move(runtmp.output("zzz.sbt.json"), newpath)
+    shutil.move(runtmp.output(".sbt.zzz"), newpath)

-    status, out, err = utils.runscript('sourmash',
-                                       ['search', '../short.fa.sig',
-                                        'zzz.sbt.json', '-o', 'foo'],
-                                       in_directory=newpath)
+    status, out, err = utils.runscript(
+        "sourmash",
+        ["search", "../short.fa.sig", "zzz.sbt.json", "-o", "foo"],
+        in_directory=newpath,
+    )

-    output = Path(os.path.join(newpath, 'foo')).read_text()
+    output = Path(os.path.join(newpath, "foo")).read_text()
     print(output)
-    assert '914591cd1130aa91' in output
-    assert 'e26a306d2651' in output
+    assert "914591cd1130aa91" in output
+    assert "e26a306d2651" in output


 def test_search_deduce_ksize_and_select_appropriate(runtmp):
     # deduce ksize from query and select correct signature from DB
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'translate', '-p', 'k=24,num=500', testdata1)
+    runtmp.sourmash("sketch", "translate", "-p", "k=24,num=500", testdata1)

     # The DB contains signatres for multiple ksizes
-    runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', '-p', 'k=24,num=500', testdata2)
+    runtmp.sourmash(
+        "sketch", "translate", "-p", "k=23,num=500", "-p", "k=24,num=500", testdata2
+    )

-    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert 'k=24' in runtmp.last_result.err
+    assert "1 matches" in runtmp.last_result.out
+    assert "k=24" in runtmp.last_result.err


 def test_search_deduce_ksize_not_unique(runtmp):
     # deduce ksize from query, fail because it is not unique
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-
-    runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', '-p', 'k=25,num=500', testdata1, testdata2)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+
+    runtmp.sourmash(
+        "sketch",
+        "translate",
+        "-p",
+        "k=23,num=500",
+        "-p",
+        "k=25,num=500",
+        testdata1,
+        testdata2,
+    )

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig')
+        runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

     assert runtmp.last_result.status == -1
-    assert '2 signatures matching ksize' in runtmp.last_result.err
+    assert "2 signatures matching ksize" in runtmp.last_result.err


 @utils.in_tempdir
 def test_search_deduce_ksize_no_match(c):
     # no matching sigs in search sig list
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    c.run_sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1)
-    c.run_sourmash('sketch', 'translate', '-p', 'k=25,num=500', testdata2)
+    c.run_sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1)
+    c.run_sourmash("sketch", "translate", "-p", "k=25,num=500", testdata2)

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig')
+        c.run_sourmash("search", "short.fa.sig", "short2.fa.sig")

     assert "no compatible signatures found in 'short2.fa.sig'" in str(exc.value)


 def test_search_deduce_ksize_vs_user_specified(runtmp):
     # user specified ksize is not available
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1, testdata2)
+    runtmp.sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1, testdata2)

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', '-k', '24', 'short.fa.sig', 'short2.fa.sig')
+        runtmp.sourmash("search", "-k", "24", "short.fa.sig", "short2.fa.sig")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

     assert runtmp.last_result.status == -1
-    assert '0 signatures matching ksize' in runtmp.last_result.err
+    assert "0 signatures matching ksize" in runtmp.last_result.err


 def test_search_containment(runtmp):
     # search with --containment in signatures
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata1, testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2)

-    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '--containment')
+    runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert '95.6%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "95.6%" in runtmp.last_result.out


 def test_search_containment_abund(runtmp):
@@ -1830,28 +2059,34 @@ def test_search_containment_abund(runtmp):
     mh2.add_many((1, 5))

     # build signatures
-    x = sourmash.SourmashSignature(mh1, name='a')
-    y = sourmash.SourmashSignature(mh2, name='b')
+    x = sourmash.SourmashSignature(mh1, name="a")
+    y = sourmash.SourmashSignature(mh2, name="b")

     # save!
-    with open(runtmp.output('a.sig'), 'wt') as fp:
+    with open(runtmp.output("a.sig"), "w") as fp:
         sourmash.save_signatures([x], fp)
-    with open(runtmp.output('b.sig'), 'wt') as fp:
+    with open(runtmp.output("b.sig"), "w") as fp:
         sourmash.save_signatures([y], fp)

     # run sourmash search --containment
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv',
-                        '--containment')
+        runtmp.sourmash("search", "a.sig", "b.sig", "-o", "xxx.csv", "--containment")

-    assert "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" in str(exc)
+    assert (
+        "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?"
+        in str(exc)
+    )

     # run sourmash search --max-containment
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv',
-                        '--max-containment')
+        runtmp.sourmash(
+            "search", "a.sig", "b.sig", "-o", "xxx.csv", "--max-containment"
+        )

-    assert "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" in str(exc)
+    assert (
+        "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?"
+        in str(exc)
+    )


 def test_search_containment_abund_ignore(runtmp):
@@ -1868,25 +2103,32 @@ def test_search_containment_abund_ignore(runtmp):
     mh2.add_many((1, 5))

     # build signatures
-    x = sourmash.SourmashSignature(mh1, name='a')
-    y = sourmash.SourmashSignature(mh2, name='b')
+    x = sourmash.SourmashSignature(mh1, name="a")
+    y = sourmash.SourmashSignature(mh2, name="b")

     # save!
-    with open(runtmp.output('a.sig'), 'wt') as fp:
+    with open(runtmp.output("a.sig"), "w") as fp:
         sourmash.save_signatures([x], fp)
-    with open(runtmp.output('b.sig'), 'wt') as fp:
+    with open(runtmp.output("b.sig"), "w") as fp:
         sourmash.save_signatures([y], fp)

     # run sourmash search
-    runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv',
-                    '--containment', '--ignore-abundance')
+    runtmp.sourmash(
+        "search",
+        "a.sig",
+        "b.sig",
+        "-o",
+        "xxx.csv",
+        "--containment",
+        "--ignore-abundance",
+    )

     # check results
-    with open(runtmp.output('xxx.csv'), 'rt') as fp:
+    with open(runtmp.output("xxx.csv")) as fp:
         r = csv.DictReader(fp)
         row = next(r)
-        similarity = row['similarity']
-        print(f'search output: similarity is {similarity}')
+        similarity = row["similarity"]
+        print(f"search output: similarity is {similarity}")

     print(mh1.contained_by(mh2))
     assert float(similarity) == mh1.contained_by(mh2)

@@ -1895,150 +2137,154 @@ def test_search_containment_abund_ignore(runtmp):

 def test_search_containment_sbt(runtmp):
     # search with --containment in an SBT
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata1, testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2)

-    runtmp.sourmash('index', '-k', '31', 'zzz', 'short2.fa.sig')
+    runtmp.sourmash("index", "-k", "31", "zzz", "short2.fa.sig")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

-    runtmp.sourmash('search', 'short.fa.sig', 'zzz', '--containment')
+    runtmp.sourmash("search", "short.fa.sig", "zzz", "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert '95.6%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "95.6%" in runtmp.last_result.out


 def test_search_containment_s10(runtmp):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig")

-    runtmp.sourmash('search', q1, q2, '--containment')
+    runtmp.sourmash("search", q1, q2, "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert '16.7%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "16.7%" in runtmp.last_result.out


 def test_search_containment_s10_no_max(run):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        run.run_sourmash('search', q1, q2, '--containment',
-                         '--max-containment')
+    with pytest.raises(SourmashCommandFailed):
+        run.run_sourmash("search", q1, q2, "--containment", "--max-containment")

     print(run.last_result.out)
     print(run.last_result.err)
-    assert "ERROR: cannot specify both --containment and --max-containment!" in run.last_result.err
+    assert (
+        "ERROR: cannot specify both --containment and --max-containment!"
+        in run.last_result.err
+    )


 def test_search_max_containment_s10_pairwise(runtmp):
     # check --max-containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig")

-    runtmp.sourmash('search', q1, q2,'--max-containment')
+    runtmp.sourmash("search", q1, q2, "--max-containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert '100.0%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "100.0%" in runtmp.last_result.out


 def test_search_containment_s10_siglist(runtmp):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/*.sig')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/*.sig")
     q2 = glob.glob(q2)

-    runtmp.sourmash('search', q1, *q2, '--containment')
+    runtmp.sourmash("search", q1, *q2, "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert ' 16.7% ../genome-s10-small.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert " 16.7% ../genome-s10-small.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out


 def test_search_max_containment_s10_siglist(runtmp):
     # check --max-containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/*.sig')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/*.sig")
     q2 = glob.glob(q2)

-    runtmp.sourmash('search', q1, *q2, '--max-containment')
+    runtmp.sourmash("search", q1, *q2, "--max-containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert '100.0% ../genome-s10-small.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert "100.0% ../genome-s10-small.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out


 def test_search_containment_s10_sbt(runtmp):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--containment')
+    runtmp.sourmash("search", q1, q2, "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out
-    assert ' 16.7% ../genome-s10-small.fa.gz' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out
+    assert " 16.7% ../genome-s10-small.fa.gz" in runtmp.last_result.out


 def test_search_containment_s10_sbt_best_only(runtmp):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--containment', '--best-only')
+    runtmp.sourmash("search", q1, q2, "--containment", "--best-only")

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert '100.0% ' in runtmp.last_result.out # there are at least two perfect matches!
+    assert (
+        "100.0% " in runtmp.last_result.out
+    )  # there are at least two perfect matches!
     assert runtmp.last_result.status == 0


 def test_search_containment_s10_sbt_empty(runtmp):
     # check --containment for s10/s10-small at absurd scaled/empty mh
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--scaled', '1e7', '--containment')
+    runtmp.sourmash("search", q1, q2, "--scaled", "1e7", "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '0 matches' in runtmp.last_result.out
+    assert "0 matches" in runtmp.last_result.out


 def test_search_max_containment_s10_sbt(runtmp):
     # check --max-containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--max-containment')
+    runtmp.sourmash("search", q1, q2, "--max-containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert '100.0% ../genome-s10-small.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out
-    assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert "100.0% ../genome-s10-small.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out
+    assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out


 def test_search_max_containment_s10_sbt_best_only(runtmp):
     # check --max-containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--max-containment', '--best-only')
+    runtmp.sourmash("search", q1, q2, "--max-containment", "--best-only")

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

@@ -2048,120 +2294,142 @@ def test_search_max_containment_s10_sbt_best_only(runtmp):
 def test_search_max_containment_s10_sbt_empty(runtmp):
     # check --max-containment for s10/s10-small at absurd scaled/empty mh.
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.sbt.zip')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.sbt.zip")

-    runtmp.sourmash('search', q1, q2, '--scaled', '1e7', '--max-containment')
+    runtmp.sourmash("search", q1, q2, "--scaled", "1e7", "--max-containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '0 matches' in runtmp.last_result.out
+    assert "0 matches" in runtmp.last_result.out


 def test_search_containment_s10_lca(runtmp):
     # check --containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.lca.json')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.lca.json")

-    runtmp.sourmash('search', q1, q2, '--containment')
+    runtmp.sourmash("search", q1, q2, "--containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert '100.0% 455c2f95' in runtmp.last_result.out
-    assert '100.0% 684aa226' in runtmp.last_result.out
-    assert ' 16.7% 7f7835d2' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert "100.0% 455c2f95" in runtmp.last_result.out
+    assert "100.0% 684aa226" in runtmp.last_result.out
+    assert " 16.7% 7f7835d2" in runtmp.last_result.out


 def test_search_max_containment_s10_lca(runtmp):
     # check --max-containment for s10/s10-small
-    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
-    q2 = utils.get_test_data('scaled/all.lca.json')
+    q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig")
+    q2 = utils.get_test_data("scaled/all.lca.json")

-    runtmp.sourmash('search', q1, q2, '--max-containment')
+    runtmp.sourmash("search", q1, q2, "--max-containment")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '3 matches' in runtmp.last_result.out
-    assert '100.0% 455c2f95' in runtmp.last_result.out
-    assert '100.0% 684aa226' in runtmp.last_result.out
-    assert '100.0% 7f7835d2' in runtmp.last_result.out
+    assert "3 matches" in runtmp.last_result.out
+    assert "100.0% 455c2f95" in runtmp.last_result.out
+    assert "100.0% 684aa226" in runtmp.last_result.out
+    assert "100.0% 7f7835d2" in runtmp.last_result.out


 def test_search_gzip(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    data = Path(runtmp.output('short.fa.sig')).read_bytes()
-    with gzip.open(runtmp.output('zzz.gz'), 'wb') as fp:
+    data = Path(runtmp.output("short.fa.sig")).read_bytes()
+    with gzip.open(runtmp.output("zzz.gz"), "wb") as fp:
         fp.write(data)

-    data = Path(runtmp.output('short2.fa.sig')).read_bytes()
-    with gzip.open(runtmp.output('yyy.gz'), 'wb') as fp:
+    data = Path(runtmp.output("short2.fa.sig")).read_bytes()
+    with gzip.open(runtmp.output("yyy.gz"), "wb") as fp:
         fp.write(data)

-    runtmp.sourmash('search', 'zzz.gz', 'yyy.gz')
+    runtmp.sourmash("search", "zzz.gz", "yyy.gz")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '1 matches' in runtmp.last_result.out
-    assert '93.0%' in runtmp.last_result.out
+    assert "1 matches" in runtmp.last_result.out
+    assert "93.0%" in runtmp.last_result.out


 def test_search_2(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")

-    runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3)
+    runtmp.sourmash(
+        "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3
+    )

-    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig')
+    runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "short3.fa.sig")

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '2 matches' in runtmp.last_result.out
-    assert '93.0%' in runtmp.last_result.out
-    assert '89.6%' in runtmp.last_result.out
+    assert "2 matches" in runtmp.last_result.out
+    assert "93.0%" in runtmp.last_result.out
+    assert "89.6%" in runtmp.last_result.out


 def test_search_3(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")

-    runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3)
+    runtmp.sourmash(
+        "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3
+    )

-    runtmp.sourmash('search', '-n', '1', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig')
+    runtmp.sourmash(
+        "search", "-n", "1", "short.fa.sig", "short2.fa.sig", "short3.fa.sig"
+    )

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '2 matches above threshold 0.080; showing first 1:' in runtmp.last_result.out
+    assert "2 matches above threshold 0.080; showing first 1:" in runtmp.last_result.out


 def test_search_4(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")

-    runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3)
+    runtmp.sourmash(
+        "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3
+    )

-    runtmp.sourmash('search', '-n', '0', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig')
+    runtmp.sourmash(
+        "search", "-n", "0", "short.fa.sig", "short2.fa.sig", "short3.fa.sig"
+    )

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '2 matches above threshold 0.080:' in runtmp.last_result.out
-    assert 'short2.fa' in runtmp.last_result.out
-    assert 'short3.fa' in runtmp.last_result.out
+    assert "2 matches above threshold 0.080:" in runtmp.last_result.out
+    assert "short2.fa" in runtmp.last_result.out
+    assert "short3.fa" in runtmp.last_result.out


 def test_search_5_num_results(runtmp):
-    query = utils.get_test_data('gather/combined.sig')
-    against = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+    query = utils.get_test_data("gather/combined.sig")
+    against = glob.glob(utils.get_test_data("gather/GCF*.sig"))

-    runtmp.sourmash('search', '-n', '5', query, *against)
+    runtmp.sourmash("search", "-n", "5", query, *against)

     print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)

-    assert '12 matches above threshold 0.080; showing first 5:' in runtmp.last_result.out
+    assert (
+        "12 matches above threshold 0.080; showing first 5:" in runtmp.last_result.out
+    )


 def test_index_check_scaled_bounds_negative(runtmp):
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '-5', '--dna')
+        runtmp.sourmash(
+            "index",
+            "zzz",
+            "short.fa.sig",
+            "short2.fa.sig",
+            "-k",
+            "31",
+            "--scaled",
+            "-5",
+            "--dna",
+        )

     print(runtmp.last_result.err)

@@ -2170,37 +2438,70 @@ def test_index_check_scaled_bounds_negative(runtmp):

 def test_index_check_scaled_bounds_less_than_minimum(runtmp):
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '50', '--dna')
-
-    assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err
+        runtmp.sourmash(
+            "index",
+            "zzz",
+            "short.fa.sig",
+            "short2.fa.sig",
+            "-k",
+            "31",
+            "--scaled",
+            "50",
+            "--dna",
+        )
+
+    assert (
+        "WARNING: scaled value should be >= 100. Continuing anyway."
+        in runtmp.last_result.err
+    )


 def test_index_check_scaled_bounds_more_than_maximum(runtmp):
     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '1e9', '--dna')
-
-    assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err
+        runtmp.sourmash(
+            "index",
+            "zzz",
+            "short.fa.sig",
+            "short2.fa.sig",
+            "-k",
+            "31",
+            "--scaled",
+            "1e9",
+            "--dna",
+        )
+
+    assert (
+        "WARNING: scaled value should be <= 1e6. Continuing anyway."
+        in runtmp.last_result.err
+    )


 @utils.in_tempdir
 def test_index_metagenome_fromfile(c):
     # test index --from-file
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     # construct a file list
-    with open(c.output('sig.list'), 'wt') as fp:
+    with open(c.output("sig.list"), "w") as fp:
         fp.write("\n".join(testdata_sigs))

-    cmd = ['index', 'gcf_all', testdata_sigs[0], '-k', '21',
-           '--from-file', c.output('sig.list')]
+    cmd = [
+        "index",
+        "gcf_all",
+        testdata_sigs[0],
+        "-k",
+        "21",
+        "--from-file",
+        c.output("sig.list"),
+    ]
     c.run_sourmash(*cmd)

-    assert os.path.exists(c.output('gcf_all.sbt.zip'))
+    assert os.path.exists(c.output("gcf_all.sbt.zip"))

-    cmd = 'search {} gcf_all -k 21'.format(query_sig)
+    cmd = f"search {query_sig} gcf_all -k 21"
     cmd = cmd.split()
     c.run_sourmash(*cmd)

@@ -2208,28 +2509,31 @@ def test_index_metagenome_fromfile(c):
     print(out)
     print(c.last_result.err)

-    assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out
-    assert '12 matches above threshold 0.080; showing first 3:' in out
+    assert (
+        " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T..."
+        in out
+    )
+    assert "12 matches above threshold 0.080; showing first 3:" in out
+

 @utils.in_tempdir
 def test_index_metagenome_fromfile_no_cmdline_sig(c):
     # test index --from-file
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     # construct a file list
-    with open(c.output('sig.list'), 'wt') as fp:
+    with open(c.output("sig.list"), "w") as fp:
         fp.write("\n".join(testdata_sigs))

-    cmd = ['index', 'gcf_all', '-k', '21',
-           '--from-file', c.output('sig.list')]
+    cmd = ["index", "gcf_all", "-k", "21", "--from-file", c.output("sig.list")]
     c.run_sourmash(*cmd)

-    assert os.path.exists(c.output('gcf_all.sbt.zip'))
+    assert os.path.exists(c.output("gcf_all.sbt.zip"))

-    cmd = 'search {} gcf_all -k 21'.format(query_sig)
+    cmd = f"search {query_sig} gcf_all -k 21"
     cmd = cmd.split()
     c.run_sourmash(*cmd)

@@ -2237,81 +2541,98 @@ def test_index_metagenome_fromfile_no_cmdline_sig(c):
     print(out)
     print(c.last_result.err)

-    assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in out
-    assert '12 matches above threshold 0.080; showing first 3:' in out
+    assert (
+        " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T" in out
+    )
+    assert "12 matches above threshold 0.080; showing first 3:" in out


 def test_search_metagenome(runtmp):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))

-    runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21')
+    runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21")

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out
-    assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out
+    assert (
+        " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T"
+        in runtmp.last_result.out
+    )
+    assert (
+        "12 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out
+    )


 def test_search_metagenome_traverse(runtmp):
-    testdata_dir = utils.get_test_data('gather')
+    testdata_dir = utils.get_test_data("gather")

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    runtmp.sourmash('search', query_sig, testdata_dir, '-k', '21')
+    runtmp.sourmash("search", query_sig, testdata_dir, "-k", "21")

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out
-    assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out
+    assert (
+        " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T"
+        in runtmp.last_result.out
+    )
+    assert (
+        "13 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out
+    )


 def test_search_metagenome_traverse_check_csv(runtmp):
     # this test confirms that the CSV 'filename' output for signatures loaded
     # via directory traversal properly contains the actual path to the
     # signature file from which the signature was loaded.
-    testdata_dir = utils.get_test_data('gather')
+    testdata_dir = utils.get_test_data("gather")

-    query_sig = utils.get_test_data('gather/combined.sig')
-    out_csv = runtmp.output('out.csv')
+    query_sig = utils.get_test_data("gather/combined.sig")
+    out_csv = runtmp.output("out.csv")

-    runtmp.sourmash('search', query_sig, testdata_dir, '-k', '21', '-o', out_csv)
+    runtmp.sourmash("search", query_sig, testdata_dir, "-k", "21", "-o", out_csv)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    with open(out_csv, 'rt') as fp:
+    with open(out_csv) as fp:
         prefix_len = len(testdata_dir)
         r = csv.DictReader(fp)
         for row in r:
             print(row)
-            filename = row['filename']
+            filename = row["filename"]
             assert filename.startswith(testdata_dir), filename
             # should have full path to file sig was loaded from
             assert len(filename) > prefix_len

-    assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out
-    assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out
+    assert (
+        " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T"
+        in runtmp.last_result.out
+    )
+    assert (
+        "13 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out
+    )


 @utils.in_thisdir
 def test_search_incompatible(c):
-    num_sig = utils.get_test_data('num/47.fa.sig')
-    scaled_sig = utils.get_test_data('47.fa.sig')
+    num_sig = utils.get_test_data("num/47.fa.sig")
+    scaled_sig = utils.get_test_data("47.fa.sig")

-    with pytest.raises(SourmashCommandFailed) as exc:
+    with pytest.raises(SourmashCommandFailed):
        c.run_sourmash("search", scaled_sig, num_sig, fail_ok=True)

     assert c.last_result.status != 0
     print(c.last_result.out)

@@ -2324,52 +2645,61 @@ def test_search_incompatible(c):
 def test_search_traverse_incompatible(c):
     # build a directory with some signatures in it, search for compatible
     # signatures.
-    searchdir = c.output('searchme')
+    searchdir = c.output("searchme")
     os.mkdir(searchdir)

-    num_sig = utils.get_test_data('num/47.fa.sig')
-    scaled_sig = utils.get_test_data('47.fa.sig')
-    shutil.copyfile(num_sig, c.output('searchme/num.sig'))
-    shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig'))
+    num_sig = utils.get_test_data("num/47.fa.sig")
+    scaled_sig = utils.get_test_data("47.fa.sig")
+    shutil.copyfile(num_sig, c.output("searchme/num.sig"))
+    shutil.copyfile(scaled_sig, c.output("searchme/scaled.sig"))

-    c.run_sourmash("search", scaled_sig, c.output('searchme'))
-    assert '100.0% NC_009665.1 Shewanella baltica OS185, complete genome' in c.last_result.out
+    c.run_sourmash("search", scaled_sig, c.output("searchme"))
+    assert (
+        "100.0% NC_009665.1 Shewanella baltica OS185, complete genome"
+        in c.last_result.out
+    )


 def test_search_check_scaled_bounds_negative(runtmp):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '-5')
+        runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "-5")

     assert "ERROR: scaled value must be positive" in runtmp.last_result.err


 def test_search_check_scaled_bounds_less_than_minimum(runtmp):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '50')
+        runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "50")

-    assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err
+    assert (
+        "WARNING: scaled value should be >= 100. Continuing anyway."
+        in runtmp.last_result.err
+    )


 def test_search_check_scaled_bounds_more_than_maximum(runtmp):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '1e9')
+        runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "1e9")

-    assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err
+    assert (
+        "WARNING: scaled value should be <= 1e6. Continuing anyway."
+        in runtmp.last_result.err
+    )


 # explanation: you cannot downsample a scaled SBT to match a scaled
@@ -2377,77 +2707,108 @@ def test_search_check_scaled_bounds_more_than_maximum(runtmp):
 # (you *can* downsample a signature to match an SBT.)
 def test_search_metagenome_sbt_downsample_fail(runtmp):
     # test downsample on SBT => failure, with --fail-on-empty-databases
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000')
+        runtmp.sourmash(
+            "search", query_sig, "gcf_all", "-k", "21", "--scaled", "100000"
+        )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     assert runtmp.last_result.status == -1
     assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err
-    assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err
+    assert (
+        "search scaled value 100000 is less than database scaled value of 10000"
+        in runtmp.last_result.err
+    )


 def test_search_metagenome_sbt_downsample_nofail(runtmp):
     # test downsample on SBT => failure but ok with --no-fail-on-empty-database
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))

-    runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', '--no-fail-on-empty-database')
+    runtmp.sourmash(
+        "search",
+        query_sig,
+        "gcf_all",
+        "-k",
+        "21",
+        "--scaled",
+        "100000",
+        "--no-fail-on-empty-database",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     assert runtmp.last_result.status == 0
     assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err
-    assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err
+    assert (
+        "search scaled value 100000 is less than database scaled value of 10000"
+        in runtmp.last_result.err
+    )
     assert "0 matches" in runtmp.last_result.out


 def test_search_metagenome_downsample_containment(runtmp):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))

-    runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', '--containment')
+    runtmp.sourmash(
+        "search",
+        query_sig,
+        "gcf_all",
+        "-k",
+        "21",
+        "--scaled",
+        "100000",
+        "--containment",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out
-    assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out
+    assert (
+        " 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T"
+        in runtmp.last_result.out
+    )
+    assert (
+        "12 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out
+    )


 @utils.in_tempdir
@@ -2455,36 +2816,46 @@ def test_search_metagenome_downsample_index(c):
     # does same search as search_metagenome_downsample_containment but
     # rescales during indexing

-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     # downscale during indexing, rather than during search.
-    c.run_sourmash('index', 'gcf_all', *testdata_sigs, '-k', '21',
-                   '--scaled', '100000')
+    c.run_sourmash("index", "gcf_all", *testdata_sigs, "-k", "21", "--scaled", "100000")

-    assert os.path.exists(c.output('gcf_all.sbt.zip'))
+    assert os.path.exists(c.output("gcf_all.sbt.zip"))

-    c.run_sourmash('search', query_sig, 'gcf_all', '-k', '21',
-                   '--containment')
+    c.run_sourmash("search", query_sig, "gcf_all", "-k", "21", "--containment")
     print(c)

-    assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in str(
-        c)
-    assert ' 29.7% NC_003197.2 Salmonella enterica subsp. enterica serovar T' in str(
-        c)
-    assert '12 matches above threshold 0.080; showing first 3:' in str(c)
+    assert (
+        " 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T"
+        in str(c)
+    )
+    assert (
+        " 29.7% NC_003197.2 Salmonella enterica subsp.
enterica serovar T" + in str(c) + ) + assert "12 matches above threshold 0.080; showing first 3:" in str(c) def test_search_with_picklist(runtmp): # test 'sourmash search' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + ) err = runtmp.last_result.err print(err) @@ -2502,12 +2873,20 @@ def test_search_with_picklist(runtmp): def test_search_with_picklist_exclude(runtmp): # test 'sourmash search' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -2524,11 +2903,19 @@ def test_search_with_picklist_exclude(runtmp): def test_search_with_pattern_include(runtmp): # test 'sourmash search' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--include', "thermotoga") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--include", + "thermotoga", + ) err = runtmp.last_result.err print(err) @@ -2543,11 +2930,19 @@ def test_search_with_pattern_include(runtmp): def test_search_with_pattern_exclude(runtmp): # test 'sourmash search' with --exclude-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--exclude', "thermotoga") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--exclude", + "thermotoga", + ) err = runtmp.last_result.err print(err) @@ -2562,13 +2957,12 @@ def test_search_with_pattern_exclude(runtmp): def test_search_empty_db_fail(runtmp): # search should fail on empty db with --fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query, against, against2, '-k', '51') - + runtmp.sourmash("search", query, against, against2, "-k", "51") err = runtmp.last_result.err assert "no compatible signatures found in " in err @@ -2576,12 +2970,13 @@ def test_search_empty_db_fail(runtmp): def test_search_empty_db_nofail(runtmp): # search should not fail on empty db with --no-fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") - runtmp.sourmash('search', query, against, against2, '-k', '51', - '--no-fail-on-empty-data') + runtmp.sourmash( + "search", query, against, against2, "-k", "51", "--no-fail-on-empty-data" + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2589,206 +2984,239 @@ def test_search_empty_db_nofail(runtmp): print(err) assert "no compatible signatures found in " in err - assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert ( + "ksize on this database is 31; this is different from requested ksize of 51" + in err + ) assert "loaded 50 total signatures from 2 locations" in err assert "after selecting signatures compatible with search, 0 remain." in err def test_mash_csv_to_sig(runtmp): - testdata1 = utils.get_test_data('short.fa.msh.dump') - testdata2 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa.msh.dump") + testdata2 = utils.get_test_data("short.fa") - runtmp.sourmash('import_csv', testdata1, '-o', 'xxx.sig') + runtmp.sourmash("import_csv", testdata1, "-o", "xxx.sig") - runtmp.sourmash('sketch', 'dna', '-p','k=31,num=970',testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=970", testdata2) - runtmp.sourmash('search', '-k', '31', 'short.fa.sig', 'xxx.sig') + runtmp.sourmash("search", "-k", "31", "short.fa.sig", "xxx.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '100.0% short.fa' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "100.0% short.fa" in runtmp.last_result.out def test_do_sourmash_index_bad_args(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--dna', '--protein') + runtmp.sourmash( + "index", + "zzz", + "short.fa.sig", + "short2.fa.sig", + "-k", + "31", + "--dna", + "--protein", + ) print(runtmp.last_result.out, runtmp.last_result.err) - assert 'cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff' in runtmp.last_result.err + assert ( + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + in runtmp.last_result.err + ) assert runtmp.last_result.status != 0 def test_do_sourmash_sbt_search(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - 
runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_wrong_ksize(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', '-p', 'k=51,num=500', testdata1, testdata2) + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=31,num=500", + "-p", + "k=51,num=500", + testdata1, + testdata2, + ) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', '-k', '51', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "-k", "51", "short.fa.sig", "zzz") assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use 'zzz' for this query." in runtmp.last_result.err - assert "search ksize 51 is different from database ksize 31" in runtmp.last_result.err + assert ( + "search ksize 51 is different from database ksize 31" in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_multiple(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz2.sbt.zip')) + assert os.path.exists(runtmp.output("zzz2.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', 'zzz2') + runtmp.sourmash("search", "short.fa.sig", "zzz", "zzz2") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_and_sigs(runtmp): # search an SBT and a signature at same time. 
- testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "zzz", "short2.fa.sig") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_downsample(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") - runtmp.sourmash('sketch','dna','-p','k=31,scaled=5', '-o', 'query.sig', testdata1) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,scaled=5", "-o", "query.sig", testdata1 + ) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'query.sig', 'zzz') + runtmp.sourmash("search", "query.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_downsample_2(runtmp): - testdata1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - testdata2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') + testdata1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + testdata2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") - sbtname = 'foo' + sbtname = "foo" - runtmp.sourmash('index', '-k', '31', sbtname, testdata2) + runtmp.sourmash("index", "-k", "31", sbtname, testdata2) assert runtmp.last_result.status == 0 with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', testdata1, sbtname, '--scaled=100000', '--threshold=0.01') + runtmp.sourmash( + "search", testdata1, sbtname, "--scaled=100000", "--threshold=0.01" + ) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use 'foo' for this query." in runtmp.last_result.err - assert "search scaled value 100000 is less than database scaled value of 2000" in runtmp.last_result.err + assert ( + "search scaled value 100000 is less than database scaled value of 2000" + in runtmp.last_result.err + ) @utils.in_tempdir def test_do_sourmash_index_abund(c): # 'sourmash index' should flatten signatures w/track_abund. 
- testdata2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') + testdata2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") - with open(testdata2, 'rt') as fp: + with open(testdata2): ss = sourmash.load_one_signature(testdata2, ksize=31) assert ss.minhash.track_abundance == True - sbtname = 'foo' + sbtname = "foo" - c.run_sourmash('index', '-k', '31', sbtname, testdata2) + c.run_sourmash("index", "-k", "31", sbtname, testdata2) for kk in sourmash.load_file_as_signatures(c.output(sbtname)): assert kk.minhash.track_abundance == False def test_do_sourmash_index_single(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_selectprot(runtmp): # index should fail when run on signatures with multiple types - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - args = ['sketch', 'dna', '-p', 'k=30,num=500',testdata1, testdata2] + args = ["sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2] runtmp.sourmash(*args) - args = ['index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig'] + args = ["index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig"] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*args) @@ -2801,122 +3229,130 @@ def test_do_sourmash_sbt_search_selectprot(runtmp): def test_do_sourmash_search_multimoltype_query(runtmp): # 'search' should fail if multiple sigs are given as query, due to # multiple molecule types. - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") # first, calculate signatures with multiple molecule types - args = ['sketch', 'translate', testdata1, testdata2, - '-p', 'protein', '-p', 'dayhoff'] + args = [ + "sketch", + "translate", + testdata1, + testdata2, + "-p", + "protein", + "-p", + "dayhoff", + ] runtmp.sourmash(*args) # now, index one of 'em - args = ['index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '--protein'] + args = ["index", "zzz", "short.fa.sig", "short2.fa.sig", "--protein"] runtmp.sourmash(*args) # output exists, yes? - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) # now, try searching. Should raise error. 
- args = ['search', 'short.fa.sig', 'zzz'] - with pytest.raises(SourmashCommandFailed) as exc: + args = ["search", "short.fa.sig", "zzz"] + with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'need exactly one' in runtmp.last_result.err + assert "need exactly one" in runtmp.last_result.err def test_do_sourmash_index_traverse(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', '.') + runtmp.sourmash("index", "-k", "31", "zzz", ".") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - assert 'loaded 2 sigs; saving SBT under' in runtmp.last_result.err + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + assert "loaded 2 sigs; saving SBT under" in runtmp.last_result.err - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out @utils.in_tempdir def test_do_sourmash_index_traverse_force(c): # test loading of files that don't end with .sig with -f - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - outdir = c.output('sigs') + outdir = c.output("sigs") os.mkdir(outdir) - out1 = os.path.join(outdir, 'short1') - out2 = os.path.join(outdir, 'short2') + out1 = os.path.join(outdir, "short1") + out2 = os.path.join(outdir, "short2") - c.run_sourmash('sketch','dna','-p','k=31,scaled=5', '-o', out1, testdata1) - c.run_sourmash('sketch','dna','-p','k=31,scaled=5', '-o', out2, testdata2) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=5", "-o", out1, testdata1) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=5", "-o", out2, testdata2) - c.run_sourmash('index', '-k', '31', 'zzz', '.', '-f') + c.run_sourmash("index", "-k", "31", "zzz", ".", "-f") err = c.last_result.err - assert os.path.exists(c.output('zzz.sbt.zip')) - assert 'loaded 2 sigs; saving SBT under' in err + assert os.path.exists(c.output("zzz.sbt.zip")) + assert "loaded 2 sigs; saving SBT under" in err - c.run_sourmash('search', out1, 'zzz') + c.run_sourmash("search", out1, "zzz") out = c.last_result.out print(out) - assert 'short.fa' in out - assert 'short2.fa' in out + assert "short.fa" in out + assert "short2.fa" in out def test_do_sourmash_index_sparseness(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz.sbt.json', '.', '--sparseness', '1.0') + runtmp.sourmash("index", "-k", "31", "zzz.sbt.json", ".", "--sparseness", "1.0") - assert os.path.exists(runtmp.output('zzz.sbt.json')) - assert 'loaded 2 sigs; saving SBT under' in runtmp.last_result.err + assert 
os.path.exists(runtmp.output("zzz.sbt.json")) + assert "loaded 2 sigs; saving SBT under" in runtmp.last_result.err - runtmp.sourmash('search', 'short.fa.sig', 'zzz.sbt.json') + runtmp.sourmash("search", "short.fa.sig", "zzz.sbt.json") print(runtmp.last_result.out) - assert len(glob.glob(runtmp.output('.sbt.zzz/*'))) == 3 - assert not glob.glob(runtmp.output('.sbt.zzz/*internal*')) + assert len(glob.glob(runtmp.output(".sbt.zzz/*"))) == 3 + assert not glob.glob(runtmp.output(".sbt.zzz/*internal*")) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_combine(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz', *files) + runtmp.sourmash("index", "-k", "31", "zzz", *files) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('sbt_combine', 'joined', 'zzz.sbt.zip', 'zzz.sbt.zip') + runtmp.sourmash("sbt_combine", "joined", "zzz.sbt.zip", "zzz.sbt.zip") - assert os.path.exists(runtmp.output('joined.sbt.zip')) + assert os.path.exists(runtmp.output("joined.sbt.zip")) filename = os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] - runtmp.sourmash('search', files[0], 'zzz') + runtmp.sourmash("search", files[0], "zzz") print(runtmp.last_result.out) # we get notification of signature loading, too - so notify + result. assert runtmp.last_result.out.count(filename) == 1 - runtmp.sourmash('search', files[0], 'joined') + runtmp.sourmash("search", files[0], "joined") print(runtmp.last_result.out) @@ -2924,130 +3360,148 @@ def test_do_sourmash_sbt_combine(runtmp): def test_do_sourmash_index_append(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2, testdata3) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3 + ) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short3.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short3.fa.sig") - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out - assert 'short3.fa' not in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out + assert "short3.fa" not in runtmp.last_result.out - runtmp.sourmash('index', '-k', '31', '--append', 'zzz', 'short3.fa.sig') + runtmp.sourmash("index", "-k", "31", "--append", "zzz", "short3.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short3.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short3.fa.sig") - 
runtmp.sourmash('search', '--threshold', '0.95', sig_loc, sbt_name) + runtmp.sourmash("search", "--threshold", "0.95", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' not in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out - assert 'short3.fa' in runtmp.last_result.out + assert "short.fa" not in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out + assert "short3.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_otherdir(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'xxx/zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "xxx/zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('xxx/zzz.sbt.zip')) + assert os.path.exists(runtmp.output("xxx/zzz.sbt.zip")) - sbt_name = runtmp.output('xxx/zzz',) - sig_loc = runtmp.output('short.fa.sig') + sbt_name = runtmp.output( + "xxx/zzz", + ) + sig_loc = runtmp.output("short.fa.sig") - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_scaled_vs_num_1(runtmp): # should not work: scaled query against num tree - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short2.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use '" in runtmp.last_result.err - assert "this database was created with 'num' MinHash sketches, not 'scaled'" in runtmp.last_result.err + assert ( + "this database was created with 'num' MinHash sketches, not 'scaled'" + in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_scaled_vs_num_2(runtmp): # should not work: num query against scaled tree - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 
'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use '" in runtmp.last_result.err - assert "this database was created with 'scaled' MinHash sketches, not 'num'" in runtmp.last_result.err + assert ( + "this database was created with 'scaled' MinHash sketches, not 'num'" + in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_scaled_vs_num_3(runtmp): # should not work: scaled query against num signature - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - sig_loc = runtmp.output('short.fa.sig') - sig_loc2 = runtmp.output('short2.fa.sig') + sig_loc = runtmp.output("short.fa.sig") + sig_loc2 = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sig_loc2) + runtmp.sourmash("search", sig_loc, sig_loc2) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) @@ -3057,18 +3511,18 @@ def test_do_sourmash_sbt_search_scaled_vs_num_3(runtmp): def test_do_sourmash_sbt_search_scaled_vs_num_4(runtmp): # should not work: num query against scaled signature - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - sig_loc = runtmp.output('short.fa.sig') - sig_loc2 = runtmp.output('short2.fa.sig') + sig_loc = runtmp.output("short.fa.sig") + sig_loc2 = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc2, sig_loc) + runtmp.sourmash("search", sig_loc2, sig_loc) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) @@ -3079,13 +3533,13 @@ def test_do_sourmash_sbt_search_scaled_vs_num_4(runtmp): def test_do_sourmash_check_search_vs_actual_similarity(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz', *files) + runtmp.sourmash("index", "-k", "31", "zzz", *files) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - filename = os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] + os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] - runtmp.sourmash('search', files[0], 
'zzz') + runtmp.sourmash("search", files[0], "zzz") assert runtmp.last_result.status == 0 @@ -3093,9 +3547,9 @@ def test_do_sourmash_check_search_vs_actual_similarity(runtmp): def test_do_sourmash_check_sbt_filenames(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz.sbt.json', *files) + runtmp.sourmash("index", "-k", "31", "zzz.sbt.json", *files) - assert os.path.exists(runtmp.output('zzz.sbt.json')) + assert os.path.exists(runtmp.output("zzz.sbt.json")) sig_names = set() sig_md5s = set() @@ -3104,11 +3558,11 @@ def test_do_sourmash_check_sbt_filenames(runtmp): sig_names.add(sig.name) sig_md5s.add(sig.md5sum()) - sbt_files = glob.glob(runtmp.output('.sbt.zzz/*')) + sbt_files = glob.glob(runtmp.output(".sbt.zzz/*")) assert len(sbt_files) == 14 for f in sbt_files: - if 'internal' in f or f.endswith('zzz.manifest.csv'): + if "internal" in f or f.endswith("zzz.manifest.csv"): continue f = os.path.basename(f) assert f not in sig_names @@ -3116,161 +3570,208 @@ def test_do_sourmash_check_sbt_filenames(runtmp): def test_do_sourmash_sbt_search_bestonly(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', '--best-only', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "--best-only", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_bestonly_scaled(runtmp): # as currently implemented, the query signature will be automatically # downsampled to match the tree. 
- testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=1', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig', '--scaled', '10') + runtmp.sourmash( + "index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig", "--scaled", "10" + ) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', '--best-only', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "--best-only", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_sbt_search_order_dependence(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz') - testdata2 = utils.get_test_data('genome-s11.fa.gz') - testdata3 = utils.get_test_data('genome-s12.fa.gz') - testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') - - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,scaled=10000', '-p', 'k=31,scaled=10000', testdata1, testdata2, testdata3, testdata4) - - runtmp.sourmash('index', '-k', '21', '134', 'genome-s10+s11.fa.gz.sig', 'genome-s11.fa.gz.sig', 'genome-s12.fa.gz.sig') - - runtmp.sourmash('search', '-k', '21', 'genome-s11.fa.gz.sig', '134', '--best-only', '-k', '21', '--dna') + testdata1 = utils.get_test_data("genome-s10.fa.gz") + testdata2 = utils.get_test_data("genome-s11.fa.gz") + testdata3 = utils.get_test_data("genome-s12.fa.gz") + testdata4 = utils.get_test_data("genome-s10+s11.fa.gz") + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "k=21,scaled=10000", + "-p", + "k=31,scaled=10000", + testdata1, + testdata2, + testdata3, + testdata4, + ) + + runtmp.sourmash( + "index", + "-k", + "21", + "134", + "genome-s10+s11.fa.gz.sig", + "genome-s11.fa.gz.sig", + "genome-s12.fa.gz.sig", + ) + + runtmp.sourmash( + "search", + "-k", + "21", + "genome-s11.fa.gz.sig", + "134", + "--best-only", + "-k", + "21", + "--dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_sbt_search_order_dependence_2(runtmp): # *should* return the same result as test_sbt_search_order_dependence, # but does not due to a bug. 
- testdata1 = utils.get_test_data('genome-s10.fa.gz') - testdata2 = utils.get_test_data('genome-s11.fa.gz') - testdata3 = utils.get_test_data('genome-s12.fa.gz') - testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') - - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,scaled=10000', '-p', 'k=31,scaled=10000', testdata1, testdata2, testdata3, testdata4) - - runtmp.sourmash('index', '-k', '21', '314', 'genome-s11.fa.gz.sig', 'genome-s10+s11.fa.gz.sig', 'genome-s12.fa.gz.sig') - - runtmp.sourmash('search', '-k', '21', 'genome-s11.fa.gz.sig', '314', '--best-only', '--dna') + testdata1 = utils.get_test_data("genome-s10.fa.gz") + testdata2 = utils.get_test_data("genome-s11.fa.gz") + testdata3 = utils.get_test_data("genome-s12.fa.gz") + testdata4 = utils.get_test_data("genome-s10+s11.fa.gz") + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "k=21,scaled=10000", + "-p", + "k=31,scaled=10000", + testdata1, + testdata2, + testdata3, + testdata4, + ) + + runtmp.sourmash( + "index", + "-k", + "21", + "314", + "genome-s11.fa.gz.sig", + "genome-s10+s11.fa.gz.sig", + "genome-s12.fa.gz.sig", + ) + + runtmp.sourmash( + "search", "-k", "21", "genome-s11.fa.gz.sig", "314", "--best-only", "--dna" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_1(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_2(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') + E1.add_sequence("ATGGA") - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: 
signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_3(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') - E1.add_sequence('GGACA') + E1.add_sequence("ATGGA") + E1.add_sequence("GGACA") - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '70.5%' in runtmp.last_result.out + assert "70.5%" in runtmp.last_result.out def test_compare_with_picklist(runtmp): # test 'sourmash compare' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "compare", *gcf_sigs, "-k", "21", "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err out = runtmp.last_result.out @@ -3287,11 +3788,12 @@ def test_compare_with_picklist(runtmp): def test_compare_with_picklist_exclude(runtmp): # test 'sourmash compare' with picklists - exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "compare", *gcf_sigs, "-k", "21", "--picklist", f"{picklist}:md5:md5:exclude" + ) err = runtmp.last_result.err out = runtmp.last_result.out @@ -3309,12 +3811,10 @@ def test_compare_with_picklist_exclude(runtmp): def test_compare_with_pattern_include(runtmp): # test 'sourmash compare' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--include', "thermotoga") + runtmp.sourmash("compare", *gcf_sigs, "-k", "21", "--include", "thermotoga") - err = runtmp.last_result.err out = runtmp.last_result.out print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3326,12 +3826,10 @@ def test_compare_with_pattern_include(runtmp): def test_compare_with_pattern_exclude(runtmp): # test 'sourmash compare' with picklists - exclude - gcf_sigs 
= glob.glob(utils.get_test_data('gather/GCF*.sig')) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--exclude', "thermotoga") + runtmp.sourmash("compare", *gcf_sigs, "-k", "21", "--exclude", "thermotoga") - err = runtmp.last_result.err out = runtmp.last_result.out print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3344,310 +3842,443 @@ def test_compare_with_pattern_exclude(runtmp): def test_gather(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_csv(runtmp, linear_gather, prefetch_gather): # test 'gather -o csvfile' - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert 
row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' - - assert row['query_abundance'] == 'False' - assert row['n_unique_weighted_found'] == '' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" + + assert row["query_abundance"] == "False" + assert row["n_unique_weighted_found"] == "" def test_gather_csv_gz(runtmp, linear_gather, prefetch_gather): # test 'gather -o csvfile.gz' - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv.gz', '--threshold-bp=1', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv.gz", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv.gz') + csv_file = runtmp.output("foo.csv.gz") with gzip.open(csv_file, "rt", newline="") as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert 
row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" def test_gather_abund_x_abund(runtmp, prefetch_gather, linear_gather): - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") - runtmp.sourmash('gather', sig47, sig63, linear_gather, prefetch_gather) + runtmp.sourmash("gather", sig47, sig63, linear_gather, prefetch_gather) - assert '2.5 Mbp 49.2% 48.3% 1.0 NC_011663.1' in runtmp.last_result.out + assert ( + "2.5 Mbp 49.2% 48.3% 1.0 NC_011663.1" in runtmp.last_result.out + ) def test_gather_multiple_sbts(runtmp, prefetch_gather, linear_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_multiple_sbts_save_prefetch(runtmp, linear_gather): # test --save-prefetch with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch', 'out.zip', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + 
"gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch", + "out.zip", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('out.zip')) + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("out.zip")) def test_gather_multiple_sbts_save_prefetch_csv(runtmp, linear_gather): # test --save-prefetch-csv with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch-csv', 'prefetch.csv', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch-csv", + "prefetch.csv", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv')) - with open(runtmp.output('prefetch.csv')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv")) + with open(runtmp.output("prefetch.csv")) as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output + assert "870,0.925531914893617,0.9666666666666667" in output def test_gather_multiple_sbts_save_prefetch_csv_gz(runtmp, linear_gather): # test --save-prefetch-csv to a .gz file, with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 
'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch-csv', 'prefetch.csv.gz', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch-csv", + "prefetch.csv.gz", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv.gz')) - with gzip.open(runtmp.output('prefetch.csv.gz'), 'rt', newline="") as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv.gz")) + with gzip.open(runtmp.output("prefetch.csv.gz"), "rt", newline="") as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output + assert "870,0.925531914893617,0.9666666666666667" in output def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather): # test --save-prefetch-csv with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch', 'out.zip', '--save-prefetch-csv', 'prefetch.csv', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch", + "out.zip", + "--save-prefetch-csv", + "prefetch.csv", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv')) - with open(runtmp.output('prefetch.csv')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv")) + with open(runtmp.output("prefetch.csv")) as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output - assert os.path.exists(runtmp.output('out.zip')) + assert "870,0.925531914893617,0.9666666666666667" in output + assert os.path.exists(runtmp.output("out.zip")) def test_gather_sbt_and_sigs(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + 
runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'short2.fa.sig', '-o', 'foo.csv', linear_gather, prefetch_gather, '--threshold-bp=1') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "short2.fa.sig", + "-o", + "foo.csv", + linear_gather, + prefetch_gather, + "--threshold-bp=1", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_file_output(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '--threshold-bp=500', linear_gather, prefetch_gather, '-o', 'foo.out') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "--threshold-bp=500", + linear_gather, + prefetch_gather, + "-o", + "foo.out", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - with open(runtmp.output('foo.out')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + with open(runtmp.output("foo.out")) as f: output = f.read() print((output,)) - assert '910,1.0,1.0' in output + assert "910,1.0,1.0" in output def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather): import copy - testdata_combined = utils.get_test_data('gather/combined.sig') - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_combined = utils.get_test_data("gather/combined.sig") + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - runtmp.sourmash('gather', testdata_combined, '-o', 'out.csv', - *testdata_sigs, linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + testdata_combined, + "-o", + "out.csv", + *testdata_sigs, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3658,25 +4289,25 @@ def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather): def approx_equal(a, b, n=5): return round(a, n) == round(b, n) - with open(runtmp.output('out.csv'), 'rt') as fp: + with open(runtmp.output("out.csv")) as fp: r = csv.DictReader(fp) for n, row in enumerate(r): - print(n, row['f_match'], row['f_match_orig']) + print(n, row["f_match"], row["f_match_orig"]) # each match is completely in the original query - assert row['f_match_orig'] == "1.0" + assert row["f_match_orig"] == "1.0" # double check -- should match 'search --containment'. 
# (this is kind of useless for a 1.0 contained_by, I guess) - filename = row['filename'] + filename = row["filename"] match = sourmash.load_one_signature(filename, ksize=21) assert match.contained_by(combined_sig) == 1.0 # check other fields, too. - f_orig_query = float(row['f_orig_query']) - f_match_orig = float(row['f_match_orig']) - f_match = float(row['f_match']) - f_unique_to_query = float(row['f_unique_to_query']) + f_orig_query = float(row["f_orig_query"]) + f_match_orig = float(row["f_match_orig"]) + f_match = float(row["f_match"]) + f_unique_to_query = float(row["f_unique_to_query"]) # f_orig_query is the containment of the query by the match. # (note, this only works because containment is 100% in combined). @@ -3687,8 +4318,7 @@ def approx_equal(a, b, n=5): assert approx_equal(match.contained_by(combined_sig), f_match_orig) # f_match is how much of the match is in the unallocated hashes - assert approx_equal(match.minhash.contained_by(remaining_mh), - f_match) + assert approx_equal(match.minhash.contained_by(remaining_mh), f_match) # f_unique_to_query is how much of the match is unique wrt # the original query. @@ -3704,14 +4334,21 @@ def approx_equal(a, b, n=5): def test_gather_nomatch(runtmp, linear_gather, prefetch_gather): testdata_query = utils.get_test_data( - 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') - testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - - out_csv = runtmp.output('results.csv') - - runtmp.sourmash('gather', testdata_query, testdata_match, - '-o', out_csv, - linear_gather, prefetch_gather) + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) + testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + + out_csv = runtmp.output("results.csv") + + runtmp.sourmash( + "gather", + testdata_query, + testdata_match, + "-o", + out_csv, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3722,14 +4359,22 @@ def test_gather_nomatch(runtmp, linear_gather, prefetch_gather): def test_gather_nomatch_create_empty(runtmp, linear_gather, prefetch_gather): testdata_query = utils.get_test_data( - 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') - testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - - out_csv = runtmp.output('results.csv') - - runtmp.sourmash('gather', testdata_query, testdata_match, - '-o', out_csv, '--create-empty-results', - linear_gather, prefetch_gather) + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) + testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + + out_csv = runtmp.output("results.csv") + + runtmp.sourmash( + "gather", + testdata_query, + testdata_match, + "-o", + out_csv, + "--create-empty-results", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3737,17 +4382,20 @@ def test_gather_nomatch_create_empty(runtmp, linear_gather, prefetch_gather): assert "No matches found for --threshold-bp at 50.0 kbp." 
in runtmp.last_result.err assert os.path.exists(out_csv) - with open(out_csv, 'rt') as fp: + with open(out_csv) as fp: data = fp.read() assert not data def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather): - testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + testdata_query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + testdata_match = utils.get_test_data( + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) - runtmp.sourmash('gather', testdata_query, testdata_match, - linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", testdata_query, testdata_match, linear_gather, prefetch_gather + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3756,50 +4404,58 @@ def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather): def test_gather_metagenome(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--threshold-bp=0') + runtmp.sourmash("gather", query_sig, "gcf_all", "-k", "21", "--threshold-bp=0") print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subs' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subs" in runtmp.last_result.out, + ) + ) @utils.in_tempdir def test_gather_metagenome_num_results(c): # set a threshold on the number of results to be reported by gather - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'gather {} gcf_all -k 21 --num-results 10'.format(query_sig) - cmd = cmd.split(' ') + cmd = f"gather {query_sig} gcf_all -k 21 --num-results 10" + cmd = cmd.split(" ") c.run_sourmash(*cmd) print(c.last_result.out) @@ -3807,85 +4463,122 @@ def test_gather_metagenome_num_results(c): out = c.last_result.out - assert 'found 10 
matches total' in out - assert '(truncated gather because --num-results=10)' in out - assert 'the recovered matches hit 99.4% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert '4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp' in out + assert "found 10 matches total" in out + assert "(truncated gather because --num-results=10)" in out + assert "the recovered matches hit 99.4% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert "4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp" in out def test_gather_metagenome_threshold_bp(runtmp, linear_gather, prefetch_gather): # set a threshold on the gather output - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '2e6', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "2e6", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'found less than 2.0 Mbp in common. => exiting' in runtmp.last_result.err - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "found less than 2.0 Mbp in common. => exiting" in runtmp.last_result.err + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_threshold_bp_low(runtmp, linear_gather, prefetch_gather): # set a threshold on the gather output => too low - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'found less than 1 bp in common. 
=> exiting' in runtmp.last_result.err - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out + assert "found 12 matches total" in runtmp.last_result.out + assert "found less than 1 bp in common. => exiting" in runtmp.last_result.err + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out -def test_gather_metagenome_threshold_bp_too_high(runtmp, linear_gather, prefetch_gather): +def test_gather_metagenome_threshold_bp_too_high( + runtmp, linear_gather, prefetch_gather +): # set a threshold on the gather output => no results - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '5e6', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "5e6", + linear_gather, + prefetch_gather, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3896,46 +4589,67 @@ def test_gather_metagenome_threshold_bp_too_high(runtmp, linear_gather, prefetch def test_multigather_metagenome(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('multigather', '--query', query_sig, '--db', 'gcf_all', '-k', '21', '--threshold-bp=0') + runtmp.sourmash( + "multigather", + "--query", + query_sig, + "--db", + "gcf_all", + "-k", + "21", + "--threshold-bp=0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_multigather_check_scaled_bounds_negative(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = 
utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled -5 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = ( + "multigather --query {} --db gcf_all -k 21 --scaled -5 --threshold-bp=0".format( + query_sig + ) + ) + cmd = cmd.split(" ") with pytest.raises(SourmashCommandFailed) as exc: c.run_sourmash(*cmd) @@ -3944,67 +4658,80 @@ def test_multigather_check_scaled_bounds_negative(runtmp): def test_multigather_check_scaled_bounds_less_than_minimum(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 50 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = ( + "multigather --query {} --db gcf_all -k 21 --scaled 50 --threshold-bp=0".format( + query_sig + ) + ) + cmd = cmd.split(" ") # Note: this is the value error that is emitted, but we want the Warning from below to be generated instead. (ValueError: new scaled 50.0 is lower than current sample scaled 10000) with pytest.raises(SourmashCommandFailed) as exc: c.run_sourmash(*cmd) - assert "WARNING: scaled value should be >= 100. Continuing anyway." in str(exc.value) + assert "WARNING: scaled value should be >= 100. Continuing anyway." in str( + exc.value + ) def test_multigather_check_scaled_bounds_more_than_maximum(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 1e9 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = "multigather --query {} --db gcf_all -k 21 --scaled 1e9 --threshold-bp=0".format( + query_sig + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in c.last_result.err + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." 
+ in c.last_result.err + ) def test_multigather_metagenome_query_from_file(runtmp): # test multigather --query-from-file c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(query_sig, file=fp) - cmd = 'multigather --query-from-file {} --db gcf_all -k 21 --threshold-bp=0'.format(query_list) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --db gcf_all -k 21 --threshold-bp=0".format( + query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4012,36 +4739,44 @@ def test_multigather_metagenome_query_from_file(runtmp): err = c.last_result.err print(err) - assert 'found 12 matches total' in out - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert all(('4.7 Mbp 0.5% 1.5%' in out, - 'NC_011294.1 Salmonella enterica subsp' in out)) + assert "found 12 matches total" in out + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in out, + "NC_011294.1 Salmonella enterica subsp" in out, + ) + ) def test_multigather_metagenome_output(runtmp): # test multigather CSV output has more than one output line c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = f'multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = f"multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) - output_csv = runtmp.output('-.csv') + output_csv = runtmp.output("-.csv") assert os.path.exists(output_csv) - with open(output_csv, newline='') as fp: + with open(output_csv, newline="") as fp: x = fp.readlines() assert len(x) == 13 @@ -4049,50 +4784,49 @@ def test_multigather_metagenome_output(runtmp): def test_multigather_metagenome_output_outdir(runtmp): # test multigather CSV output to different location c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - 
cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # create output directory - outdir = runtmp.output('savehere') + outdir = runtmp.output("savehere") os.mkdir(outdir) - cmd = f'multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0 --output-dir {outdir}' - cmd = cmd.split(' ') + cmd = f"multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0 --output-dir {outdir}" + cmd = cmd.split(" ") c.run_sourmash(*cmd) - output_csv = runtmp.output('savehere/-.csv') + output_csv = runtmp.output("savehere/-.csv") assert os.path.exists(output_csv) - with open(output_csv, newline='') as fp: + with open(output_csv, newline="") as fp: x = fp.readlines() assert len(x) == 13 @utils.in_tempdir def test_multigather_metagenome_query_with_sbt(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'multigather --query gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = "multigather --query gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4100,35 +4834,50 @@ def test_multigather_metagenome_query_with_sbt(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 12 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) + assert "conducted gather searches on 12 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_query_with_lca(c): - - testdata_glob = utils.get_test_data('47*.fa.sig') + testdata_glob = utils.get_test_data("47*.fa.sig") testdata_sigs = glob.glob(testdata_glob) - lca_db = utils.get_test_data('lca/47+63.lca.json') + lca_db = utils.get_test_data("lca/47+63.lca.json") - cmd = ['index', '47+63.sbt.zip'] + cmd = ["index", "47+63.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '31']) + cmd.extend(["-k", "31"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('47+63.sbt.zip')) + assert os.path.exists(c.output("47+63.sbt.zip")) - cmd = 'multigather --query {} --db 47+63.sbt.zip -k 31 
--threshold-bp=0'.format(lca_db) - cmd = cmd.split(' ') + cmd = f"multigather --query {lca_db} --db 47+63.sbt.zip -k 31 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4136,21 +4885,22 @@ def test_multigather_metagenome_query_with_lca(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out -# assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out - assert '5.5 Mbp 100.0% 69.4% 491c0a81' in out + assert "conducted gather searches on 2 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out + assert "5.5 Mbp 100.0% 69.4% 491c0a81" in out @utils.in_tempdir def test_multigather_metagenome_query_on_lca_db(c): - - testdata_sig1 = utils.get_test_data('47.fa.sig') - testdata_sig2 = utils.get_test_data('63.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') - - cmd = 'multigather --query {} {} --db {} -k 31 --threshold-bp=0'.format(testdata_sig1, testdata_sig2, lca_db) - cmd = cmd.split(' ') + testdata_sig1 = utils.get_test_data("47.fa.sig") + testdata_sig2 = utils.get_test_data("63.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") + + cmd = "multigather --query {} {} --db {} -k 31 --threshold-bp=0".format( + testdata_sig1, testdata_sig2, lca_db + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4158,33 +4908,44 @@ def test_multigather_metagenome_query_on_lca_db(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('5.1 Mbp 100.0% 100.0%' in out, - 'NC_009665.1 Shewanella baltica OS185,' in out)) - assert all(('5.5 Mbp 100.0% 100.0%' in out, - 'NC_011663.1 Shewanella baltica OS223,' in out)) + assert "conducted gather searches on 2 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "5.1 Mbp 100.0% 100.0%" in out, + "NC_009665.1 Shewanella baltica OS185," in out, + ) + ) + assert all( + ( + "5.5 Mbp 100.0% 100.0%" in out, + "NC_011663.1 Shewanella baltica OS223," in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_query_with_sbt_addl_query(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) - cmd = 'multigather --query {} gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0'.format(another_query) - cmd = cmd.split(' ') + cmd = "multigather --query {} gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0".format( + another_query + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4192,47 +4953,70 @@ def test_multigather_metagenome_query_with_sbt_addl_query(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 13 signatures' 
in err - assert 'the recovered matches hit 100.0% of the query' in out - #check for matches to some of the sbt signatures - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) - - #check additional query sig - assert all(('4.9 Mbp 100.0% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) + assert "conducted gather searches on 13 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # check for matches to some of the sbt signatures + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) + + # check additional query sig + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_sbt_query_from_file_with_addl_query(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sbt - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: - print('gcf_all.sbt.zip', file=fp) - - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') - - cmd = 'multigather --query {} --query-from-file {} --db gcf_all.sbt.zip -k 21 --threshold-bp=0'.format(another_query, query_list) - cmd = cmd.split(' ') + query_list = c.output("query.list") + with open(query_list, "w") as fp: + print("gcf_all.sbt.zip", file=fp) + + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) + + cmd = "multigather --query {} --query-from-file {} --db gcf_all.sbt.zip -k 21 --threshold-bp=0".format( + another_query, query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4240,43 +5024,62 @@ def test_multigather_metagenome_sbt_query_from_file_with_addl_query(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 13 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - #check for matches to some of the sbt signatures - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) - - #check 
additional query sig - assert all(('4.9 Mbp 100.0% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) + assert "conducted gather searches on 13 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # check for matches to some of the sbt signatures + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) + + # check additional query sig + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_sbt_query_from_file_incorrect(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # incorrectly query with sbt using `--query-from-file` - cmd = 'multigather --query-from-file gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = "multigather --query-from-file gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0" + cmd = cmd.split(" ") - with pytest.raises(SourmashCommandFailed) as e: + with pytest.raises(SourmashCommandFailed): c.run_sourmash(*cmd) print(c.last_result.out) @@ -4285,25 +5088,27 @@ def test_multigather_metagenome_sbt_query_from_file_incorrect(c): @utils.in_tempdir def test_multigather_metagenome_lca_query_from_file(c): - testdata_glob = utils.get_test_data('47*.fa.sig') + testdata_glob = utils.get_test_data("47*.fa.sig") testdata_sigs = glob.glob(testdata_glob) - lca_db = utils.get_test_data('lca/47+63.lca.json') + lca_db = utils.get_test_data("lca/47+63.lca.json") - cmd = ['index', '47+63.sbt.zip'] + cmd = ["index", "47+63.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '31']) + cmd.extend(["-k", "31"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('47+63.sbt.zip')) + assert os.path.exists(c.output("47+63.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(lca_db, file=fp) - cmd = 'multigather --query-from-file {} --db 47+63.sbt.zip -k 31 --threshold-bp=0'.format(query_list) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --db 47+63.sbt.zip -k 31 --threshold-bp=0".format( + query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4311,36 +5116,40 @@ def test_multigather_metagenome_lca_query_from_file(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out -# assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out - assert '5.5 Mbp 100.0% 69.4% 491c0a81' in out + assert "conducted gather searches on 2 signatures" in err + assert "the recovered 
matches hit 100.0% of the query" in out + # assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out + assert "5.5 Mbp 100.0% 69.4% 491c0a81" in out @utils.in_tempdir def test_multigather_metagenome_query_from_file_with_addl_query(c): # test multigather --query-from-file and --query too - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(query_sig, file=fp) - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) - cmd = 'multigather --query-from-file {} --query {} --db gcf_all -k 21 --threshold-bp=0'.format(query_list, another_query) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --query {} --db gcf_all -k 21 --threshold-bp=0".format( + query_list, another_query + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4349,42 +5158,67 @@ def test_multigather_metagenome_query_from_file_with_addl_query(c): print(err) # first gather query - assert 'found 12 matches total' in out - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert all(('4.7 Mbp 0.5% 1.5%' in out, - 'NC_011294.1 Salmonella enterica subsp' in out)) + assert "found 12 matches total" in out + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in out, + "NC_011294.1 Salmonella enterica subsp" in out, + ) + ) # second gather query - assert '4.9 Mbp 100.0% 100.0% NC_003198.1 Salmonella enterica subsp' in out - assert 'found 1 matches total;' in out - assert 'the recovered matches hit 100.0% of the query' in out + assert "4.9 Mbp 100.0% 100.0% NC_003198.1 Salmonella enterica subsp" in out + assert "found 1 matches total;" in out + assert "the recovered matches hit 100.0% of the query" in out def test_gather_metagenome_traverse(runtmp, linear_gather, prefetch_gather): # set up a directory $location/gather that contains # everything in the 'tests/test-data/gather' directory # *except* the query sequence, which is 'combined.sig'. 
- testdata_dir = utils.get_test_data('gather') - copy_testdata = runtmp.output('somesigs') + testdata_dir = utils.get_test_data("gather") + copy_testdata = runtmp.output("somesigs") shutil.copytree(testdata_dir, copy_testdata) - os.unlink(os.path.join(copy_testdata, 'combined.sig')) + os.unlink(os.path.join(copy_testdata, "combined.sig")) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") # now, feed in the new directory -- - runtmp.sourmash('gather', query_sig, copy_testdata, '-k', '21', '--threshold-bp=0', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + copy_testdata, + "-k", + "21", + "--threshold-bp=0", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_traverse_check_csv(runtmp, linear_gather, prefetch_gather): @@ -4394,300 +5228,483 @@ def test_gather_metagenome_traverse_check_csv(runtmp, linear_gather, prefetch_ga # set up a directory $location/gather that contains # everything in the 'tests/test-data/gather' directory # *except* the query sequence, which is 'combined.sig'. 
- testdata_dir = utils.get_test_data('gather') - copy_testdata = runtmp.output('somesigs') + testdata_dir = utils.get_test_data("gather") + copy_testdata = runtmp.output("somesigs") shutil.copytree(testdata_dir, copy_testdata) - os.unlink(os.path.join(copy_testdata, 'combined.sig')) + os.unlink(os.path.join(copy_testdata, "combined.sig")) - query_sig = utils.get_test_data('gather/combined.sig') - out_csv = runtmp.output('out.csv') + query_sig = utils.get_test_data("gather/combined.sig") + out_csv = runtmp.output("out.csv") # now, feed in the new directory -- - runtmp.sourmash('gather', query_sig, copy_testdata, '-k', '21', '--threshold-bp=0', '-o', out_csv, linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + copy_testdata, + "-k", + "21", + "--threshold-bp=0", + "-o", + out_csv, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(out_csv, 'rt') as fp: + with open(out_csv) as fp: prefix_len = len(copy_testdata) r = csv.DictReader(fp) for row in r: - filename = row['filename'] + filename = row["filename"] assert filename.startswith(copy_testdata), filename # should have full path to file sig was loaded from assert len(filename) > prefix_len - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) @utils.in_tempdir def test_gather_traverse_incompatible(c): - searchdir = c.output('searchme') + searchdir = c.output("searchme") os.mkdir(searchdir) - num_sig = utils.get_test_data('num/47.fa.sig') - scaled_sig = utils.get_test_data('47.fa.sig') - shutil.copyfile(num_sig, c.output('searchme/num.sig')) - shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) + num_sig = utils.get_test_data("num/47.fa.sig") + scaled_sig = utils.get_test_data("47.fa.sig") + shutil.copyfile(num_sig, c.output("searchme/num.sig")) + shutil.copyfile(scaled_sig, c.output("searchme/scaled.sig")) - c.run_sourmash("gather", scaled_sig, c.output('searchme')) + c.run_sourmash("gather", scaled_sig, c.output("searchme")) print(c.last_result.out) print(c.last_result.err) - assert "5.2 Mbp 100.0% 100.0% NC_009665.1 Shewanella baltica OS185," in c.last_result.out + assert ( + "5.2 Mbp 100.0% 100.0% NC_009665.1 Shewanella baltica OS185," + in c.last_result.out + ) def test_gather_metagenome_output_unassigned(runtmp): - testdata_glob = utils.get_test_data('gather/GCF_000195995*g') + testdata_glob = utils.get_test_data("gather/GCF_000195995*g") testdata_sigs = glob.glob(testdata_glob)[0] - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('gather', query_sig, testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig') + runtmp.sourmash( + "gather", + query_sig, + testdata_sigs, + "-k", + 
"21", + "--output-unassigned=unassigned.sig", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) # now examine unassigned - testdata2_glob = utils.get_test_data('gather/GCF_000009505.1*.sig') + testdata2_glob = utils.get_test_data("gather/GCF_000009505.1*.sig") testdata2_sigs = glob.glob(testdata2_glob)[0] - runtmp.sourmash('gather', 'unassigned.sig', testdata_sigs, testdata2_sigs, '-k', '21') + runtmp.sourmash( + "gather", "unassigned.sig", testdata_sigs, testdata2_sigs, "-k", "21" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert all(('1.3 Mbp 13.6% 28.2%' in runtmp.last_result.out, - 'NC_011294.1' in runtmp.last_result.out)) + assert all( + ( + "1.3 Mbp 13.6% 28.2%" in runtmp.last_result.out, + "NC_011294.1" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_output_unassigned_as_zip(runtmp): - testdata_glob = utils.get_test_data('gather/GCF_000195995*g') + testdata_glob = utils.get_test_data("gather/GCF_000195995*g") testdata_sigs = glob.glob(testdata_glob)[0] - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('gather', query_sig, testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig.zip') + runtmp.sourmash( + "gather", + query_sig, + testdata_sigs, + "-k", + "21", + "--output-unassigned=unassigned.sig.zip", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) - assert zipfile.is_zipfile(runtmp.output('unassigned.sig.zip')) + assert zipfile.is_zipfile(runtmp.output("unassigned.sig.zip")) # now examine unassigned - testdata2_glob = utils.get_test_data('gather/GCF_000009505.1*.sig') + testdata2_glob = utils.get_test_data("gather/GCF_000009505.1*.sig") testdata2_sigs = glob.glob(testdata2_glob)[0] - runtmp.sourmash('gather', 'unassigned.sig.zip', testdata_sigs, testdata2_sigs, '-k', '21') + runtmp.sourmash( + "gather", "unassigned.sig.zip", testdata_sigs, testdata2_sigs, "-k", "21" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert all(('1.3 Mbp 13.6% 28.2%' in runtmp.last_result.out, - 'NC_011294.1' in runtmp.last_result.out)) + assert all( + ( + "1.3 Mbp 13.6% 28.2%" in runtmp.last_result.out, + "NC_011294.1" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_output_unassigned_none(runtmp): # test what happens when there's nothing unassigned to output - testdata_glob = 
 def test_gather_metagenome_output_unassigned_none(runtmp):
     # test what happens when there's nothing unassigned to output
-    testdata_glob = utils.get_test_data('gather/GCF_*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF_*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    runtmp.sourmash('gather', query_sig, *testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig', '--threshold=0')
+    runtmp.sourmash(
+        "gather",
+        query_sig,
+        *testdata_sigs,
+        "-k",
+        "21",
+        "--output-unassigned=unassigned.sig",
+        "--threshold=0",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'found 12 matches total' in runtmp.last_result.out
-    assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out
-    assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out,
-                'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out))
-    assert all(('4.5 Mbp 0.1% 0.4%' in runtmp.last_result.out,
-                'NC_004631.1 Salmonella enterica subsp' in runtmp.last_result.out))
+    assert "found 12 matches total" in runtmp.last_result.out
+    assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out
+    assert all(
+        (
+            "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out,
+            "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out,
+        )
+    )
+    assert all(
+        (
+            "4.5 Mbp 0.1% 0.4%" in runtmp.last_result.out,
+            "NC_004631.1 Salmonella enterica subsp" in runtmp.last_result.out,
+        )
+    )

     # now examine unassigned
-    assert not os.path.exists(runtmp.output('unassigned.sig'))
-    assert 'no unassigned hashes to save with --output-unassigned!' in runtmp.last_result.err
+    assert not os.path.exists(runtmp.output("unassigned.sig"))
+    assert (
+        "no unassigned hashes to save with --output-unassigned!"
+        in runtmp.last_result.err
+    )


-def test_gather_metagenome_output_unassigned_nomatches(runtmp, prefetch_gather, linear_gather):
+def test_gather_metagenome_output_unassigned_nomatches(
+    runtmp, prefetch_gather, linear_gather
+):
     c = runtmp

     # test --output-unassigned when there are no matches
-    query_sig = utils.get_test_data('2.fa.sig')
-    against_sig = utils.get_test_data('47.fa.sig')
-
-    c.run_sourmash('gather', query_sig, against_sig,
-                   '--output-unassigned', 'foo.sig', linear_gather,
-                   prefetch_gather)
+    query_sig = utils.get_test_data("2.fa.sig")
+    against_sig = utils.get_test_data("47.fa.sig")
+
+    c.run_sourmash(
+        "gather",
+        query_sig,
+        against_sig,
+        "--output-unassigned",
+        "foo.sig",
+        linear_gather,
+        prefetch_gather,
+    )

     print(c.last_result.out)
     assert "No matches found for --threshold-bp at 50.0 kbp." in c.last_result.err

     x = sourmash.load_one_signature(query_sig, ksize=31)
-    y = sourmash.load_one_signature(c.output('foo.sig'))
+    y = sourmash.load_one_signature(c.output("foo.sig"))

     assert x.minhash == y.minhash


-def test_gather_metagenome_output_unassigned_nomatches_protein(runtmp, linear_gather, prefetch_gather):
+def test_gather_metagenome_output_unassigned_nomatches_protein(
+    runtmp, linear_gather, prefetch_gather
+):
     c = runtmp

     # test --output-unassigned with protein signatures
-    query_sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    against_sig = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
-
-    c.run_sourmash('gather', query_sig, against_sig,
-                   '--output-unassigned', 'foo.sig', linear_gather,
-                   prefetch_gather)
+    query_sig = utils.get_test_data(
+        "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    against_sig = utils.get_test_data(
+        "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig"
+    )
+
+    c.run_sourmash(
+        "gather",
+        query_sig,
+        against_sig,
+        "--output-unassigned",
+        "foo.sig",
+        linear_gather,
+        prefetch_gather,
+    )

     print(c.last_result.out)
     assert "No matches found for --threshold-bp at 50.0 kbp." in c.last_result.err

-    c.run_sourmash('sig', 'describe', c.output('foo.sig'))
+    c.run_sourmash("sig", "describe", c.output("foo.sig"))
     print(c.last_result.out)

     x = sourmash.load_one_signature(query_sig, ksize=57)
-    y = sourmash.load_one_signature(c.output('foo.sig'))
+    y = sourmash.load_one_signature(c.output("foo.sig"))

     assert x.minhash == y.minhash
     assert y.minhash.moltype == "protein"


 def test_gather_check_scaled_bounds_negative(runtmp, prefetch_gather, linear_gather):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, 'gcf_all', '-k', '21', '--scaled', '-5', '--threshold-bp', '50000')
+        runtmp.sourmash(
+            "gather",
+            query_sig,
+            prefetch_gather,
+            linear_gather,
+            "gcf_all",
+            "-k",
+            "21",
+            "--scaled",
+            "-5",
+            "--threshold-bp",
+            "50000",
+        )

     assert "ERROR: scaled value must be positive" in runtmp.last_result.err


-def test_gather_check_scaled_bounds_less_than_minimum(runtmp, prefetch_gather, linear_gather):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
+def test_gather_check_scaled_bounds_less_than_minimum(
+    runtmp, prefetch_gather, linear_gather
+):
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, 'gcf_all', '-k', '21', '--scaled', '50', '--threshold-bp', '50000')
-
-    assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err
+        runtmp.sourmash(
+            "gather",
+            query_sig,
+            prefetch_gather,
+            linear_gather,
+            "gcf_all",
+            "-k",
+            "21",
+            "--scaled",
+            "50",
+            "--threshold-bp",
+            "50000",
+        )
+
+    assert (
+        "WARNING: scaled value should be >= 100. Continuing anyway."
+        in runtmp.last_result.err
+    )


-def test_gather_check_scaled_bounds_more_than_maximum(runtmp, prefetch_gather, linear_gather):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
-    testdata_sigs = glob.glob(testdata_glob)
-
-    query_sig = utils.get_test_data('gather/combined.sig')
+def test_gather_check_scaled_bounds_more_than_maximum(
+    runtmp, prefetch_gather, linear_gather
+):
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
+    glob.glob(testdata_glob)
+
+    query_sig = utils.get_test_data("gather/combined.sig")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, '-k', '21', '--scaled', '1e9', '--threshold-bp', '50000')
-
-    assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err
+        runtmp.sourmash(
+            "gather",
+            query_sig,
+            prefetch_gather,
+            linear_gather,
+            "-k",
+            "21",
+            "--scaled",
+            "1e9",
+            "--threshold-bp",
+            "50000",
+        )
+
+    assert (
+        "WARNING: scaled value should be <= 1e6. Continuing anyway."
+        in runtmp.last_result.err
+    )
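These bounds tests pin down a contract rather than an implementation: a non-positive `--scaled` is a hard error, while merely unusual values only warn and continue. An illustrative sketch of that contract (hypothetical helper; not sourmash's actual code):

    import sys

    def check_scaled_bounds(scaled):  # hypothetical name
        """Hard-fail on impossible values; warn on unusual ones."""
        if scaled <= 0:
            sys.exit("ERROR: scaled value must be positive")
        if scaled < 100:
            print("WARNING: scaled value should be >= 100. Continuing anyway.",
                  file=sys.stderr)
        if scaled > 1e6:
            print("WARNING: scaled value should be <= 1e6. Continuing anyway.",
                  file=sys.stderr)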
 def test_gather_metagenome_downsample(runtmp, prefetch_gather, linear_gather):
     # downsample w/scaled of 100,000
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
-
-    runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', prefetch_gather, linear_gather, '--threshold-bp', '50000')
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))
+
+    runtmp.sourmash(
+        "gather",
+        query_sig,
+        "gcf_all",
+        "-k",
+        "21",
+        "--scaled",
+        "100000",
+        prefetch_gather,
+        linear_gather,
+        "--threshold-bp",
+        "50000",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'found 11 matches total' in runtmp.last_result.out
-    assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out
-    assert all(('5.2 Mbp 32.9% 100.0%' in runtmp.last_result.out,
-                'NC_003198.1' in runtmp.last_result.out))
-    assert all(('4.1 Mbp 0.6% 2.4%' in runtmp.last_result.out,
-                '4.1 Mbp 4.4% 17.1%' in runtmp.last_result.out))
+    assert "found 11 matches total" in runtmp.last_result.out
+    assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out
+    assert all(
+        (
+            "5.2 Mbp 32.9% 100.0%" in runtmp.last_result.out,
+            "NC_003198.1" in runtmp.last_result.out,
+        )
+    )
+    assert all(
+        (
+            "4.1 Mbp 0.6% 2.4%" in runtmp.last_result.out,
+            "4.1 Mbp 4.4% 17.1%" in runtmp.last_result.out,
+        )
+    )


 def test_gather_query_downsample(runtmp, linear_gather, prefetch_gather):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)
     print(testdata_sigs)

-    query_sig = utils.get_test_data('GCF_000006945.2-s500.sig')
+    query_sig = utils.get_test_data("GCF_000006945.2-s500.sig")

-    runtmp.sourmash('gather', '-k', '31', linear_gather,
-                    prefetch_gather, query_sig, *testdata_sigs)
+    runtmp.sourmash(
+        "gather", "-k", "31", linear_gather, prefetch_gather, query_sig, *testdata_sigs
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     err = runtmp.last_result.err
-    assert 'loaded 36 total signatures from 12 locations.' in err
-    assert 'after selecting signatures compatible with search, 12 remain.' in err
+    assert "loaded 36 total signatures from 12 locations." in err
+    assert "after selecting signatures compatible with search, 12 remain." in err

-    assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out,
-                'NC_003197.2' in runtmp.last_result.out))
+    assert all(
+        (
+            "4.9 Mbp 100.0% 100.0%" in runtmp.last_result.out,
+            "NC_003197.2" in runtmp.last_result.out,
+        )
+    )

-    assert 'WARNING: final scaled was 10000, vs query scaled of 500' in runtmp.last_result.out
+    assert (
+        "WARNING: final scaled was 10000, vs query scaled of 500"
+        in runtmp.last_result.out
+    )


 def test_gather_query_downsample_explicit(runtmp, linear_gather, prefetch_gather):
     # do an explicit downsampling to fix `test_gather_query_downsample`
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('GCF_000006945.2-s500.sig')
+    query_sig = utils.get_test_data("GCF_000006945.2-s500.sig")

-    runtmp.sourmash('gather', '-k', '31', '--scaled', '10000', linear_gather, prefetch_gather, query_sig, *testdata_sigs)
+    runtmp.sourmash(
+        "gather",
+        "-k",
+        "31",
+        "--scaled",
+        "10000",
+        linear_gather,
+        prefetch_gather,
+        query_sig,
+        *testdata_sigs,
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     err = runtmp.last_result.err
-    assert 'loaded 36 total signatures from 12 locations.' in err
-    assert 'after selecting signatures compatible with search, 12 remain.' in err
+    assert "loaded 36 total signatures from 12 locations." in err
+    assert "after selecting signatures compatible with search, 12 remain." in err

-    assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out,
-                'NC_003197.2' in runtmp.last_result.out))
+    assert all(
+        (
+            "4.9 Mbp 100.0% 100.0%" in runtmp.last_result.out,
+            "NC_003197.2" in runtmp.last_result.out,
+        )
+    )


 def test_gather_downsample_multiple(runtmp, linear_gather, prefetch_gather):
     # test multiple different downsamplings in gather code
-    query_sig = utils.get_test_data('GCF_000006945.2-s500.sig')
+    query_sig = utils.get_test_data("GCF_000006945.2-s500.sig")

     # load in the hashes and do split them into four bins, randomly.
     ss = sourmash.load_one_signature(query_sig)
     hashes = list(ss.minhash.hashes)

-    random.seed(a=1) # fix seed so test is reproducible
+    random.seed(a=1)  # fix seed so test is reproducible
     random.shuffle(hashes)

     # split into 4 bins:
-    mh_bins = [ ss.minhash.copy_and_clear() for i in range(4) ]
+    mh_bins = [ss.minhash.copy_and_clear() for i in range(4)]
     for i, hashval in enumerate(hashes):
         mh_bins[i % 4].add_hash(hashval)

@@ -4706,23 +5723,37 @@ def test_gather_downsample_multiple(runtmp, linear_gather, prefetch_gather):
         gathersigs.append(f"bin{i}.sig")

-    runtmp.sourmash('gather', '-k', '31', linear_gather, prefetch_gather, query_sig, *gathersigs)
+    runtmp.sourmash(
+        "gather", "-k", "31", linear_gather, prefetch_gather, query_sig, *gathersigs
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert "WARNING: final scaled was 1000, vs query scaled of 500" in runtmp.last_result.out
+    assert (
+        "WARNING: final scaled was 1000, vs query scaled of 500"
+        in runtmp.last_result.out
+    )
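The downsampling tests above rely on comparing sketches at a common resolution: gather reconciles differing scaled values by moving everything to the largest (coarsest) scaled involved, and warns when that differs from the query's own scaled. A small sketch using the public Python API (file names hypothetical):

    import sourmash

    query = sourmash.load_one_signature("query.sig")   # e.g. scaled=500
    match = sourmash.load_one_signature("match.sig")   # e.g. scaled=1000

    # compare at the coarsest scaled of the two sketches
    common = max(query.minhash.scaled, match.minhash.scaled)
    q_mh = query.minhash.downsample(scaled=common)
    m_mh = match.minhash.downsample(scaled=common)
    print(q_mh.contained_by(m_mh))  # containment at the shared resolution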
"--threshold-bp=0", + "-k", + "21", + "--include", + "thermotoga", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4789,12 +5838,21 @@ def test_gather_with_pattern_include(runtmp, linear_gather, prefetch_gather): def test_gather_with_pattern_exclude(runtmp, linear_gather, prefetch_gather): # test 'sourmash gather' with --exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', - '-k', '21', '--exclude', "thermotoga", - linear_gather, prefetch_gather) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "--threshold-bp=0", + "-k", + "21", + "--exclude", + "thermotoga", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4814,53 +5872,78 @@ def test_gather_with_pattern_exclude(runtmp, linear_gather, prefetch_gather): def test_gather_save_matches(runtmp, linear_gather, prefetch_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--save-matches', 'save.sigs', linear_gather, prefetch_gather, '--threshold-bp', '0') + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) + + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--save-matches", + "save.sigs", + linear_gather, + prefetch_gather, + "--threshold-bp", + "0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert os.path.exists(runtmp.output('save.sigs')) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert os.path.exists(runtmp.output("save.sigs")) def test_gather_save_matches_and_save_prefetch(runtmp, linear_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--save-matches', 'save.sigs', '--save-prefetch', 'save2.sigs', linear_gather, '--threshold-bp', '0') + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) + + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--save-matches", + "save.sigs", + "--save-prefetch", + "save2.sigs", + linear_gather, + "--threshold-bp", + "0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - 
 def test_gather_save_matches_and_save_prefetch(runtmp, linear_gather):
-    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)

-    query_sig = utils.get_test_data('gather/combined.sig')
+    query_sig = utils.get_test_data("gather/combined.sig")

-    cmd = ['index', 'gcf_all']
+    cmd = ["index", "gcf_all"]
     cmd.extend(testdata_sigs)
-    cmd.extend(['-k', '21'])
+    cmd.extend(["-k", "21"])

     runtmp.sourmash(*cmd)

-    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))
-
-    runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--save-matches', 'save.sigs', '--save-prefetch', 'save2.sigs', linear_gather, '--threshold-bp', '0')
+    assert os.path.exists(runtmp.output("gcf_all.sbt.zip"))
+
+    runtmp.sourmash(
+        "gather",
+        query_sig,
+        "gcf_all",
+        "-k",
+        "21",
+        "--save-matches",
+        "save.sigs",
+        "--save-prefetch",
+        "save2.sigs",
+        linear_gather,
+        "--threshold-bp",
+        "0",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'found 12 matches total' in runtmp.last_result.out
-    assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out
+    assert "found 12 matches total" in runtmp.last_result.out
+    assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out

-    matches_save = runtmp.output('save.sigs')
-    prefetch_save = runtmp.output('save2.sigs')
+    matches_save = runtmp.output("save.sigs")
+    prefetch_save = runtmp.output("save2.sigs")

     assert os.path.exists(matches_save)
     assert os.path.exists(prefetch_save)

@@ -4873,12 +5956,14 @@ def test_gather_save_matches_and_save_prefetch(runtmp, linear_gather):

 @utils.in_tempdir
 def test_gather_error_no_sigs_traverse(c):
     # test gather applied to a directory
-    query = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+    query = utils.get_test_data(
+        "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )

-    emptydir = c.output('')
+    emptydir = c.output("")

-    with pytest.raises(SourmashCommandFailed) as e:
-        c.run_sourmash('gather', query, emptydir)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("gather", query, emptydir)

     err = c.last_result.err
     print(err)

@@ -4886,65 +5971,85 @@ def test_gather_error_no_sigs_traverse(c):

 def test_gather_error_no_cardinality_query(runtmp, linear_gather, prefetch_gather):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2)

-    testdata3 = utils.get_test_data('short3.fa')
+    testdata3 = utils.get_test_data("short3.fa")

-    runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata3)
+    runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata3)

-    runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('gather', 'short3.fa.sig', 'zzz', linear_gather, prefetch_gather)
+        runtmp.sourmash(
+            "gather", "short3.fa.sig", "zzz", linear_gather, prefetch_gather
+        )

     assert runtmp.last_result.status == -1
     assert "query signature needs to be created with --scaled" in runtmp.last_result.err


 def test_gather_deduce_ksize(runtmp, prefetch_gather, linear_gather):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'dna', '-p', 'k=23,scaled=10', testdata1, testdata2)
+    runtmp.sourmash("sketch", "dna", "-p", "k=23,scaled=10", testdata1, testdata2)

-    runtmp.sourmash('sketch','dna','-p','k=23,scaled=10', '-o', 'query.fa.sig', testdata2)
+    runtmp.sourmash(
+        "sketch", "dna", "-p", "k=23,scaled=10", "-o", "query.fa.sig", testdata2
+    )

-    runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

-    runtmp.sourmash('gather', 'query.fa.sig', 'zzz', prefetch_gather, linear_gather, '--threshold-bp=1')
+    runtmp.sourmash(
+        "gather",
+        "query.fa.sig",
+        "zzz",
+        prefetch_gather,
+        linear_gather,
+        "--threshold-bp=1",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out
+    assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out


 def test_gather_deduce_moltype(runtmp, linear_gather, prefetch_gather):
     # gather should automatically figure out ksize
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")

-    runtmp.sourmash('sketch', 'translate', '-p', 'k=10,scaled=10', testdata1,testdata2)
+    runtmp.sourmash("sketch", "translate", "-p", "k=10,scaled=10", testdata1, testdata2)

-    runtmp.sourmash('sketch', 'translate', '-p', 'k=10,scaled=10', '-o', 'query.fa.sig',testdata2)
+    runtmp.sourmash(
+        "sketch", "translate", "-p", "k=10,scaled=10", "-o", "query.fa.sig", testdata2
+    )

-    runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig')
+    runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig")

-    assert os.path.exists(runtmp.output('zzz.sbt.zip'))
+    assert os.path.exists(runtmp.output("zzz.sbt.zip"))

-    runtmp.sourmash('gather', 'query.fa.sig', 'zzz', linear_gather, prefetch_gather, '--threshold-bp=1')
+    runtmp.sourmash(
+        "gather",
+        "query.fa.sig",
+        "zzz",
+        linear_gather,
+        prefetch_gather,
+        "--threshold-bp=1",
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert '1.9 kbp 100.0% 100.0%' in runtmp.last_result.out
+    assert "1.9 kbp 100.0% 100.0%" in runtmp.last_result.out


 def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
@@ -4966,14 +6071,14 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
     # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance
     # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance

-    query = utils.get_test_data('gather-abund/reads-s10-s11.sig')
-    against_list = ['genome-s10', 'genome-s11', 'genome-s12']
-    against_list = ['gather-abund/' + i + '.fa.gz.sig'
-                    for i in against_list]
+    query = utils.get_test_data("gather-abund/reads-s10-s11.sig")
+    against_list = ["genome-s10", "genome-s11", "genome-s12"]
+    against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list]
     against_list = [utils.get_test_data(i) for i in against_list]

-    status, out, err = c.run_sourmash('gather', query, *against_list,
-                                      linear_gather, prefetch_gather)
+    status, out, err = c.run_sourmash(
+        "gather", query, *against_list, linear_gather, prefetch_gather
+    )

     print(out)
     print(err)

@@ -4985,9 +6090,9 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
     # (this is due to the low coverage of 2 used to build queries)
     # * approximately 2.0 abundance (third column, avg_abund)

-    assert '49.6% 78.5% 1.8 tests/test-data/genome-s10.fa.gz' in out
-    assert '50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
-    assert 'genome-s12.fa.gz' not in out
+    assert "49.6% 78.5% 1.8 tests/test-data/genome-s10.fa.gz" in out
+    assert "50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz" in out
+    assert "genome-s12.fa.gz" not in out

     assert "the recovered matches hit 100.0% of the abundance-weighted query" in out
     assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out
@@ -5003,15 +6108,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
     # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance
     # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance

-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against_list = ['genome-s10', 'genome-s11', 'genome-s12']
-    against_list = ['gather-abund/' + i + '.fa.gz.sig'
-                    for i in against_list]
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against_list = ["genome-s10", "genome-s11", "genome-s12"]
+    against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list]
     against_list = [utils.get_test_data(i) for i in against_list]

-    status, out, err = c.run_sourmash('gather', query, '-o', 'xxx.csv',
-                                      *against_list, linear_gather,
-                                      prefetch_gather)
+    status, out, err = c.run_sourmash(
+        "gather", query, "-o", "xxx.csv", *against_list, linear_gather, prefetch_gather
+    )

     print(out)
     print(err)

@@ -5025,14 +6129,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
     # * approximately 2.0 abundance (third column, avg_abund) for s11,
     # and (very) approximately 20x abundance for genome s10.

-    assert '91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz' in out
-    assert '9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
-    assert 'genome-s12.fa.gz' not in out
+    assert "91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz" in out
+    assert "9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz" in out
+    assert "genome-s12.fa.gz" not in out
     assert "the recovered matches hit 100.0% of the abundance-weighted query" in out

     # check the calculations behind the above output by looking into
     # the CSV.
-    with open(c.output('xxx.csv'), 'rt') as fp:
+    with open(c.output("xxx.csv")) as fp:
         r = csv.DictReader(fp)

         overlaps = []

@@ -5046,14 +6150,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
         total_weighted_list = []

         for n, row in enumerate(r):
-            assert int(row['gather_result_rank']) == n
+            assert int(row["gather_result_rank"]) == n

             # other than f_weighted, these are all 'flat' numbers - no abunds.
-            overlap = float(row['intersect_bp'])
-            remaining_bp = float(row['remaining_bp'])
-            unique_overlap = float(row['unique_intersect_bp'])
-            f_weighted = float(row['f_unique_weighted'])
-            average_abund = float(row['average_abund'])
+            overlap = float(row["intersect_bp"])
+            remaining_bp = float(row["remaining_bp"])
+            unique_overlap = float(row["unique_intersect_bp"])
+            f_weighted = float(row["f_unique_weighted"])
+            average_abund = float(row["average_abund"])

             overlaps.append(overlap)
             unique_overlaps.append(unique_overlap)

@@ -5062,14 +6166,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
             remaining_bps.append(remaining_bp)

             # also track weighted calculations
-            n_weighted_list.append(float(row['n_unique_weighted_found']))
-            sum_weighted_list.append(float(row['sum_weighted_found']))
-            total_weighted_list.append(float(row['total_weighted_hashes']))
+            n_weighted_list.append(float(row["n_unique_weighted_found"]))
+            sum_weighted_list.append(float(row["sum_weighted_found"]))
+            total_weighted_list.append(float(row["total_weighted_hashes"]))

         weighted_calc = []
-        for (overlap, average_abund) in zip(overlaps, average_abunds):
-            prod = overlap*average_abund
-            weighted_calc.append(prod) # @CTB redundant terms with below?
+        for overlap, average_abund in zip(overlaps, average_abunds):
+            prod = overlap * average_abund
+            weighted_calc.append(prod)  # @CTB redundant terms with below?

         total_weighted = sum(weighted_calc)
         for prod, f_weighted in zip(weighted_calc, f_weighted_list):

@@ -5103,6 +6207,7 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
         f_weighted = f_weighted_list[i]
         assert f_weighted == n_weighted / 7986
+
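The CSV checks above reduce to one identity: `f_unique_weighted` is the fraction of the abundance-weighted query assigned to a match, i.e. `n_unique_weighted_found / total_weighted_hashes`. A worked instance using numbers visible in this test (total_weighted_hashes is 7986, and gather reports ~91.0% of the weighted query for genome-s10):

    total_weighted_hashes = 7986
    f_unique_weighted = 0.910               # the '91.0% ...' column for genome-s10
    n_unique_weighted_found = f_unique_weighted * total_weighted_hashes
    print(round(n_unique_weighted_found))   # ~7267 weighted hashes assigned to s10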
 def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gather):
     # check gather with an abundance-weighted query, then flattened with
     # --ignore-abund

@@ -5115,18 +6220,21 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath
     # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance
     # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance

-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against_list = ['genome-s10', 'genome-s11', 'genome-s12']
-    against_list = ['gather-abund/' + i + '.fa.gz.sig'
-                    for i in against_list]
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against_list = ["genome-s10", "genome-s11", "genome-s12"]
+    against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list]
     against_list = [utils.get_test_data(i) for i in against_list]

-    status, out, err = c.run_sourmash('gather', query,
-                                      '--ignore-abundance',
-                                      *against_list,
-                                      linear_gather, prefetch_gather,
-                                      '-o', c.output('results.csv'))
-
+    status, out, err = c.run_sourmash(
+        "gather",
+        query,
+        "--ignore-abundance",
+        *against_list,
+        linear_gather,
+        prefetch_gather,
+        "-o",
+        c.output("results.csv"),
+    )

     print(out)
     print(err)

@@ -5139,38 +6247,47 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath
     # * approximately 100% of the high coverage genome being matched,
     # with only 80% of the low coverage genome

-    assert all(('57.2% 100.0%', 'tests/test-data/genome-s10.fa.gz' in out))
-    assert all(('42.8% 80.0%', 'tests/test-data/genome-s11.fa.gz' in out))
-    assert 'genome-s12.fa.gz' not in out
+    assert all(("57.2% 100.0%", "tests/test-data/genome-s10.fa.gz" in out))
+    assert all(("42.8% 80.0%", "tests/test-data/genome-s11.fa.gz" in out))
+    assert "genome-s12.fa.gz" not in out

-    with open(c.output('results.csv'), 'rt') as fp:
+    with open(c.output("results.csv")) as fp:
         r = csv.DictReader(fp)
         some_results = False
         for row in r:
             some_results = True
-            assert row['average_abund'] == ''
-            assert row['median_abund'] == ''
-            assert row['std_abund'] == ''
+            assert row["average_abund"] == ""
+            assert row["median_abund"] == ""
+            assert row["std_abund"] == ""

-            assert row['query_abundance'] == 'False', row['query_abundance']
-            assert row['n_unique_weighted_found'] == ''
+            assert row["query_abundance"] == "False", row["query_abundance"]
+            assert row["n_unique_weighted_found"] == ""

         assert some_results


-def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear_gather):
+def test_gather_output_unassigned_with_abundance(
+    runtmp, prefetch_gather, linear_gather
+):
     # check --output-unassigned with an abund query
     # @CTB: could add check on sum weighted etc.
     c = runtmp
-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig')
-
-    c.run_sourmash('gather', query, against, '--output-unassigned',
-                   c.output('unassigned.sig'), linear_gather, prefetch_gather)
-
-    assert os.path.exists(c.output('unassigned.sig'))
-
-    nomatch = sourmash.load_one_signature(c.output('unassigned.sig'))
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig")
+
+    c.run_sourmash(
+        "gather",
+        query,
+        against,
+        "--output-unassigned",
+        c.output("unassigned.sig"),
+        linear_gather,
+        prefetch_gather,
+    )
+
+    assert os.path.exists(c.output("unassigned.sig"))
+
+    nomatch = sourmash.load_one_signature(c.output("unassigned.sig"))
     assert nomatch.minhash.track_abundance

     query_ss = sourmash.load_one_signature(query)

@@ -5189,14 +6306,21 @@ def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear

 def test_gather_empty_db_fail(runtmp, linear_gather, prefetch_gather):
     # gather should fail on empty db with --fail-on-empty-database
-    query = utils.get_test_data('2.fa.sig')
-    against = utils.get_test_data('47.fa.sig')
-    against2 = utils.get_test_data('lca/47+63.lca.json')
+    query = utils.get_test_data("2.fa.sig")
+    against = utils.get_test_data("47.fa.sig")
+    against2 = utils.get_test_data("lca/47+63.lca.json")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('gather', query, against, against2, '-k', '51',
-                        linear_gather, prefetch_gather)
-
+        runtmp.sourmash(
+            "gather",
+            query,
+            against,
+            against2,
+            "-k",
+            "51",
+            linear_gather,
+            prefetch_gather,
+        )

     err = runtmp.last_result.err
     assert "no compatible signatures found in " in err

@@ -5204,13 +6328,21 @@ def test_gather_empty_db_fail(runtmp, linear_gather, prefetch_gather):

 def test_gather_empty_db_nofail(runtmp, prefetch_gather, linear_gather):
     # gather should not fail on empty db with --no-fail-on-empty-database
-    query = utils.get_test_data('2.fa.sig')
-    against = utils.get_test_data('47.fa.sig')
-    against2 = utils.get_test_data('lca/47+63.lca.json')
-
-    runtmp.sourmash('gather', query, against, against2, '-k', '51',
-                    '--no-fail-on-empty-data',
-                    linear_gather, prefetch_gather)
+    query = utils.get_test_data("2.fa.sig")
+    against = utils.get_test_data("47.fa.sig")
+    against2 = utils.get_test_data("lca/47+63.lca.json")
+
+    runtmp.sourmash(
+        "gather",
+        query,
+        against,
+        against2,
+        "-k",
+        "51",
+        "--no-fail-on-empty-data",
+        linear_gather,
+        prefetch_gather,
+    )

     out = runtmp.last_result.out
     err = runtmp.last_result.err

@@ -5218,16 +6350,20 @@ def test_gather_empty_db_nofail(runtmp, prefetch_gather, linear_gather):
     print(err)

     assert "no compatible signatures found in " in err
-    assert "ksize on this database is 31; this is different from requested ksize of 51" in err
+    assert (
+        "ksize on this database is 31; this is different from requested ksize of 51"
+        in err
+    )
     assert "loaded 50 total signatures from 2 locations" in err
     assert "after selecting signatures compatible with search, 0 remain." in err

+
 def test_multigather_output_unassigned_with_abundance(runtmp):
     c = runtmp
-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig')
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig")

-    cmd = 'multigather --query {} --db {}'.format(query, against).split()
+    cmd = f"multigather --query {query} --db {against}".split()
     c.run_sourmash(*cmd)

     print(c.last_result.out)

@@ -5237,9 +6373,9 @@ def test_multigather_output_unassigned_with_abundance(runtmp):
     assert "the recovered matches hit 91.0% of the abundance-weighted query." in out
     assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out

-    assert os.path.exists(c.output('r3.fa.unassigned.sig'))
+    assert os.path.exists(c.output("r3.fa.unassigned.sig"))

-    nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig'))
+    nomatch = sourmash.load_one_signature(c.output("r3.fa.unassigned.sig"))
     assert nomatch.minhash.track_abundance

     query_ss = sourmash.load_one_signature(query)

@@ -5258,13 +6394,14 @@ def test_multigather_output_unassigned_with_abundance(runtmp):

 def test_multigather_empty_db_fail(runtmp):
     # multigather should fail on empty db with --fail-on-empty-database
-    query = utils.get_test_data('2.fa.sig')
-    against = utils.get_test_data('47.fa.sig')
-    against2 = utils.get_test_data('lca/47+63.lca.json')
+    query = utils.get_test_data("2.fa.sig")
+    against = utils.get_test_data("47.fa.sig")
+    against2 = utils.get_test_data("lca/47+63.lca.json")

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('multigather', '--query', query,
-                        '--db', against, against2, '-k', '51')
+        runtmp.sourmash(
+            "multigather", "--query", query, "--db", against, against2, "-k", "51"
+        )

     err = runtmp.last_result.err
     assert "no compatible signatures found in " in err

@@ -5272,13 +6409,21 @@ def test_multigather_empty_db_fail(runtmp):

 def test_multigather_empty_db_nofail(runtmp):
     # multigather should not fail on empty db with --no-fail-on-empty-database
-    query = utils.get_test_data('2.fa.sig')
-    against = utils.get_test_data('47.fa.sig')
-    against2 = utils.get_test_data('lca/47+63.lca.json')
-
-    runtmp.sourmash('multigather', '--query', query,
-                    '--db', against, against2, '-k', '51',
-                    '--no-fail-on-empty-data')
+    query = utils.get_test_data("2.fa.sig")
+    against = utils.get_test_data("47.fa.sig")
+    against2 = utils.get_test_data("lca/47+63.lca.json")
+
+    runtmp.sourmash(
+        "multigather",
+        "--query",
+        query,
+        "--db",
+        against,
+        against2,
+        "-k",
+        "51",
+        "--no-fail-on-empty-data",
+    )

     out = runtmp.last_result.out
     err = runtmp.last_result.err

@@ -5286,7 +6431,10 @@ def test_multigather_empty_db_nofail(runtmp):
     print(err)

     assert "no compatible signatures found in " in err
-    assert "ksize on this database is 31; this is different from requested ksize of 51" in err
+    assert (
+        "ksize on this database is 31; this is different from requested ksize of 51"
+        in err
+    )
     assert "conducted gather searches on 0 signatures" in err
     assert "loaded 50 total signatures from 2 locations" in err
     assert "after selecting signatures compatible with search, 0 remain." in err

@@ -5294,53 +6442,54 @@ def test_multigather_empty_db_nofail(runtmp):

 def test_multigather_nomatch(runtmp):
     testdata_query = utils.get_test_data(
-        'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
-    testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+        "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig"
+    )
+    testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")

-    runtmp.sourmash('multigather', '--query', testdata_query,
-                    '--db', testdata_match, '-k', '31')
+    runtmp.sourmash(
+        "multigather", "--query", testdata_query, "--db", testdata_match, "-k", "31"
+    )

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'found 0 matches total' in runtmp.last_result.out
-    assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out
+    assert "found 0 matches total" in runtmp.last_result.out
+    assert "the recovered matches hit 0.0% of the query" in runtmp.last_result.out


 def test_multigather_abund_nomatch(runtmp):
-    testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
+    testdata_query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    testdata_match = utils.get_test_data(
+        "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig"
+    )

-    runtmp.sourmash('multigather', '--query', testdata_query,
-                    '--db', testdata_match)
+    runtmp.sourmash("multigather", "--query", testdata_query, "--db", testdata_match)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'found 0 matches total' in runtmp.last_result.out
-    assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out
+    assert "found 0 matches total" in runtmp.last_result.out
+    assert "the recovered matches hit 0.0% of the query" in runtmp.last_result.out
 def test_sbt_categorize(runtmp):
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")

     # all four in the current directory for categorize .
-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
-    shutil.copyfile(testdata2, runtmp.output('2.sig'))
-    shutil.copyfile(testdata3, runtmp.output('3.sig'))
-    shutil.copyfile(testdata4, runtmp.output('4.sig'))
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))
+    shutil.copyfile(testdata2, runtmp.output("2.sig"))
+    shutil.copyfile(testdata3, runtmp.output("3.sig"))
+    shutil.copyfile(testdata4, runtmp.output("4.sig"))

     # omit 3
-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"]
     runtmp.sourmash(*args)

-    # categorize all of the ones that were copied to 'location'
-    args = ['categorize', 'zzz', '.',
-            '--ksize', '21', '--dna', '--csv', 'out.csv']
+    args = ["categorize", "zzz", ".", "--ksize", "21", "--dna", "--csv", "out.csv"]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)

@@ -5348,27 +6497,34 @@ def test_sbt_categorize(runtmp):
     # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz
     # yields 521/1000 ==> ~0.5
-    assert 'for genome-s10+s11, found: 0.50 genome-s10' in runtmp.last_result.err
+    assert "for genome-s10+s11, found: 0.50 genome-s10" in runtmp.last_result.err

-    out_csv = Path(runtmp.output('out.csv')).read_text()
+    out_csv = Path(runtmp.output("out.csv")).read_text()
     print(out_csv)
-    assert '4.sig,genome-s10+s11,genome-s10,0.504' in out_csv
+    assert "4.sig,genome-s10+s11,genome-s10,0.504" in out_csv


 def test_sbt_categorize_ignore_abundance_1(runtmp):
     # --- Categorize without ignoring abundance ---
-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against_list = ['reads-s10-s11']
-    against_list = ['gather-abund/' + i + '.sig'
-                    for i in against_list]
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against_list = ["reads-s10-s11"]
+    against_list = ["gather-abund/" + i + ".sig" for i in against_list]
     against_list = [utils.get_test_data(i) for i in against_list]

     # omit 3
-    args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
+    args = ["index", "--dna", "-k", "21", "thebestdatabase"] + against_list
     runtmp.sourmash(*args)

-    args = ['categorize', 'thebestdatabase',
-            '--ksize', '21', '--dna', '--csv', 'out3.csv', query]
+    args = [
+        "categorize",
+        "thebestdatabase",
+        "--ksize",
+        "21",
+        "--dna",
+        "--csv",
+        "out3.csv",
+        query,
+    ]

     with pytest.raises(SourmashCommandFailed):
         runtmp.sourmash(*args)

@@ -5377,120 +6533,138 @@ def test_sbt_categorize_ignore_abundance_1(runtmp):
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert "ERROR: this search cannot be done on signatures calculated with abundance." in runtmp.last_result.err
+    assert (
+        "ERROR: this search cannot be done on signatures calculated with abundance."
+        in runtmp.last_result.err
+    )
     assert "ERROR: please specify --ignore-abundance." in runtmp.last_result.err


 def test_sbt_categorize_ignore_abundance_3(runtmp):
     # --- Now categorize with ignored abundance ---
-    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-    against_list = ['reads-s10-s11']
-    against_list = ['gather-abund/' + i + '.sig'
-                    for i in against_list]
+    query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig")
+    against_list = ["reads-s10-s11"]
+    against_list = ["gather-abund/" + i + ".sig" for i in against_list]
     against_list = [utils.get_test_data(i) for i in against_list]

     # omit 3
-    args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
+    args = ["index", "--dna", "-k", "21", "thebestdatabase"] + against_list
     runtmp.sourmash(*args)

-    args = ['categorize', '--ignore-abundance',
-            '--ksize', '21', '--dna', '--csv', 'out4.csv',
-            'thebestdatabase', query]
+    args = [
+        "categorize",
+        "--ignore-abundance",
+        "--ksize",
+        "21",
+        "--dna",
+        "--csv",
+        "out4.csv",
+        "thebestdatabase",
+        query,
+    ]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'for 1-1, found: 0.88 1-1' in runtmp.last_result.err
+    assert "for 1-1, found: 0.88 1-1" in runtmp.last_result.err

-    out_csv4 = Path(runtmp.output('out4.csv')).read_text()
-    assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4
+    out_csv4 = Path(runtmp.output("out4.csv")).read_text()
+    assert "reads-s10x10-s11.sig,1-1,1-1,0.87699" in out_csv4


 def test_sbt_categorize_already_done(runtmp):
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")

-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
-    shutil.copyfile(testdata2, runtmp.output('2.sig'))
-    shutil.copyfile(testdata3, runtmp.output('3.sig'))
-    shutil.copyfile(testdata4, runtmp.output('4.sig'))
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))
+    shutil.copyfile(testdata2, runtmp.output("2.sig"))
+    shutil.copyfile(testdata3, runtmp.output("3.sig"))
+    shutil.copyfile(testdata4, runtmp.output("4.sig"))

     # omit 3
-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"]
     runtmp.sourmash(*args)

-    with open(runtmp.output('in.csv'), 'wt') as fp:
-        fp.write('./4.sig,genome-s10.fa.gz,0.50')
-
-    args = ['categorize', 'zzz', './2.sig', './4.sig',
-            '--ksize', '21', '--dna', '--load-csv', 'in.csv']
+    with open(runtmp.output("in.csv"), "w") as fp:
+        fp.write("./4.sig,genome-s10.fa.gz,0.50")
+
+    args = [
+        "categorize",
+        "zzz",
+        "./2.sig",
+        "./4.sig",
+        "--ksize",
+        "21",
+        "--dna",
+        "--load-csv",
+        "in.csv",
+    ]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'for genome-s11.fa.gz, no match found'
-    assert not 'for s10+s11, found: 0.50 genome-s10.fa.gz' in runtmp.last_result.err
+    assert "for genome-s11.fa.gz, no match found"
+    assert "for s10+s11, found: 0.50 genome-s10.fa.gz" not in runtmp.last_result.err


 def test_sbt_categorize_already_done_traverse(runtmp):
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
-    testdata4 = utils.get_test_data('genome-s10+s11.sig')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")
+    testdata4 = utils.get_test_data("genome-s10+s11.sig")

-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
-    shutil.copyfile(testdata2, runtmp.output('2.sig'))
-    shutil.copyfile(testdata3, runtmp.output('3.sig'))
-    shutil.copyfile(testdata4, runtmp.output('4.sig'))
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))
+    shutil.copyfile(testdata2, runtmp.output("2.sig"))
+    shutil.copyfile(testdata3, runtmp.output("3.sig"))
+    shutil.copyfile(testdata4, runtmp.output("4.sig"))

     # omit 3
-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"]
     runtmp.sourmash(*args)

-    with open(runtmp.output('in.csv'), 'wt') as fp:
-        fp.write('./4.sig,genome-s10.fa.gz,0.50')
+    with open(runtmp.output("in.csv"), "w") as fp:
+        fp.write("./4.sig,genome-s10.fa.gz,0.50")

-    args = ['categorize', 'zzz', '.',
-            '--ksize', '21', '--dna', '--load-csv', 'in.csv']
+    args = ["categorize", "zzz", ".", "--ksize", "21", "--dna", "--load-csv", "in.csv"]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'for genome-s11.fa.gz, no match found'
-    assert not 'for s10+s11, found: 0.50 genome-s10.fa.gz' in runtmp.last_result.err
+    assert "for genome-s11.fa.gz, no match found"
+    assert "for s10+s11, found: 0.50 genome-s10.fa.gz" not in runtmp.last_result.err


 def test_sbt_categorize_multiple_ksizes_moltypes(runtmp):
     # 'categorize' works fine with multiple moltypes/ksizes
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
-    testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    testdata2 = utils.get_test_data("genome-s11.fa.gz.sig")
+    testdata3 = utils.get_test_data("genome-s12.fa.gz.sig")

-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
-    shutil.copyfile(testdata2, runtmp.output('2.sig'))
-    shutil.copyfile(testdata3, runtmp.output('3.sig'))
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))
+    shutil.copyfile(testdata2, runtmp.output("2.sig"))
+    shutil.copyfile(testdata3, runtmp.output("3.sig"))

-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"]
     runtmp.sourmash(*args)

-    args = ['categorize', 'zzz', '.']
+    args = ["categorize", "zzz", "."]
     runtmp.sourmash(*args)


 def test_watch_check_num_bounds_negative(runtmp):
     # check that watch properly outputs error on negative num
     c = runtmp
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, c.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, c.output("1.sig"))

-    c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig')
+    c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('watch', '--ksize', '21', '-n', '-5', '--dna', 'zzz', testdata0)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("watch", "--ksize", "21", "-n", "-5", "--dna", "zzz", testdata0)

     assert "ERROR: num value must be positive" in c.last_result.err
@@ -5498,13 +6672,13 @@ def test_watch_check_num_bounds_negative(runtmp):

 def test_watch_check_num_bounds_less_than_minimum(runtmp):
     # check that watch properly outputs warnings on small num
     c = runtmp
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, c.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, c.output("1.sig"))

-    c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig')
+    c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig")

-    c.run_sourmash('watch', '--ksize', '21', '-n', '25', '--dna', 'zzz', testdata0)
+    c.run_sourmash("watch", "--ksize", "21", "-n", "25", "--dna", "zzz", testdata0)

     assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err

@@ -5512,113 +6686,124 @@ def test_watch_check_num_bounds_less_than_minimum(runtmp):

 def test_watch_check_num_bounds_more_than_maximum(runtmp):
     # check that watch properly outputs warnings on large num
     c = runtmp
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, c.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, c.output("1.sig"))

-    c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig')
+    c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig")

-    c.run_sourmash('watch', '--ksize', '21', '-n', '100000', '--dna', 'zzz', testdata0)
+    c.run_sourmash("watch", "--ksize", "21", "-n", "100000", "--dna", "zzz", testdata0)

-    assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err
+    assert (
+        "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err
+    )


 def test_watch(runtmp):
     # check basic watch functionality
     c = runtmp
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, c.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, c.output("1.sig"))

-    c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig')
+    c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig")

-    c.run_sourmash('watch', '--ksize', '21', '--dna', 'zzz', testdata0)
+    c.run_sourmash("watch", "--ksize", "21", "--dna", "zzz", testdata0)

     print(c.last_result.out)
     print(c.last_result.err)

-    assert 'FOUND: genome-s10, at 1.000' in c.last_result.out
+    assert "FOUND: genome-s10, at 1.000" in c.last_result.out


 def test_watch_deduce_ksize(runtmp):
     # check that watch guesses ksize automatically from database
     c = runtmp
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    c.run_sourmash('sketch','dna','-p','k=29,num=500', '-o', '1.sig', testdata0)
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    c.run_sourmash("sketch", "dna", "-p", "k=29,num=500", "-o", "1.sig", testdata0)

-    c.run_sourmash('index', '--dna', '-k', '29', 'zzz', '1.sig')
+    c.run_sourmash("index", "--dna", "-k", "29", "zzz", "1.sig")

-    c.run_sourmash('watch', '--dna', 'zzz', testdata0)
+    c.run_sourmash("watch", "--dna", "zzz", testdata0)

     print(c.last_result.out)
     print(c.last_result.err)

-    assert 'Computing signature for k=29' in c.last_result.err
-    assert 'genome-s10.fa.gz, at 1.000' in c.last_result.out
+    assert "Computing signature for k=29" in c.last_result.err
+    assert "genome-s10.fa.gz, at 1.000" in c.last_result.out


 def test_watch_coverage(runtmp):
     # check output details/coverage of found
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))

-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig"]
     runtmp.sourmash(*args)

-    with open(runtmp.output('query.fa'), 'wt') as fp:
+    with open(runtmp.output("query.fa"), "w") as fp:
         record = list(screed.open(testdata0))[0]
         for start in range(0, len(record), 100):
-            fp.write('>{}\n{}\n'.format(start,
-                                        record.sequence[start:start+500]))
+            fp.write(f">{start}\n{record.sequence[start : start + 500]}\n")

-    args = ['watch', '--ksize', '21', '--dna', 'zzz', 'query.fa']
+    args = ["watch", "--ksize", "21", "--dna", "zzz", "query.fa"]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert 'FOUND: genome-s10, at 1.000' in runtmp.last_result.out
+    assert "FOUND: genome-s10, at 1.000" in runtmp.last_result.out


 def test_watch_output_sig(runtmp):
     # test watch --output
-    testdata0 = utils.get_test_data('genome-s10.fa.gz')
-    testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
-    shutil.copyfile(testdata1, runtmp.output('1.sig'))
+    testdata0 = utils.get_test_data("genome-s10.fa.gz")
+    testdata1 = utils.get_test_data("genome-s10.fa.gz.sig")
+    shutil.copyfile(testdata1, runtmp.output("1.sig"))

-    args = ['index', '--dna', '-k', '21', 'zzz', '1.sig']
+    args = ["index", "--dna", "-k", "21", "zzz", "1.sig"]
     runtmp.sourmash(*args)

-    with open(runtmp.output('query.fa'), 'wt') as fp:
+    with open(runtmp.output("query.fa"), "w") as fp:
         record = list(screed.open(testdata0))[0]
         for start in range(0, len(record), 100):
-            fp.write('>{}\n{}\n'.format(start,
-                                        record.sequence[start:start+500]))
-
-    args = ['watch', '--ksize', '21', '--dna', 'zzz', 'query.fa',
-            '-o', 'out.sig', '--name', 'xyzfoo']
+            fp.write(f">{start}\n{record.sequence[start : start + 500]}\n")
+
+    args = [
+        "watch",
+        "--ksize",
+        "21",
+        "--dna",
+        "zzz",
+        "query.fa",
+        "-o",
+        "out.sig",
+        "--name",
+        "xyzfoo",
+    ]
     runtmp.sourmash(*args)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    out_sig = runtmp.output('out.sig')
+    out_sig = runtmp.output("out.sig")
     assert os.path.exists(out_sig)

     siglist = list(sourmash.load_file_as_signatures(out_sig))
     assert len(siglist) == 1
-    assert siglist[0].filename == 'stdin'
-    assert siglist[0].name == 'xyzfoo'
+    assert siglist[0].filename == "stdin"
+    assert siglist[0].name == "xyzfoo"


 def test_storage_convert(runtmp):
-    testdata = utils.get_test_data('v2.sbt.json')
-    shutil.copyfile(testdata, runtmp.output('v2.sbt.json'))
-    shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v2'),
-                    runtmp.output('.sbt.v2'))
-    testsbt = runtmp.output('v2.sbt.json')
+    testdata = utils.get_test_data("v2.sbt.json")
+    shutil.copyfile(testdata, runtmp.output("v2.sbt.json"))
+    shutil.copytree(
+        os.path.join(os.path.dirname(testdata), ".sbt.v2"), runtmp.output(".sbt.v2")
+    )
+    testsbt = runtmp.output("v2.sbt.json")

     original = SBT.load(testsbt, leaf_loader=SigLeaf.load)

-    args = ['storage', 'convert', '-b', 'ipfs', testsbt]
+    args = ["storage", "convert", "-b", "ipfs", testsbt]
     try:
         runtmp.sourmash(*args)
     except SourmashCommandFailed:

@@ -5626,151 +6811,165 @@ def test_storage_convert(runtmp):

     if runtmp.last_result.status:
         if "ipfshttpclient.ConnectionError" in runtmp.last_result.err:
-            raise pytest.xfail('ipfs probably not running')
+            raise pytest.xfail("ipfs probably not running")
         if "No module named 'ipfshttpclient'" in runtmp.last_result.err:
-            raise pytest.xfail('ipfshttpclient module not installed')
+            raise pytest.xfail("ipfshttpclient module not installed")

     print("NO FAIL; KEEP ON GOING!")
-
     ipfs = SBT.load(testsbt, leaf_loader=SigLeaf.load)

     assert len(original) == len(ipfs)
-    assert all(n1[1].name == n2[1].name
-               for (n1, n2) in zip(sorted(original), sorted(ipfs)))
-
-    args = ['storage', 'convert',
-            '-b', """'ZipStorage("{}")'""".format(
-                runtmp.output('v2.sbt.zip')),
-            testsbt]
+    assert all(
+        n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(ipfs))
+    )
+
+    args = [
+        "storage",
+        "convert",
+        "-b",
+        """'ZipStorage("{}")'""".format(runtmp.output("v2.sbt.zip")),
+        testsbt,
+    ]
     runtmp.sourmash(*args)

     tar = SBT.load(testsbt, leaf_loader=SigLeaf.load)

     assert len(original) == len(tar)
-    assert all(n1[1].name == n2[1].name
-               for (n1, n2) in zip(sorted(original), sorted(tar)))
+    assert all(
+        n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(tar))
+    )
     print("it all worked!!")
leaf_loader=SigLeaf.load) - args = ['storage', 'convert', '-b', 'fsstorage', testsbt] + args = ["storage", "convert", "-b", "fsstorage", testsbt] runtmp.sourmash(*args) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) def test_storage_convert_fsstorage_newpath(runtmp): - testdata = utils.get_test_data('v2.sbt.json') - shutil.copyfile(testdata, runtmp.output('v2.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v2'), - runtmp.output('.sbt.v2')) - testsbt = runtmp.output('v2.sbt.json') + testdata = utils.get_test_data("v2.sbt.json") + shutil.copyfile(testdata, runtmp.output("v2.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v2"), runtmp.output(".sbt.v2") + ) + testsbt = runtmp.output("v2.sbt.json") original = SBT.load(testsbt, leaf_loader=SigLeaf.load) - args = ['storage', 'convert', - '-b', 'fsstorage({})'.format(runtmp.output('v3')), - testsbt] + args = [ + "storage", + "convert", + "-b", + "fsstorage({})".format(runtmp.output("v3")), + testsbt, + ] runtmp.sourmash(*args) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) def test_migrate(runtmp): - testdata = utils.get_test_data('v3.sbt.json') - shutil.copyfile(testdata, runtmp.output('v3.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v3'), - runtmp.output('.sbt.v3')) - testsbt = runtmp.output('v3.sbt.json') + testdata = utils.get_test_data("v3.sbt.json") + shutil.copyfile(testdata, runtmp.output("v3.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v3"), runtmp.output(".sbt.v3") + ) + testsbt = runtmp.output("v3.sbt.json") original = SBT.load(testsbt, leaf_loader=SigLeaf.load) - runtmp.sourmash('migrate', testsbt) + runtmp.sourmash("migrate", testsbt) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), - sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) assert "this is an old index version" not in runtmp.last_result.err - assert all('min_n_below' in node.metadata - for node in identity - if isinstance(node, Node)) + assert all( + "min_n_below" in node.metadata for node in identity if isinstance(node, Node) + ) def test_license_cc0(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch','translate', '-p', 'k=31', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=31", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") - assert sig.license == 'CC0' + assert sig.license == "CC0" def test_license_non_cc0(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', 
'-p','k=31', '--license', 'GPL', testdata1) + runtmp.sourmash( + "sketch", "translate", "-p", "k=31", "--license", "GPL", testdata1 + ) assert runtmp.last_result.status != 0 print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'sourmash only supports CC0' in runtmp.last_result.err + assert "sourmash only supports CC0" in runtmp.last_result.err def test_license_load_non_cc0(): - sigfile = utils.get_test_data('bad-license.sig') + sigfile = utils.get_test_data("bad-license.sig") try: - sig = next(signature.load_signatures(sigfile, do_raise=True)) + next(signature.load_signatures(sigfile, do_raise=True)) except Exception as e: assert "sourmash only supports CC0-licensed signatures" in str(e) @utils.in_tempdir def test_do_sourmash_index_zipfile(c): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip', - *testdata_sigs) + c.run_sourmash("index", "-k", "31", "zzz.sbt.zip", *testdata_sigs) - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err # look internally at the zip file with zipfile.ZipFile(outfile) as zf: content = zf.namelist() assert len(content) == 26 - assert len([c for c in content if 'internal' in c]) == 11 + assert len([c for c in content if "internal" in c]) == 11 assert ".sbt.zzz/" in content sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 @@ -5779,7 +6978,7 @@ def test_do_sourmash_index_zipfile(c): @utils.in_tempdir def test_do_sourmash_index_zipfile_append(c): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) half_point = int(len(testdata_sigs) / 2) first_half = testdata_sigs[:half_point] @@ -5792,35 +6991,33 @@ def test_do_sourmash_index_zipfile_append(c): assert not set(first_half).intersection(set(second_half)) with warnings.catch_warnings(record=True) as record: - c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip', - *first_half) + c.run_sourmash("index", "-k", "31", "zzz.sbt.zip", *first_half) # UserWarning is raised when there are duplicated entries in the zipfile assert not record, record - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err with warnings.catch_warnings(record=True) as record: - c.run_sourmash('index', "--append", '-k', '31', 'zzz.sbt.zip', - *second_half) + c.run_sourmash("index", "--append", "-k", "31", "zzz.sbt.zip", *second_half) # UserWarning is raised when there are duplicated entries in the zipfile print(record) - #assert not record, record + # assert not record, record print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err # look internally at the zip file with zipfile.ZipFile(outfile) as zf: content = zf.namelist() print(content) assert len(content) == 26 - assert len([c for c in content if 'internal' in c]) == 11 + assert len([c for c in content if "internal" 
in c]) == 11 assert ".sbt.zzz/" in content sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 @@ -5829,13 +7026,14 @@ def test_do_sourmash_index_zipfile_append(c): def test_index_with_picklist(runtmp): # test 'sourmash index' with picklists - gcf_sig_dir = utils.get_test_data('gather/') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sig_dir = utils.get_test_data("gather/") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - output_db = runtmp.output('thermo.sbt.zip') + output_db = runtmp.output("thermo.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, - '-k', '31', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "index", output_db, gcf_sig_dir, "-k", "31", "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err print(err) @@ -5848,18 +7046,25 @@ def test_index_with_picklist(runtmp): siglist = list(sourmash.load_file_as_signatures(output_db)) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_index_with_picklist_exclude(runtmp): # test 'sourmash index' with picklists - exclude - gcf_sig_dir = utils.get_test_data('gather/') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sig_dir = utils.get_test_data("gather/") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - output_db = runtmp.output('thermo-exclude.sbt.zip') + output_db = runtmp.output("thermo-exclude.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, - '-k', '31', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "index", + output_db, + gcf_sig_dir, + "-k", + "31", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -5869,35 +7074,43 @@ def test_index_with_picklist_exclude(runtmp): siglist = list(sourmash.load_file_as_signatures(output_db)) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_index_matches_search_with_picklist(runtmp): # test 'sourmash index' with picklists - gcf_sig_dir = utils.get_test_data('gather/') - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sig_dir = utils.get_test_data("gather/") + glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + metag_sig = utils.get_test_data("gather/combined.sig") - output_db = runtmp.output('thermo.sbt.zip') + output_db = runtmp.output("thermo.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + runtmp.sourmash("index", output_db, gcf_sig_dir, "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) # verify: siglist = list(sourmash.load_file_as_signatures(output_db)) - assert len(siglist) > 3 # all signatures included... + assert len(siglist) > 3 # all signatures included... 
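    # Illustrative, commented-out sketch (not part of the test): the picklist
    # argument used further down is a colon-separated "CSV_PATH:COLUMN:COLTYPE"
    # triple, so f"{picklist}:md5:md5" matches the CSV's "md5" column against
    # each signature's md5 digest. Roughly the same filtering via the Python
    # API would look like:
    #
    #   with open(picklist) as fp:
    #       wanted = {row["md5"] for row in csv.DictReader(fp)}
    #   picked = [ss for ss in siglist if ss.md5sum() in wanted]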
n_thermo = 0 for ss in siglist: - if 'Thermotoga' in ss.name: + if "Thermotoga" in ss.name: n_thermo += 1 assert n_thermo == 3 - runtmp.sourmash('search', metag_sig, output_db, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "search", + metag_sig, + output_db, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + ) err = runtmp.last_result.err print(err) @@ -5915,30 +7128,38 @@ def test_index_matches_search_with_picklist(runtmp): def test_index_matches_search_with_picklist_exclude(runtmp): # test 'sourmash index' with picklists - exclude - gcf_sig_dir = utils.get_test_data('gather/') - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sig_dir = utils.get_test_data("gather/") + glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + metag_sig = utils.get_test_data("gather/combined.sig") - output_db = runtmp.output('thermo-exclude.sbt.zip') + output_db = runtmp.output("thermo-exclude.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + runtmp.sourmash("index", output_db, gcf_sig_dir, "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) # verify: siglist = list(sourmash.load_file_as_signatures(output_db)) - assert len(siglist) > 3 # all signatures included... + assert len(siglist) > 3 # all signatures included... n_thermo = 0 for ss in siglist: - if 'Thermotoga' in ss.name: + if "Thermotoga" in ss.name: n_thermo += 1 assert n_thermo == 3 - runtmp.sourmash('search', metag_sig, output_db, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "search", + metag_sig, + output_db, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -5956,12 +7177,11 @@ def test_index_matches_search_with_picklist_exclude(runtmp): def test_gather_with_prefetch_picklist(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash prefetch' output - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - prefetch_csv = runtmp.output('prefetch-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + prefetch_csv = runtmp.output("prefetch-out.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '-k', '21', '-o', prefetch_csv) + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "-k", "21", "-o", prefetch_csv) err = runtmp.last_result.err print(err) @@ -5970,12 +7190,22 @@ def test_gather_with_prefetch_picklist(runtmp, linear_gather): print(out) assert "total of 12 matching signatures." in err - assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1466 were found in matches above threshold." 
+ in err + ) # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{prefetch_csv}:match_md5:md5short') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{prefetch_csv}:match_md5:md5short", + ) err = runtmp.last_result.err print(err) @@ -5993,12 +7223,11 @@ def test_gather_with_prefetch_picklist(runtmp, linear_gather): def test_gather_with_prefetch_picklist_2_prefetch(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash prefetch' output # using ::prefetch - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - prefetch_csv = runtmp.output('prefetch-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + prefetch_csv = runtmp.output("prefetch-out.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '-k', '21', '-o', prefetch_csv) + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "-k", "21", "-o", prefetch_csv) err = runtmp.last_result.err print(err) @@ -6007,12 +7236,22 @@ def test_gather_with_prefetch_picklist_2_prefetch(runtmp, linear_gather): print(out) assert "total of 12 matching signatures." in err - assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1466 were found in matches above threshold." + in err + ) # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{prefetch_csv}::prefetch') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{prefetch_csv}::prefetch", + ) err = runtmp.last_result.err print(err) @@ -6031,12 +7270,11 @@ def test_gather_with_prefetch_picklist_3_gather(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash gather' output, # using ::gather. # (this doesn't really do anything useful, but it's an ok test :) - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - gather_csv = runtmp.output('gather-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + gather_csv = runtmp.output("gather-out.csv") - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '-o', gather_csv) + runtmp.sourmash("gather", metag_sig, *gcf_sigs, "-k", "21", "-o", gather_csv) err = runtmp.last_result.err print(err) @@ -6051,9 +7289,16 @@ def test_gather_with_prefetch_picklist_3_gather(runtmp, linear_gather): assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out # now, do another gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{gather_csv}::gather') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{gather_csv}::gather", + ) err = runtmp.last_result.err print(err) @@ -6072,12 +7317,11 @@ def test_gather_with_prefetch_picklist_3_gather_badcol(runtmp): # test 'gather' using a picklist taken from 'sourmash gather' output, # using ::gather. 
# (this doesn't really do anything useful, but it's an ok test :) - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - gather_csv = runtmp.output('gather-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + gather_csv = runtmp.output("gather-out.csv") - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '-o', gather_csv) + runtmp.sourmash("gather", metag_sig, *gcf_sigs, "-k", "21", "-o", gather_csv) err = runtmp.last_result.err print(err) @@ -6094,9 +7338,15 @@ def test_gather_with_prefetch_picklist_3_gather_badcol(runtmp): # now, do another gather with the results, but with a bad picklist # parameter with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '--picklist', - f'{gather_csv}:FOO:gather') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{gather_csv}:FOO:gather", + ) err = runtmp.last_result.err print(err) @@ -6112,11 +7362,11 @@ def test_gather_with_prefetch_picklist_4_manifest(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash sig manifest' # output, using ::manifest. # (this doesn't really do anything useful, but it's an ok test :) - gather_dir = utils.get_test_data('gather/') - metag_sig = utils.get_test_data('gather/combined.sig') - manifest_csv = runtmp.output('manifest.csv') + gather_dir = utils.get_test_data("gather/") + metag_sig = utils.get_test_data("gather/combined.sig") + manifest_csv = runtmp.output("manifest.csv") - runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv) + runtmp.sourmash("sig", "manifest", gather_dir, "-o", manifest_csv) err = runtmp.last_result.err print(err) @@ -6125,9 +7375,16 @@ def test_gather_with_prefetch_picklist_4_manifest(runtmp, linear_gather): print(out) # now, do a gather on the manifest - runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather, - '-k', '21', '--picklist', - f'{manifest_csv}::manifest') + runtmp.sourmash( + "gather", + metag_sig, + gather_dir, + linear_gather, + "-k", + "21", + "--picklist", + f"{manifest_csv}::manifest", + ) err = runtmp.last_result.err print(err) @@ -6146,11 +7403,11 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash sig manifest' # output, using ::manifest. 
# (this doesn't really do anything useful, but it's an ok test :) - gather_dir = utils.get_test_data('gather/') - metag_sig = utils.get_test_data('gather/combined.sig') - manifest_csv = runtmp.output('manifest.csv') + gather_dir = utils.get_test_data("gather/") + metag_sig = utils.get_test_data("gather/combined.sig") + manifest_csv = runtmp.output("manifest.csv") - runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv) + runtmp.sourmash("sig", "manifest", gather_dir, "-o", manifest_csv) err = runtmp.last_result.err print(err) @@ -6159,9 +7416,16 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): print(out) # now, do a gather on the manifest - runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather, - '-k', '21', '--picklist', - f'{manifest_csv}::manifest:exclude') + runtmp.sourmash( + "gather", + metag_sig, + gather_dir, + linear_gather, + "-k", + "21", + "--picklist", + f"{manifest_csv}::manifest:exclude", + ) err = runtmp.last_result.err print(err) @@ -6176,12 +7440,13 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): def test_gather_with_prefetch_picklist_5_search(runtmp): # test 'gather' using a picklist taken from 'sourmash prefetch' output # using ::prefetch - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - search_csv = runtmp.output('search-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + search_csv = runtmp.output("search-out.csv") - runtmp.sourmash('search', '--containment', metag_sig, *gcf_sigs, - '-k', '21', '-o', search_csv) + runtmp.sourmash( + "search", "--containment", metag_sig, *gcf_sigs, "-k", "21", "-o", search_csv + ) err = runtmp.last_result.err print(err) @@ -6193,9 +7458,15 @@ def test_gather_with_prefetch_picklist_5_search(runtmp): assert " 33.2% NC_003198.1 Salmonella enterica subsp." in out # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '--picklist', - f'{search_csv}::search') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{search_csv}::search", + ) err = runtmp.last_result.err print(err) @@ -6212,17 +7483,17 @@ def test_gather_with_prefetch_picklist_5_search(runtmp): def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): # test gather on a sig indexed with scaled=1 - inp = utils.get_test_data('short.fa') - outp = runtmp.output('out.sig') + inp = utils.get_test_data("short.fa") + outp = runtmp.output("out.sig") # prepare a signature with a scaled of 1 - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1,k=31', inp, '-o', outp) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1,k=31", inp, "-o", outp) # run with a low threshold - runtmp.sourmash('gather', outp, outp, '--threshold-bp', '0') + runtmp.sourmash("gather", outp, outp, "--threshold-bp", "0") print(runtmp.last_result.out) - print('---') + print("---") print(runtmp.last_result.err) assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out @@ -6231,25 +7502,25 @@ def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): def test_standalone_manifest_search(runtmp): # test loading/searching a manifest file from the command line. 
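    # For orientation: `sourmash sig manifest <dir> -o <csv>` writes a CSV
    # manifest of every sketch found under <dir>, and that CSV can then be
    # passed directly to `sourmash search` as a collection; as noted below,
    # the manifest currently must live inside the top-level directory it
    # describes (the next test checks the failure mode).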
- sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - dirname = runtmp.output('somedir') + dirname = runtmp.output("somedir") os.mkdir(dirname) - subdir = runtmp.output('somedir/subdir') + subdir = runtmp.output("somedir/subdir") os.mkdir(subdir) - shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) - shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + shutil.copyfile(sig47, os.path.join(dirname, "47.fa.sig")) + shutil.copyfile(sig63, os.path.join(subdir, "63.fa.sig")) # for now, the output manifest must be within top level dir for # CLI stuff to work properly. - mf = os.path.join(dirname, 'mf.csv') + mf = os.path.join(dirname, "mf.csv") # build manifest... - runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + runtmp.sourmash("sig", "manifest", dirname, "-o", mf) # ...and now use for a search! - runtmp.sourmash('search', sig47, mf) + runtmp.sourmash("search", sig47, mf) out = runtmp.last_result.out print(out) @@ -6261,95 +7532,100 @@ def test_standalone_manifest_search(runtmp): def test_standalone_manifest_search_fail(runtmp): # test loading/searching a manifest file from the command line; should # fail if manifest is not located within tld. - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - dirname = runtmp.output('somedir') + dirname = runtmp.output("somedir") os.mkdir(dirname) - subdir = runtmp.output('somedir/subdir') + subdir = runtmp.output("somedir/subdir") os.mkdir(subdir) - shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) - shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + shutil.copyfile(sig47, os.path.join(dirname, "47.fa.sig")) + shutil.copyfile(sig63, os.path.join(subdir, "63.fa.sig")) # for now, the output manifest must be within top level dir for # CLI stuff to work properly. here we intentionally break this, # for testing purposes. - mf = runtmp.output('mf.csv') + mf = runtmp.output("mf.csv") # build manifest... - runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + runtmp.sourmash("sig", "manifest", dirname, "-o", mf) # ...and now use for a search! 
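    # Since mf.csv was written outside the top-level directory, loading it
    # as a search target is expected to abort with SourmashCommandFailed: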
with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig47, mf) + runtmp.sourmash("search", sig47, mf) def test_search_ani_jaccard(runtmp): c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig4763 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig4763 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', sig47, sig4763, '-o', 'xxx.csv') + c.run_sourmash("search", sig47, sig4763, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6564798376870403 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_filename'].endswith('47.fa') - assert row['query_name'] == 'NC_009665.1 Shewanella baltica OS185, complete genome' - assert row['query_md5'] == '09a08691' - assert row['ani'] == "0.992530907924384" + assert float(row["similarity"]) == 0.6564798376870403 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_filename"].endswith("47.fa") + assert ( + row["query_name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + ) + assert row["query_md5"] == "09a08691" + assert row["ani"] == "0.992530907924384" def test_search_ani_jaccard_error_too_high(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.9288577154308617 - assert row['filename'].endswith('short2.fa.sig') - assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' - assert row['query_filename'].endswith('short.fa') - assert row['query_name'] == '' - assert row['query_md5'] == '9191284a' - #assert row['ani'] == "0.9987884602947684" - assert row['ani'] == '' + assert float(row["similarity"]) == 0.9288577154308617 + assert row["filename"].endswith("short2.fa.sig") + assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db" + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "9191284a" + # assert row['ani'] == "0.9987884602947684" + assert row["ani"] == "" - assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert ( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ in c.last_result.err + ) def test_searchabund_no_ani(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=10,abund', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=10,abund", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names = SearchResult.search_write_cols with open(csv_file) as fp: @@ -6357,158 +7633,178 @@ def test_searchabund_no_ani(runtmp): row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.8224046424612483 - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['filename'].endswith('short2.fa.sig') - assert row['query_filename'].endswith('short.fa') - assert row['query_name'] == '' - assert row['query_md5'] == 'b5cc464c' - assert row['ani'] == "" # do we want empty column to appear?? + assert float(row["similarity"]) == 0.8224046424612483 + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["filename"].endswith("short2.fa.sig") + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "b5cc464c" + assert row["ani"] == "" # do we want empty column to appear?? def test_search_ani_containment(runtmp): c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', '--containment', testdata1, testdata2, '-o', 'xxx.csv') + c.run_sourmash("search", "--containment", testdata1, testdata2, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6597808288197506 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9866751346467802" + assert float(row["similarity"]) == 0.6597808288197506 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9866751346467802" # search other direction - c.run_sourmash('search', '--containment', testdata2, testdata1, '-o', 'xxxx.csv') + c.run_sourmash("search", "--containment", testdata2, testdata1, "-o", "xxxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxxx.csv') + csv_file = c.output("xxxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('2+63.fa.sig') - assert row['md5'] == 
'832a45e85bdca6eaef5d73047e3e6321' - assert row['query_name'] == '' - assert row['query_md5'] == '491c0a81' - assert row['ani'] == "0.9868883523107224" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("2+63.fa.sig") + assert row["md5"] == "832a45e85bdca6eaef5d73047e3e6321" + assert row["query_name"] == "" + assert row["query_md5"] == "491c0a81" + assert row["ani"] == "0.9868883523107224" def test_search_ani_containment_asymmetry(runtmp): # test contained_by asymmetries, viz #2215 - query_sig = utils.get_test_data('47.fa.sig') - merged_sig = utils.get_test_data('47-63-merge.sig') + query_sig = utils.get_test_data("47.fa.sig") + merged_sig = utils.get_test_data("47-63-merge.sig") - runtmp.sourmash('search', query_sig, merged_sig, '-o', - 'query-in-merged.csv', '--containment') - runtmp.sourmash('search', merged_sig, query_sig, '-o', - 'merged-in-query.csv', '--containment') + runtmp.sourmash( + "search", query_sig, merged_sig, "-o", "query-in-merged.csv", "--containment" + ) + runtmp.sourmash( + "search", merged_sig, query_sig, "-o", "merged-in-query.csv", "--containment" + ) - with sourmash_args.FileInputCSV(runtmp.output('query-in-merged.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("query-in-merged.csv")) as r: query_in_merged = list(r)[0] - with sourmash_args.FileInputCSV(runtmp.output('merged-in-query.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("merged-in-query.csv")) as r: merged_in_query = list(r)[0] - assert query_in_merged['ani'] == '1.0' - assert merged_in_query['ani'] == '0.9865155060423993' + assert query_in_merged["ani"] == "1.0" + assert merged_in_query["ani"] == "0.9865155060423993" def test_search_ani_containment_fail(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - c.run_sourmash('search', '--containment', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash( + "search", "--containment", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv" + ) print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert round(float(row['similarity']), 3) == 0.967 - assert row['ani'] == "0.998906999319701" + assert round(float(row["similarity"]), 3) == 0.967 + assert row["ani"] == "0.998906999319701" # With PR #2268, this error message should not appear - #assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err - + # assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." 
in c.last_result.err + def test_search_ani_containment_estimate_ci(runtmp): # test ANI confidence intervals, based on (asymmetric) containment c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') - - c.run_sourmash('search', '--containment', testdata1, testdata2, '-o', 'xxx.csv', '--estimate-ani-ci') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") + + c.run_sourmash( + "search", + "--containment", + testdata1, + testdata2, + "-o", + "xxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names_ci = SearchResult.search_write_cols_ci - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6597808288197506 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9866751346467802" - assert row['ani_low'] == "0.9861576758035308" #"0.9861559138341189" - assert row['ani_high'] == "0.9871770716451368" #"0.9871787293232042" + assert float(row["similarity"]) == 0.6597808288197506 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9866751346467802" + assert row["ani_low"] == "0.9861576758035308" # "0.9861559138341189" + assert row["ani_high"] == "0.9871770716451368" # "0.9871787293232042" # search other direction - c.run_sourmash('search', '--containment', testdata2, testdata1, '-o', 'xxxx.csv', '--estimate-ani-ci') + c.run_sourmash( + "search", + "--containment", + testdata2, + testdata1, + "-o", + "xxxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxxx.csv') + csv_file = c.output("xxxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('2+63.fa.sig') - assert row['md5'] == '832a45e85bdca6eaef5d73047e3e6321' - assert row['query_name'] == '' - assert row['query_md5'] == '491c0a81' - assert row['ani'] == "0.9868883523107224" - assert row['ani_low'] == "0.986374049720872" #"0.9863757952722036" - assert row['ani_high'] == "0.9873870188726516" #"0.9873853776786775" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("2+63.fa.sig") + assert row["md5"] == "832a45e85bdca6eaef5d73047e3e6321" + assert row["query_name"] == "" + assert row["query_md5"] == "491c0a81" + assert row["ani"] == "0.9868883523107224" + assert row["ani_low"] == "0.986374049720872" # "0.9863757952722036" + assert row["ani_high"] == "0.9873870188726516" # "0.9873853776786775" def test_search_ani_max_containment(runtmp): c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', '--max-containment', testdata1, testdata2, '-o', 'xxx.csv') + c.run_sourmash("search", "--max-containment", testdata1, testdata2, "-o", "xxx.csv") 
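    # Background for the values asserted below (a sketch of the estimator,
    # not code from the test): a FracMinHash containment C converts to an
    # ANI point estimate as ANI ~= C ** (1.0 / ksize); here
    # 0.6642150646715699 ** (1 / 31) ~= 0.98689, matching the "ani" column.
    # Max-containment divides the intersection by min(|A|, |B|), so it is
    # symmetric and both query orders report the same value.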
print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names = SearchResult.search_write_cols with open(csv_file) as fp: @@ -6516,25 +7812,33 @@ def test_search_ani_max_containment(runtmp): row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9868883523107224" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9868883523107224" def test_search_ani_max_containment_estimate_ci(runtmp): # test ANI confidence intervals, based on (symmetric) max-containment c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') - - c.run_sourmash('search', '--max-containment', testdata1, testdata2, '-o', 'xxx.csv', '--estimate-ani-ci') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") + + c.run_sourmash( + "search", + "--max-containment", + testdata1, + testdata2, + "-o", + "xxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names_ci = SearchResult.search_write_cols_ci with open(csv_file) as fp: @@ -6542,32 +7846,32 @@ def test_search_ani_max_containment_estimate_ci(runtmp): row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9868883523107224" - assert row['ani_low'] == "0.986374049720872" - assert row['ani_high'] == "0.9873870188726516" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9868883523107224" + assert row["ani_low"] == "0.986374049720872" + assert row["ani_high"] == "0.9873870188726516" def test_search_jaccard_ani_downsample(runtmp): c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig4763 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig4763 = utils.get_test_data("47+63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss4763 = sourmash.load_one_signature(sig4763) print(f"SCALED: sig1: {ss47.minhash.scaled}, sig2: {ss4763.minhash.scaled}") - c.run_sourmash('search', sig47, sig4763, '-o', 'xxx.csv') + c.run_sourmash("search", sig47, sig4763, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols search_result_names_ci = SearchResult.search_write_cols_ci - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) @@ -6575,50 +7879,61 @@ def test_search_jaccard_ani_downsample(runtmp): print(row) assert 
search_result_names == list(row.keys()) assert search_result_names_ci != list(row.keys()) - assert float(row['similarity']) == 0.6564798376870403 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_filename'].endswith('47.fa') - assert row['query_name'] == 'NC_009665.1 Shewanella baltica OS185, complete genome' - assert row['query_md5'] == '09a08691' - assert row['ani'] == "0.992530907924384" + assert float(row["similarity"]) == 0.6564798376870403 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_filename"].endswith("47.fa") + assert ( + row["query_name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + ) + assert row["query_md5"] == "09a08691" + assert row["ani"] == "0.992530907924384" # downsample one and check similarity and ANI ds_sig47 = c.output("ds_sig47.sig") - c.run_sourmash('sig', "downsample", sig47, "--scaled", "2000", '-o', ds_sig47) - c.run_sourmash('search', ds_sig47, sig4763, '-o', 'xxx.csv') -# - csv_file = c.output('xxx.csv') + c.run_sourmash("sig", "downsample", sig47, "--scaled", "2000", "-o", ds_sig47) + c.run_sourmash("search", ds_sig47, sig4763, "-o", "xxx.csv") + # + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert round(float(row['similarity']), 3) == round(0.6634517766497462, 3) - assert round(float(row['ani']), 3) == 0.993 + assert round(float(row["similarity"]), 3) == round(0.6634517766497462, 3) + assert round(float(row["ani"]), 3) == 0.993 - #downsample manually and assert same ANI + # downsample manually and assert same ANI ss47_ds = signature.load_one_signature(ds_sig47) print("SCALED:", ss47_ds.minhash.scaled, ss4763.minhash.scaled) ani_info = ss47_ds.jaccard_ani(ss4763, downsample=True) print(ani_info) - assert round(ani_info.ani,3) == 0.993 + assert round(ani_info.ani, 3) == 0.993 assert (1 - round(ani_info.dist, 3)) == 0.993 def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - runtmp.sourmash('index', '-k', '31', 'zzz', testdata2) + runtmp.sourmash("index", "-k", "31", "zzz", testdata2) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', testdata1, 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + testdata1, + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") gather_result_names = GatherResult.gather_write_cols gather_result_names_ci = GatherResult.gather_write_cols_ci @@ -6628,42 +7943,63 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather): print(row) assert gather_result_names == list(row.keys()) assert gather_result_names_ci != list(row.keys()) - assert float(row['intersect_bp']) == 5238000.0 - assert float(row['unique_intersect_bp']) == 5238000.0 - assert float(row['remaining_bp']) == 0.0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 0.6642150646715699 - assert row['filename'] == 'zzz' - assert 
row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['gather_result_rank'] == '0' - assert row['query_md5'] == '38729c63' - assert row['query_bp'] == '5238000' - assert row['query_containment_ani']== '1.0' - assert round(float(row['match_containment_ani']), 3) == 0.987 - assert round(float(row['average_containment_ani']), 3) == 0.993 - assert round(float(row['max_containment_ani']),3) == 1.0 - assert row['potential_false_negative'] == 'False' + assert float(row["intersect_bp"]) == 5238000.0 + assert float(row["unique_intersect_bp"]) == 5238000.0 + assert float(row["remaining_bp"]) == 0.0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 0.6642150646715699 + assert row["filename"] == "zzz" + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["gather_result_rank"] == "0" + assert row["query_md5"] == "38729c63" + assert row["query_bp"] == "5238000" + assert row["query_containment_ani"] == "1.0" + assert round(float(row["match_containment_ani"]), 3) == 0.987 + assert round(float(row["average_containment_ani"]), 3) == 0.993 + assert round(float(row["max_containment_ani"]), 3) == 1.0 + assert row["potential_false_negative"] == "False" def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', '--estimate-ani-ci', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + "--estimate-ani-ci", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") gather_result_names = GatherResult.gather_write_cols_ci @@ -6672,29 +8008,29 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather): row = next(reader) print(row) assert gather_result_names == list(row.keys()) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' - assert row['query_containment_ani'] == '1.0' - assert 
row['query_containment_ani_low'] == '1.0' - assert row['query_containment_ani_high'] == '1.0' - assert row['match_containment_ani'] == '1.0' - assert row['match_containment_ani_low'] == '1.0' - assert row['match_containment_ani_high'] == '1.0' - assert row['average_containment_ani'] == '1.0' - assert row['max_containment_ani'] == '1.0' - assert row['potential_false_negative'] == 'False' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" + assert row["query_containment_ani"] == "1.0" + assert row["query_containment_ani_low"] == "1.0" + assert row["query_containment_ani_high"] == "1.0" + assert row["match_containment_ani"] == "1.0" + assert row["match_containment_ani_low"] == "1.0" + assert row["match_containment_ani_high"] == "1.0" + assert row["average_containment_ani"] == "1.0" + assert row["max_containment_ani"] == "1.0" + assert row["potential_false_negative"] == "False" def test_compare_containment_ani(runtmp): @@ -6704,12 +8040,21 @@ def test_compare_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6728,11 +8073,14 @@ def test_compare_containment_ani(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 3) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ in c.last_result.err + ) def test_compare_containment_ani_asymmetry(runtmp): @@ -6744,11 +8092,19 @@ def test_compare_containment_ani_asymmetry(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output of compare --containment --estimate-ani - with open(c.output('output.csv'), 'rt') as fp: + with open(c.output("output.csv")) as fp: r = iter(csv.reader(fp)) headers = next(r) @@ -6760,7 +8116,7 @@ def test_compare_containment_ani_asymmetry(runtmp): print(mat) # load in all the input signatures - idx_to_sig = dict() + idx_to_sig = {} for idx, filename in enumerate(testdata_sigs): ss = sourmash.load_one_signature(filename, ksize=31) idx_to_sig[idx] = ss @@ -6782,7 +8138,7 @@ def test_compare_containment_ani_asymmetry(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) @@ -6794,12 +8150,21 @@ def test_compare_jaccard_ani(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6818,7 +8183,7 @@ def test_compare_jaccard_ani(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) @@ -6831,8 +8196,7 @@ def test_compare_jaccard_protein_parallel_ani_bug(runtmp): sigfile = utils.get_test_data("prot/protein.zip") - c.run_sourmash('compare', '--ani', '-p', '2', '--csv', 'output.csv', - sigfile) + c.run_sourmash("compare", "--ani", "-p", "2", "--csv", "output.csv", sigfile) print(c.last_result.err) print(c.last_result.out) @@ -6846,12 +8210,22 @@ def test_compare_containment_ani_asymmetry_distance(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', '--distance-matrix', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--distance-matrix", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6870,7 +8244,7 @@ def test_compare_containment_ani_asymmetry_distance(runtmp): containment_ani = 1 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) 
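    # With --containment the compare matrix is directional: mat[i][j] derives
    # from the containment of one sketch in the other, so mat[i][j] and
    # mat[j][i] can legitimately differ -- the #2215 asymmetry this test
    # guards against.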
print(c.last_result.out) @@ -6882,12 +8256,14 @@ def test_compare_jaccard_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '-k', '31', '--estimate-ani', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", "-k", "31", "--estimate-ani", "--csv", "output.csv", *testdata_sigs + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit calculations against output of compare for i in range(len(idx_to_sig)): @@ -6906,30 +8282,43 @@ def test_compare_jaccard_ani(runtmp): jaccard_ani = 0.0 print(jaccard_ani) - assert jaccard_ani == mat_val #, (i, j) + assert jaccard_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + in c.last_result.err + ) def test_compare_jaccard_ani_jaccard_error_too_high(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - sig1 = c.output('short.fa.sig') - testdata2 = utils.get_test_data('short2.fa') - sig2 = c.output('short2.fa.sig') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig1, testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig2, testdata2) + testdata1 = utils.get_test_data("short.fa") + sig1 = c.output("short.fa.sig") + testdata2 = utils.get_test_data("short2.fa") + sig2 = c.output("short2.fa.sig") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", "-o", sig1, testdata1) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", "-o", sig2, testdata2) testdata_sigs = [sig1, sig2] - c.run_sourmash('compare', '-k', '31', '--estimate-ani', '--csv', 'output.csv', 'short.fa.sig', 'short2.fa.sig') + c.run_sourmash( + "compare", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + "short.fa.sig", + "short2.fa.sig", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6948,10 +8337,12 @@ def test_compare_jaccard_ani_jaccard_error_too_high(runtmp): jaccard_ani = 0.0 print(jaccard_ani) - assert jaccard_ani == mat_val #, (i, j) + assert jaccard_ani == mat_val # , (i, j) - - assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert ( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ in c.last_result.err + ) def test_compare_max_containment_ani(runtmp): @@ -6960,12 +8351,21 @@ def test_compare_max_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--max-containment', '-k', '31', - '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--max-containment", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6987,7 +8387,10 @@ def test_compare_max_containment_ani(runtmp): print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + in c.last_result.err + ) def test_compare_avg_containment_ani(runtmp): @@ -6997,12 +8400,21 @@ def test_compare_avg_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--avg-containment', '-k', '31', - '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--avg-containment", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit avg containment against output of compare for i in range(len(idx_to_sig)): @@ -7024,29 +8436,40 @@ def test_compare_avg_containment_ani(runtmp): print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ in c.last_result.err + ) def test_compare_ANI_require_scaled(runtmp): # check that compare with containment requires scaled sketches c = runtmp - s47 = utils.get_test_data('num/47.fa.sig') - s63 = utils.get_test_data('num/63.fa.sig') + s47 = utils.get_test_data("num/47.fa.sig") + s63 = utils.get_test_data("num/63.fa.sig") # containment and estimate ANI will give this error - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--containment', '--estimate-ani', '-k', '31', s47, s63, - fail_ok=True) - assert 'must use scaled signatures with --containment, --max-containment, and --avg-containment' in \ - c.last_result.err + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "--containment", + "--estimate-ani", + "-k", + "31", + s47, + s63, + fail_ok=True, + ) + assert ( + "must use scaled signatures with --containment, --max-containment, and --avg-containment" + in c.last_result.err + ) assert c.last_result.status != 0 # jaccard + estimate ANI will give this error - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63, - fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "--estimate-ani", "-k", "31", s47, s63, fail_ok=True) - assert 'must use scaled signatures with --estimate-ani' in \ - c.last_result.err + assert "must use scaled signatures with --estimate-ani" in c.last_result.err assert c.last_result.status != 0 diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py index ae83dc324d..7fcbe2511e 100644 --- a/tests/test_sourmash_args.py +++ b/tests/test_sourmash_args.py @@ -22,9 +22,9 @@ def test_save_signatures_api_none(): # save to sigfile - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) with sourmash_args.SaveSignaturesToLocation(None) as save_sig: @@ -37,12 +37,12 @@ def test_save_signatures_api_none(): def test_save_signatures_to_location_1_sig(runtmp): # save to sigfile.sig - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.sig') + outloc = runtmp.output("foo.sig") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -56,9 +56,9 @@ def test_save_signatures_to_location_1_sig(runtmp): def test_save_signatures_to_location_1_stdout(): # save to stdout - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) output_capture = io.StringIO() @@ -77,12 +77,12 @@ def test_save_signatures_to_location_1_stdout(): def test_save_signatures_to_location_1_sig_is_default(runtmp): # save to sigfile.txt - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.txt') + outloc = 
runtmp.output("foo.txt") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -96,12 +96,12 @@ def test_save_signatures_to_location_1_sig_is_default(runtmp): def test_save_signatures_to_location_1_sig_gz(runtmp): # save to sigfile.gz - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.sig.gz') + outloc = runtmp.output("foo.sig.gz") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -120,12 +120,12 @@ def test_save_signatures_to_location_1_sig_gz(runtmp): def test_save_signatures_to_location_1_zip(runtmp): # save to sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -143,33 +143,33 @@ def test_save_signatures_to_location_1_zip(runtmp): def test_save_signatures_to_location_1_zip_bad(runtmp): # try saving to bad sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') - ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') - ss47 = sourmash.load_one_signature(sig47, ksize=31) + sig2 = utils.get_test_data("2.fa.sig") + sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data("47.fa.sig") + sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") # create bad zip: - with open(outloc, 'wt') as fp: + with open(outloc, "w"): pass # now check for error with pytest.raises(ValueError) as exc: - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + with sourmash_args.SaveSignaturesToLocation(outloc): pass - assert 'cannot be opened as a zip file' in str(exc) + assert "cannot be opened as a zip file" in str(exc) def test_save_signatures_to_location_1_zip_dup(runtmp): # save to sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -178,11 +178,11 @@ def test_save_signatures_to_location_1_zip_dup(runtmp): # here we have to change the names so the sig content is different; # exact duplicates will not be saved, otherwise. ss2 = ss2.to_mutable() - ss2.name = 'different name for ss2' + ss2.name = "different name for ss2" save_sig.add(ss2) ss47 = ss47.to_mutable() - ss47.name = 'different name for ss47' + ss47.name = "different name for ss47" save_sig.add(ss47) # can we open as a .zip file? @@ -197,13 +197,13 @@ def test_save_signatures_to_location_1_zip_dup(runtmp): def test_save_signatures_to_location_2_zip_add(runtmp): # create sigfile.zip; then, add a new signature. 
- sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2 - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -230,13 +230,13 @@ def test_save_signatures_to_location_2_zip_add(runtmp): def test_save_signatures_to_location_2_zip_add_dup(runtmp): # create sigfile.zip; then, add a new signature, plus a ~duplicate. - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2 - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -257,8 +257,9 @@ def test_save_signatures_to_location_2_zip_add_dup(runtmp): # add ss2; here we have to change the names so the sig content is # different exact duplicates will not be saved, otherwise. import copy + ss2copy = ss2.to_mutable() - ss2copy.name = 'different name for ss2' + ss2copy.name = "different name for ss2" save_sig.add(ss2copy) # updated file should contain all three. @@ -271,15 +272,15 @@ def test_save_signatures_to_location_2_zip_add_dup(runtmp): def test_save_signatures_to_location_3_zip_add_fail(runtmp): # create sigfile.zip using zipfile, then try to add to it (& fail) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') - ss47 = sourmash.load_one_signature(sig47, ksize=31) + sig47 = utils.get_test_data("47.fa.sig") + sourmash.load_one_signature(sig47, ksize=31) # add only ss2, using zipfile API - outloc = runtmp.output('foo.zip') - with zipfile.ZipFile(outloc, 'x') as zf: - with zf.open('xyz.sig', 'w') as fp: + outloc = runtmp.output("foo.zip") + with zipfile.ZipFile(outloc, "x") as zf: + with zf.open("xyz.sig", "w") as fp: sourmash.save_signatures([ss2], fp=fp, compression=1) # verify it can be loaded, yada yada @@ -289,28 +290,29 @@ def test_save_signatures_to_location_3_zip_add_fail(runtmp): # now, try to open existing file with SaveSignaturesToLocation... with pytest.raises(ValueError) as exc: - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + with sourmash_args.SaveSignaturesToLocation(outloc): pass - assert 'Cannot add to existing zipfile' in str(exc) + assert "Cannot add to existing zipfile" in str(exc) def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): # create sigfile.zip using zipfile, then try to add to it (& fail) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2, using zipfile API; add manifest manually. 
- outloc = runtmp.output('foo.zip') - with zipfile.ZipFile(outloc, 'x') as zf: - with zf.open('xyz.sig', 'w') as fp: + outloc = runtmp.output("foo.zip") + with zipfile.ZipFile(outloc, "x") as zf: + with zf.open("xyz.sig", "w") as fp: sourmash.save_signatures([ss2], fp=fp, compression=1) # make a manifest row... - row = manifest.CollectionManifest.make_manifest_row(ss2, 'xyz.sig', - include_signature=False) + row = manifest.CollectionManifest.make_manifest_row( + ss2, "xyz.sig", include_signature=False + ) # construct & save manifest mf = manifest.CollectionManifest([row]) @@ -320,7 +322,7 @@ def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): mf.write_to_csv(manifest_fp, write_header=True) manifest_data = manifest_fp.getvalue().encode("utf-8") - with zf.open(mf_name, 'w') as fp: + with zf.open(mf_name, "w") as fp: fp.write(manifest_data) # fini! made our artisanal hand-crafted zipfile. Now... @@ -345,12 +347,12 @@ def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): def test_save_signatures_to_location_1_dirout(runtmp): # save to sigout/ (directory) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -366,12 +368,12 @@ def test_save_signatures_to_location_1_dirout(runtmp): def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): # check for 2x compressed sig files - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -380,7 +382,7 @@ def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): assert os.path.isdir(outloc) print(os.listdir(outloc)) - outloc2 = runtmp.output('sigout/09a08691ce52952152f0e866a59f6261.sig.gz') + outloc2 = runtmp.output("sigout/09a08691ce52952152f0e866a59f6261.sig.gz") with gzip.open(outloc2, "r") as fp: data = fp.read() print(data) @@ -389,12 +391,12 @@ def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): def test_save_signatures_to_location_1_dirout_duplicate(runtmp): # save to sigout/ (directory) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -411,8 +413,8 @@ def test_save_signatures_to_location_1_dirout_duplicate(runtmp): def test_load_empty_zipfile(runtmp): - outloc = runtmp.output('empty.zip') - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + outloc = runtmp.output("empty.zip") + with sourmash_args.SaveSignaturesToLocation(outloc): pass sigiter = sourmash.load_file_as_signatures(outloc) @@ -422,15 +424,14 @@ def 
test_load_empty_zipfile(runtmp): def test_load_many_sigs_empty_file(runtmp): # make sure load_many_signatures behaves properly on empty file outloc = runtmp.output("empty.sig") - with open(outloc, "wt") as fp: + with open(outloc, "w"): pass progress = sourmash_args.SignatureLoadingProgress() with contextlib.redirect_stderr(io.StringIO()) as errfp: - with pytest.raises(SystemExit) as exc: - for ss, sigloc in sourmash_args.load_many_signatures([outloc], - progress): + with pytest.raises(SystemExit): + for ss, sigloc in sourmash_args.load_many_signatures([outloc], progress): pass err = errfp.getvalue() @@ -442,15 +443,15 @@ def test_load_many_sigs_empty_file(runtmp): def test_load_many_sigs_empty_file_force(runtmp): # make sure load_many_signatures behaves properly on empty file w/force outloc = runtmp.output("empty.sig") - with open(outloc, "wt") as fp: + with open(outloc, "w"): pass progress = sourmash_args.SignatureLoadingProgress() with contextlib.redirect_stderr(io.StringIO()) as errfp: - for ss, sigloc in sourmash_args.load_many_signatures([outloc], - progress, - force=True): + for ss, sigloc in sourmash_args.load_many_signatures( + [outloc], progress, force=True + ): pass err = errfp.getvalue() @@ -461,7 +462,7 @@ def test_load_many_sigs_empty_file_force(runtmp): def test_get_manifest_1(): # basic get_manifest retrieves a manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") idx = sourmash.load_file_as_index(sig47) manifest = sourmash_args.get_manifest(idx) @@ -470,18 +471,18 @@ def test_get_manifest_1(): def test_get_manifest_2_cannot_build(): # test what happens when get_manifest cannot build manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) idx = LinearIndex([ss47]) - with pytest.raises(SystemExit) as exc: - m = sourmash_args.get_manifest(idx) + with pytest.raises(SystemExit): + sourmash_args.get_manifest(idx) def test_get_manifest_2_cannot_buildno_require(): # test what happens when get_manifest cannot build manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) idx = LinearIndex([ss47]) @@ -493,11 +494,12 @@ def test_get_manifest_2_cannot_buildno_require(): def test_get_manifest_3_build(): # check that manifest is building - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) class FakeIndex(LinearIndex): was_called = 0 + def _signatures_with_internal(self): self.was_called = 1 return [(ss47, "fakeiloc")] @@ -510,12 +512,12 @@ def _signatures_with_internal(self): print(m) assert len(m) == 1 - assert m.rows[0]['internal_location'] == "fakeiloc" + assert m.rows[0]["internal_location"] == "fakeiloc" def test_get_manifest_3_build_2(): # check that manifest is building, but only when asked - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) class FakeIndex(LinearIndex): @@ -546,7 +548,7 @@ def _signatures_with_internal(self): assert m == m3 -class FakeArgs(object): +class FakeArgs: picklist = None include_db_pattern = None exclude_db_pattern = None @@ -567,63 +569,63 @@ def test_pattern_1(): # test just --include-pattern handling args = FakeArgs() args.picklist = None - args.include_db_pattern = 'foo' + args.include_db_pattern = "foo" args.exclude_db_pattern = None pattern_search = 
sourmash_args.load_include_exclude_db_patterns(args) - assert pattern_search(['foo', 'bar', 'baz']) - assert not pattern_search(['bar', 'bif']) + assert pattern_search(["foo", "bar", "baz"]) + assert not pattern_search(["bar", "bif"]) def test_pattern_2(): # test just --exclude-pattern handling args = FakeArgs() args.picklist = None - args.exclude_db_pattern = 'foo' + args.exclude_db_pattern = "foo" args.include_db_pattern = None pattern_search = sourmash_args.load_include_exclude_db_patterns(args) - assert not pattern_search(['foo', 'bar', 'baz']) - assert pattern_search(['bar', 'baz', 'bif']) + assert not pattern_search(["foo", "bar", "baz"]) + assert pattern_search(["bar", "baz", "bif"]) def test_pattern_3(): # test with --picklist and --exclude: should fail args = FakeArgs() args.picklist = True - args.exclude_db_pattern = 'foo' + args.exclude_db_pattern = "foo" args.include_db_pattern = None with pytest.raises(SystemExit): - pattern_search = sourmash_args.load_include_exclude_db_patterns(args) + sourmash_args.load_include_exclude_db_patterns(args) def test_pattern_4(): # test with --picklist and --include: should fail args = FakeArgs() args.picklist = True - args.include_db_pattern = 'foo' + args.include_db_pattern = "foo" args.exclude_db_pattern = None with pytest.raises(SystemExit): - pattern_search = sourmash_args.load_include_exclude_db_patterns(args) + sourmash_args.load_include_exclude_db_patterns(args) def test_pattern_5(): # test with --include and --exclude: should fail args = FakeArgs() args.picklist = None - args.exclude_db_pattern = 'foo' - args.include_db_pattern = 'bar' + args.exclude_db_pattern = "foo" + args.include_db_pattern = "bar" with pytest.raises(SystemExit): - pattern_search = sourmash_args.load_include_exclude_db_patterns(args) + sourmash_args.load_include_exclude_db_patterns(args) def test_fileinput_csv_1_plain(): # test basic CSV input - testfile = utils.get_test_data('tax/test.taxonomy.csv') + testfile = utils.get_test_data("tax/test.taxonomy.csv") with sourmash_args.FileInputCSV(testfile) as r: rows = list(r) @@ -633,21 +635,21 @@ def test_fileinput_csv_1_plain(): def test_fileinput_csv_1_no_such_file(runtmp): # test fail to load file - noexistfile = runtmp.output('does-not-exist.csv') + noexistfile = runtmp.output("does-not-exist.csv") with pytest.raises(FileNotFoundError): - with sourmash_args.FileInputCSV(noexistfile) as r: + with sourmash_args.FileInputCSV(noexistfile): pass def test_fileinput_csv_2_gz(runtmp): # test basic CSV input from gz file - testfile = utils.get_test_data('tax/test.taxonomy.csv') - gzfile = runtmp.output('test.csv.gz') + testfile = utils.get_test_data("tax/test.taxonomy.csv") + gzfile = runtmp.output("test.csv.gz") - with gzip.open(gzfile, 'wt') as outfp: - with open(testfile, 'rt', newline='') as infp: + with gzip.open(gzfile, "wt") as outfp: + with open(testfile, newline="") as infp: outfp.write(infp.read()) with sourmash_args.FileInputCSV(gzfile) as r: @@ -658,42 +660,42 @@ def test_fileinput_csv_2_gz(runtmp): def test_fileinput_csv_2_gz_not_csv(runtmp): # test basic CSV input from gz file that's not CSV - works - gzfile = runtmp.output('test.csv.gz') + gzfile = runtmp.output("test.csv.gz") - with gzip.open(gzfile, 'wt') as outfp: + with gzip.open(gzfile, "wt") as outfp: outfp.write("hello world!") with sourmash_args.FileInputCSV(gzfile) as r: - assert r.fieldnames == ['hello world!'] + assert r.fieldnames == ["hello world!"] def test_fileinput_csv_2_gz_bad_version_header(runtmp): # test basic CSV input from gz file with bad 
version header # currently this works; not clear to me how it should fail :grin: - gzfile = runtmp.output('test.csv.gz') + gzfile = runtmp.output("test.csv.gz") - with gzip.open(gzfile, 'wt') as outfp: + with gzip.open(gzfile, "wt") as outfp: outfp.write("# excelsior\nhello world!") with sourmash_args.FileInputCSV(gzfile) as r: - assert r.fieldnames == ['hello world!'] + assert r.fieldnames == ["hello world!"] print(r.version_info) - assert r.version_info == ['excelsior'] + assert r.version_info == ["excelsior"] def test_fileinput_csv_2_zip(runtmp): # test CSV input from zip file, with component filename - testfile = utils.get_test_data('tax/test.taxonomy.csv') - zf_file = runtmp.output('test.zip') + testfile = utils.get_test_data("tax/test.taxonomy.csv") + zf_file = runtmp.output("test.zip") - with zipfile.ZipFile(zf_file, 'w') as outzip: - with open(testfile, 'rb') as infp: - with outzip.open('XYZ.csv', 'w') as outfp: + with zipfile.ZipFile(zf_file, "w") as outzip: + with open(testfile, "rb") as infp: + with outzip.open("XYZ.csv", "w") as outfp: outfp.write(infp.read()) - with sourmash_args.FileInputCSV(zf_file, default_csv_name='XYZ.csv') as r: + with sourmash_args.FileInputCSV(zf_file, default_csv_name="XYZ.csv") as r: rows = list(r) assert len(rows) == 6 print(rows) @@ -702,20 +704,21 @@ def test_fileinput_csv_2_zip(runtmp): def test_fileinput_csv_3_load_manifest(): # test loading a manifest from a zipfile collection, using # FileInputCSV. - testfile = utils.get_test_data('prot/all.zip') - - with sourmash_args.FileInputCSV(testfile, default_csv_name='SOURMASH-MANIFEST.csv') as r: + testfile = utils.get_test_data("prot/all.zip") + with sourmash_args.FileInputCSV( + testfile, default_csv_name="SOURMASH-MANIFEST.csv" + ) as r: rows = list(r) assert len(rows) == 8 - assert r.version_info == ['SOURMASH-MANIFEST-VERSION', '1.0'] + assert r.version_info == ["SOURMASH-MANIFEST-VERSION", "1.0"] def test_fileinput_csv_3_load_manifest_no_default(): # test loading a manifest from a zipfile collection, using # FileInputCSV, but with no default_csv_name - should fail - testfile = utils.get_test_data('prot/all.zip') + testfile = utils.get_test_data("prot/all.zip") with pytest.raises(csv.Error): with sourmash_args.FileInputCSV(testfile) as r: @@ -725,72 +728,71 @@ def test_fileinput_csv_3_load_manifest_no_default(): def test_fileinput_csv_3_load_manifest_zipfile_obj(): # test loading a manifest from an open zipfile obj, using # FileInputCSV. - testfile = utils.get_test_data('prot/all.zip') + testfile = utils.get_test_data("prot/all.zip") with zipfile.ZipFile(testfile, "r") as zf: - with sourmash_args.FileInputCSV(testfile, - default_csv_name='SOURMASH-MANIFEST.csv', - zipfile_obj=zf) as r: + with sourmash_args.FileInputCSV( + testfile, default_csv_name="SOURMASH-MANIFEST.csv", zipfile_obj=zf + ) as r: rows = list(r) assert len(rows) == 8 - assert r.version_info == ['SOURMASH-MANIFEST-VERSION', '1.0'] + assert r.version_info == ["SOURMASH-MANIFEST-VERSION", "1.0"] def test_fileinput_csv_3_load_manifest_zipfile_obj_no_defualt(): # test loading a manifest from an open zipfile obj, using # FileInputCSV, but with no default csv name => should fail. 
- testfile = utils.get_test_data('prot/all.zip') + testfile = utils.get_test_data("prot/all.zip") with zipfile.ZipFile(testfile, "r") as zf: with pytest.raises(ValueError): - with sourmash_args.FileInputCSV(testfile, - zipfile_obj=zf) as r: + with sourmash_args.FileInputCSV(testfile, zipfile_obj=zf): pass def test_fileoutput_csv_1(runtmp): # test basic behavior - outfile = runtmp.output('xxx.csv') + outfile = runtmp.output("xxx.csv") with sourmash_args.FileOutputCSV(outfile) as fp: w = csv.writer(fp) - w.writerow(['a', 'b', 'c']) - w.writerow(['x', 'y', 'z']) + w.writerow(["a", "b", "c"]) + w.writerow(["x", "y", "z"]) with open(outfile, newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1 row = rows[0] - assert row['a'] == 'x' - assert row['b'] == 'y' - assert row['c'] == 'z' + assert row["a"] == "x" + assert row["b"] == "y" + assert row["c"] == "z" def test_fileoutput_csv_1_gz(runtmp): # test basic behavior => gz - outfile = runtmp.output('xxx.csv.gz') + outfile = runtmp.output("xxx.csv.gz") with sourmash_args.FileOutputCSV(outfile) as fp: w = csv.writer(fp) - w.writerow(['a', 'b', 'c']) - w.writerow(['x', 'y', 'z']) + w.writerow(["a", "b", "c"]) + w.writerow(["x", "y", "z"]) - with gzip.open(outfile, 'rt') as fp: + with gzip.open(outfile, "rt") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1 row = rows[0] - assert row['a'] == 'x' - assert row['b'] == 'y' - assert row['c'] == 'z' + assert row["a"] == "x" + assert row["b"] == "y" + assert row["c"] == "z" def test_fileoutput_csv_2_stdout(): # test '-' and 'None' go to sys.stdout - with sourmash_args.FileOutputCSV('-') as fp: + with sourmash_args.FileOutputCSV("-") as fp: assert fp == sys.stdout with sourmash_args.FileOutputCSV(None) as fp: @@ -802,14 +804,14 @@ def test_add_ksize_arg_no_default(): p = argparse.ArgumentParser() add_ksize_arg(p) args = p.parse_args() - assert args.ksize == None + assert args.ksize is None def test_add_ksize_arg_no_default_specify(): # test behavior of cli.utils.add_ksize_arg p = argparse.ArgumentParser() add_ksize_arg(p) - args = p.parse_args(['-k', '21']) + args = p.parse_args(["-k", "21"]) assert args.ksize == 21 @@ -825,17 +827,17 @@ def test_add_ksize_arg_default_31_specify(): # test behavior of cli.utils.add_ksize_arg p = argparse.ArgumentParser() add_ksize_arg(p, default=31) - args = p.parse_args(['-k', '21']) + args = p.parse_args(["-k", "21"]) assert args.ksize == 21 def test_bug_2370(runtmp): # bug - manifest loading code does not catch gzip.BadGzipFile - sigfile = utils.get_test_data('63.fa.sig') + sigfile = utils.get_test_data("63.fa.sig") # copy sigfile over to a .gz file without compressing it - - shutil.copyfile(sigfile, runtmp.output('not_really_gzipped.gz')) + shutil.copyfile(sigfile, runtmp.output("not_really_gzipped.gz")) # try running sourmash_args.load_file_as_index - #runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz')) - sourmash_args.load_file_as_index(runtmp.output('not_really_gzipped.gz')) + # runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz')) + sourmash_args.load_file_as_index(runtmp.output("not_really_gzipped.gz")) diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py index cb3c48fc32..f6f6370785 100644 --- a/tests/test_sourmash_compute.py +++ b/tests/test_sourmash_compute.py @@ -27,155 +27,231 @@ def test_do_sourmash_compute(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - 
['compute', '-k', '31', testdata1], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["compute", "-k", "31", testdata1], in_directory=location + ) - sigfile = os.path.join(location, 'short.fa.sig') + sigfile = os.path.join(location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_compute_check_num_bounds_negative(runtmp): - c=runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = c.output('short.fa.sig') + c = runtmp + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = c.output("short.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('compute', '-k', '31', '--num-hashes', '-5', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3) - + c.run_sourmash( + "compute", + "-k", + "31", + "--num-hashes", + "-5", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + assert "ERROR: num value must be positive" in c.last_result.err def test_do_sourmash_compute_check_num_bounds_less_than_minimum(runtmp): - c=runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = c.output('short.fa.sig') - - c.run_sourmash('compute', '-k', '31', '--num-hashes', '25', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3) - + c = runtmp + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = c.output("short.fa.sig") + + c.run_sourmash( + "compute", + "-k", + "31", + "--num-hashes", + "25", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err def test_do_sourmash_compute_check_num_bounds_more_than_maximum(runtmp): - c=runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = c.output('short.fa.sig') - - c.run_sourmash('compute', '-k', '31', '--num-hashes', '100000', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3) - - assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err + c = runtmp + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = c.output("short.fa.sig") + + c.run_sourmash( + "compute", + "-k", + "31", + "--num-hashes", + "100000", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
in c.last_result.err + ) @utils.in_tempdir def test_do_sourmash_compute_outdir(c): - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', testdata1, - '--outdir', c.location]) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["compute", "-k", "31", testdata1, "--outdir", c.location] + ) - - sigfile = os.path.join(c.location, 'short.fa.sig') + sigfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_compute_output_valid_file(): - """ Trigger bug #123 """ + """Trigger bug #123""" with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = os.path.join(location, "short.fa.sig") - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', sigfile, - testdata1, - testdata2, testdata3], - in_directory=location) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "-o", sigfile, testdata1, testdata2, testdata3], + in_directory=location, + ) assert os.path.exists(sigfile) - assert not out # stdout should be empty + assert not out # stdout should be empty # is it valid json? - with open(sigfile, 'r') as f: + with open(sigfile) as f: data = json.load(f) - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) + filesigs = [sig["filename"] for sig in data] + assert all( + testdata in filesigs for testdata in (testdata1, testdata2, testdata3) + ) def test_do_sourmash_compute_output_stdout_valid(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', '-', - testdata1, - testdata2, testdata3], - in_directory=location) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "-o", "-", testdata1, testdata2, testdata3], + in_directory=location, + ) # is it valid json? 
data = json.loads(out) - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) + filesigs = [sig["filename"] for sig in data] + assert all( + testdata in filesigs for testdata in (testdata1, testdata2, testdata3) + ) @utils.in_tempdir def test_do_sourmash_compute_output_and_name_valid_file(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = c.output('short.fa.sig') - - c.run_sourmash('compute', '-k', '31', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = c.output("short.fa.sig") + + c.run_sourmash( + "compute", + "-k", + "31", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) assert os.path.exists(sigfile) - assert 'calculated 1 signature for 4 sequences taken from 3 files' in c.last_result.err + assert ( + "calculated 1 signature for 4 sequences taken from 3 files" in c.last_result.err + ) # is it valid json? - with open(sigfile, 'r') as f: + with open(sigfile) as f: data = json.load(f) assert len(data) == 1 - sigfile_merged = c.output('short.all.fa.sig') - c.run_sourmash('compute', '-k', '31', '-o', sigfile_merged, '--merge', '"name"', testdata1, testdata2, testdata3) - - with open(sigfile_merged, 'r') as f: + sigfile_merged = c.output("short.all.fa.sig") + c.run_sourmash( + "compute", + "-k", + "31", + "-o", + sigfile_merged, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + + with open(sigfile_merged) as f: data_merged = json.load(f) - assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins'] + assert data[0]["signatures"][0]["mins"] == data_merged[0]["signatures"][0]["mins"] @utils.in_tempdir def test_do_sourmash_compute_output_and_name_valid_file_outdir(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = os.path.join(c.location, "short.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compute', '-k', '31', '-o', sigfile, - '--merge', '"name"', - testdata1, testdata2, testdata3, - '--outdir', c.location) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compute", + "-k", + "31", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + "--outdir", + c.location, + ) errmsg = c.last_result.err assert "ERROR: --output-dir doesn't make sense with -o/--output" in errmsg @@ -183,103 +259,109 @@ def test_do_sourmash_compute_output_and_name_valid_file_outdir(c): def test_do_sourmash_compute_singleton(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--singleton', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--singleton", testdata1], + in_directory=location, + ) + + sigfile = os.path.join(location, "short.fa.sig") assert 
os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name.endswith('shortName') + assert sig.name.endswith("shortName") def test_do_sourmash_compute_name(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1, '-o', 'foo.sig'], - in_directory=location) - - sigfile = os.path.join(location, 'foo.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--merge", "foo", testdata1, "-o", "foo.sig"], + in_directory=location, + ) + + sigfile = os.path.join(location, "foo.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'foo' + assert sig.name == "foo" - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1, '-o', 'foo2.sig'], - in_directory=location) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--name", "foo", testdata1, "-o", "foo2.sig"], + in_directory=location, + ) - sigfile2 = os.path.join(location, 'foo2.sig') + sigfile2 = os.path.join(location, "foo2.sig") assert os.path.exists(sigfile2) sig2 = next(signature.load_signatures(sigfile)) - assert sig2.name == 'foo' + assert sig2.name == "foo" assert sig.name == sig2.name def test_do_sourmash_compute_name_fail_no_output(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--merge", "foo", testdata1], + in_directory=location, + fail_ok=True, + ) assert status == -1 def test_do_sourmash_compute_merge_fail_no_output(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--merge", "foo", testdata1], + in_directory=location, + fail_ok=True, + ) assert status == -1 - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1], - in_directory=location, - fail_ok=True) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--name", "foo", testdata1], + in_directory=location, + fail_ok=True, + ) assert status == -1 def test_do_sourmash_compute_name_from_first(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short3.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name-from-first', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short3.fa.sig') + testdata1 = utils.get_test_data("short3.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "31", "--name-from-first", testdata1], + in_directory=location, + ) + + sigfile = os.path.join(location, "short3.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'firstname' + assert sig.name == "firstname" def test_do_sourmash_compute_multik(): with utils.TempDirectory() as location: - testdata1 = 
utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["compute", "-k", "21,31", testdata1], in_directory=location + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes assert len(ksizes) == 2 @@ -287,20 +369,20 @@ def test_do_sourmash_compute_multik(): def test_do_sourmash_compute_multik_with_protein(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--protein", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 4 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 30 in ksizes assert 7 in ksizes @@ -310,22 +392,24 @@ def test_do_sourmash_compute_multik_with_protein(): def test_do_sourmash_compute_multik_with_dayhoff(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--dayhoff', '--no-dna', - testdata1], - in_directory=location) - assert 'Computing only Dayhoff-encoded protein (and not nucleotide) ' \ - 'signatures.' in err - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--dayhoff", "--no-dna", testdata1], + in_directory=location, + ) + assert ( + "Computing only Dayhoff-encoded protein (and not nucleotide) " + "signatures." 
in err + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.dayhoff for x in siglist) @@ -334,47 +418,49 @@ def test_do_sourmash_compute_multik_with_dayhoff(): def test_do_sourmash_compute_multik_with_dayhoff_and_dna(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--dayhoff', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--dayhoff", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 4 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 30 in ksizes assert 7 in ksizes assert 10 in ksizes - assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 - assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 + assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2 + assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2 assert len(ksizes) == 4 def test_do_sourmash_compute_multik_with_hp(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--hp', '--no-dna', - testdata1], - in_directory=location) - assert 'Computing only hp-encoded protein (and not nucleotide) ' \ - 'signatures.' in err - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--hp", "--no-dna", testdata1], + in_directory=location, + ) + assert ( + "Computing only hp-encoded protein (and not nucleotide) " + "signatures." 
in err + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.hp for x in siglist) @@ -383,20 +469,20 @@ def test_do_sourmash_compute_multik_with_hp(): def test_do_sourmash_compute_multik_with_hp_and_dna(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--hp', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--hp", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 4 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert 21 in ksizes @@ -406,99 +492,98 @@ def test_do_sourmash_compute_multik_with_hp_and_dna(): def test_do_sourmash_compute_multik_with_dayhoff_dna_protein(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--dayhoff', '--protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--dayhoff", "--protein", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 6 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 30 in ksizes assert 7 in ksizes assert 10 in ksizes - assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 - assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 - assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2 + assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2 + assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2 + assert sum(x.minhash.moltype == "protein" for x in siglist) == 2 assert len(ksizes) == 4 def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--dayhoff', '--hp', '--protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with 
open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 8 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert 21 in ksizes assert 30 in ksizes - assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 - assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 - assert sum(x.minhash.moltype == 'hp' for x in siglist) == 2 + assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2 + assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2 + assert sum(x.minhash.moltype == "hp" for x in siglist) == 2 # 2 = dayhoff, 2 = hp = 4 protein - assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2 + assert sum(x.minhash.moltype == "protein" for x in siglist) == 2 assert len(ksizes) == 4 def test_do_sourmash_compute_multik_with_nothing(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--no-protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--no-protein", "--no-dna", testdata1], + in_directory=location, + fail_ok=True, + ) + outfile = os.path.join(location, "short.fa.sig") assert not os.path.exists(outfile) def test_do_sourmash_compute_multik_protein_bad_ksize(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "20,32", "--protein", "--no-dna", testdata1], + in_directory=location, + fail_ok=True, + ) + outfile = os.path.join(location, "short.fa.sig") assert not os.path.exists(outfile) - assert 'protein ksizes must be divisible by 3' in err + assert "protein ksizes must be divisible by 3" in err @utils.in_tempdir def test_do_sourmash_compute_multik_only_protein(c): # check sourmash compute with only protein, no nucl - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-dna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("compute", "-k", "21,30", "--protein", "--no-dna", testdata1) + outfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert len(ksizes) == 2 @@ -506,34 +591,40 @@ def test_do_sourmash_compute_multik_only_protein(c): def test_do_sourmash_compute_multik_protein_input_bad_ksize(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short-protein.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - '--input-is-protein', - testdata1], - in_directory=location, - fail_ok=True) - outfile = 
os.path.join(location, 'short-protein.fa.sig') + testdata1 = utils.get_test_data("short-protein.fa") + status, out, err = utils.runscript( + "sourmash", + [ + "compute", + "-k", + "20,32", + "--protein", + "--no-dna", + "--input-is-protein", + testdata1, + ], + in_directory=location, + fail_ok=True, + ) + os.path.join(location, "short-protein.fa.sig") assert status != 0 - assert 'protein ksizes must be divisible by 3' in err + assert "protein ksizes must be divisible by 3" in err @utils.in_tempdir def test_do_sourmash_compute_multik_only_protein_no_rna(c): # test --no-rna as well (otherwise identical to previous test) - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-rna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') + c.run_sourmash("compute", "-k", "21,30", "--protein", "--no-rna", testdata1) + outfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert len(ksizes) == 2 @@ -542,20 +633,20 @@ def test_do_sourmash_compute_multik_only_protein_no_rna(c): def test_do_sourmash_compute_protein_bad_sequences(): """Proper error handling when Ns in dna sequence""" with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.bad.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', '--no-dna', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.bad.fa.sig') + testdata1 = utils.get_test_data("short.bad.fa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--protein", "--no-dna", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "short.bad.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert len(ksizes) == 2 @@ -563,178 +654,176 @@ def test_do_sourmash_compute_protein_bad_sequences(): def test_do_sourmash_compute_multik_input_is_protein(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--input-is-protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'ecoli.faa.sig') + testdata1 = utils.get_test_data("ecoli.faa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,30", "--input-is-protein", testdata1], + in_directory=location, + ) + outfile = os.path.join(location, "ecoli.faa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert len(ksizes) == 2 - moltype = set([ x.minhash.moltype == 'protein' - for x in siglist ]) + moltype = set([x.minhash.moltype == 
"protein" for x in siglist]) assert len(moltype) == 1 assert True in moltype def test_do_sourmash_compute_multik_outfile(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1, '-o', outfile], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", testdata1, "-o", outfile], + in_directory=location, + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sourmash_compute_with_scaled_1(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1', - testdata1, '-o', outfile], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "1", testdata1, "-o", outfile], + in_directory=location, + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - scaled_vals = [ x.minhash.scaled for x in siglist ] + scaled_vals = [x.minhash.scaled for x in siglist] assert len(scaled_vals) == 2 - assert set(scaled_vals) == { 1 } + assert set(scaled_vals) == {1} def test_do_sourmash_compute_with_scaled_2(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '2', - testdata1, '-o', outfile], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "2", testdata1, "-o", outfile], + in_directory=location, + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /2.) ]) + assert set(max_hashes) == set([int(2**64 / 2.0)]) def test_do_sourmash_compute_with_scaled(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '100', - testdata1, '-o', outfile], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "100", testdata1, "-o", outfile], + in_directory=location, + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /100.) 
]) + assert set(max_hashes) == set([int(2**64 / 100.0)]) def test_do_sourmash_compute_with_bad_scaled(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '-1', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "-1", testdata1, "-o", outfile], + in_directory=location, + fail_ok=True, + ) assert status != 0 - assert '--scaled value must be >= 1' in err + assert "--scaled value must be >= 1" in err - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1000.5', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "1000.5", testdata1, "-o", outfile], + in_directory=location, + fail_ok=True, + ) assert status != 0 - assert '--scaled value must be integer value' in err + assert "--scaled value must be integer value" in err - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1e9', - testdata1, '-o', outfile], - in_directory=location) + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--scaled", "1e9", testdata1, "-o", outfile], + in_directory=location, + ) assert status == 0 - assert 'WARNING: scaled value is nonsensical!?' in err + assert "WARNING: scaled value is nonsensical!?" in err def test_do_sourmash_compute_with_seed(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--seed', '43', - testdata1, '-o', outfile], - in_directory=location) + testdata1 = utils.get_test_data("short.fa") + outfile = os.path.join(location, "FOO.xxx") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21,31", "--seed", "43", testdata1, "-o", outfile], + in_directory=location, + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - seeds = [ x.minhash.seed for x in siglist ] + seeds = [x.minhash.seed for x in siglist] assert len(seeds) == 2 - assert set(seeds) == set([ 43 ]) + assert set(seeds) == set([43]) def test_do_sourmash_check_protein_comparisons(): # this test checks 2 x 2 protein comparisons with E. coli genes. 
with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') + testdata1 = utils.get_test_data("ecoli.faa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21", "--input-is-protein", "--singleton", testdata1], + in_directory=location, + ) + sig1 = os.path.join(location, "ecoli.faa.sig") assert os.path.exists(sig1) - testdata2 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--protein', '--no-dna', - '--singleton', - testdata2], - in_directory=location) - sig2 = os.path.join(location, 'ecoli.genes.fna.sig') + testdata2 = utils.get_test_data("ecoli.genes.fna") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21", "--protein", "--no-dna", "--singleton", testdata2], + in_directory=location, + ) + sig2 = os.path.join(location, "ecoli.genes.fna.sig") assert os.path.exists(sig2) # I'm not sure why load_signatures is randomizing order, but ok. @@ -745,13 +834,13 @@ def test_do_sourmash_check_protein_comparisons(): sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) name1 = sig1_aa.name.split()[0] - assert name1 == 'NP_414543.1' + assert name1 == "NP_414543.1" name2 = sig2_aa.name.split()[0] - assert name2 == 'NP_414544.1' + assert name2 == "NP_414544.1" name3 = sig1_trans.name.split()[0] - assert name3 == 'gi|556503834:2801-3733' + assert name3 == "gi|556503834:2801-3733" name4 = sig2_trans.name.split()[0] - assert name4 == 'gi|556503834:337-2799' + assert name4 == "gi|556503834:337-2799" print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3)) print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3)) @@ -768,11 +857,9 @@ def test_do_sourmash_check_protein_comparisons(): def test_do_sourmash_check_knowngood_dna_comparisons(c): # this test checks against a known good signature calculated # by utils/compute-dna-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', - '--singleton', '--dna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("compute", "-k", "21", "--singleton", "--dna", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) @@ -780,7 +867,7 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): print(sig1.name) print(sig2.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -789,16 +876,15 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): @utils.in_tempdir def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): # check the --rna flag; otherwise identical to previous test. 
- testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', '--singleton', '--rna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("compute", "-k", "21", "--singleton", "--rna", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1, sig2 = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -808,20 +894,19 @@ def test_do_sourmash_check_knowngood_input_protein_comparisons(): # this test checks against a known good signature calculated # by utils/compute-input-prot-another-way.py with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') + testdata1 = utils.get_test_data("ecoli.faa") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21", "--input-is-protein", "--singleton", testdata1], + in_directory=location, + ) + sig1 = os.path.join(location, "ecoli.faa.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.input_prot.sig') + knowngood = utils.get_test_data("benchmark.input_prot.sig") good_aa = list(signature.load_signatures(knowngood))[0] assert sig1_aa.similarity(good_aa) == 1.0 @@ -831,29 +916,36 @@ def test_do_sourmash_check_knowngood_protein_comparisons(): # this test checks against a known good signature calculated # by utils/compute-prot-mh-another-way.py with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--singleton', '--protein', - '--no-dna', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + status, out, err = utils.runscript( + "sourmash", + ["compute", "-k", "21", "--singleton", "--protein", "--no-dna", testdata1], + in_directory=location, + ) + sig1 = os.path.join(location, "ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.prot.sig') + knowngood = utils.get_test_data("benchmark.prot.sig") good_trans = list(signature.load_signatures(knowngood))[0] assert sig2_trans.similarity(good_trans) == 1.0 def test_compute_parameters(): - args_list = ["compute", "-k", "21,31", "--singleton", "--protein", "--no-dna", "input_file"] - - parser = SourmashParser(prog='sourmash') + args_list = [ + "compute", + "-k", + "21,31", + "--singleton", + "--protein", + "--no-dna", + "input_file", + ] + + parser = SourmashParser(prog="sourmash") subp = parser.add_subparsers(title="instruction", dest="cmd", metavar="cmd") subparser(subp) diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 15925cb344..5c06ace5f2 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -9,7 +9,6 @@ import json import csv import pytest -import screed import sourmash_tst_utils as utils import 
sourmash @@ -31,55 +30,67 @@ def test_do_sourmash_sketch_check_scaled_bounds_negative(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=-5', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "scaled=-5", testdata1) assert "ERROR: scaled value must be positive" in runtmp.last_result.err def test_do_sourmash_sketch_check_scaled_bounds_less_than_minimum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=50', testdata1) - assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "scaled=50", testdata1) + assert ( + "WARNING: scaled value should be >= 100. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_scaled_bounds_more_than_maximum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=1000000000', testdata1) - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "scaled=1000000000", testdata1) + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_num_bounds_negative(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', '-p', 'num=-5', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "num=-5", testdata1) assert "ERROR: num value must be positive" in runtmp.last_result.err def test_do_sourmash_sketch_check_num_bounds_less_than_minimum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'num=25', testdata1) - assert "WARNING: num value should be >= 50. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "num=25", testdata1) + assert ( + "WARNING: num value should be >= 50. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_num_bounds_more_than_maximum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'num=100000', testdata1) - assert "WARNING: num value should be <= 50000. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "num=100000", testdata1) + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
+ in runtmp.last_result.err + ) def test_empty_factory(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory([], None) + _signatures_for_sketch_factory([], None) def test_no_default_moltype_factory_nonempty(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(["k=31"], None) + _signatures_for_sketch_factory(["k=31"], None) def test_factory_no_default_moltype_dna(): - factory = _signatures_for_sketch_factory(['dna'], None) + factory = _signatures_for_sketch_factory(["dna"], None) params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -88,7 +99,7 @@ def test_factory_no_default_moltype_dna(): def test_factory_no_default_moltype_protein(): - factory = _signatures_for_sketch_factory(['protein'], None) + factory = _signatures_for_sketch_factory(["protein"], None) params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -97,16 +108,16 @@ def test_factory_no_default_moltype_protein(): def test_factory_dna_nosplit(): - factory = _signatures_for_sketch_factory(['k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=31,k=51"], "dna") params_list = list(factory.get_compute_params(split_ksizes=False)) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [31,51] + assert params.ksizes == [31, 51] def test_factory_dna_split(): - factory = _signatures_for_sketch_factory(['k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=31,k=51"], "dna") params_list = list(factory.get_compute_params(split_ksizes=True)) assert len(params_list) == 2 @@ -117,7 +128,7 @@ def test_factory_dna_split(): def test_factory_protein_nosplit(): - factory = _signatures_for_sketch_factory(['k=10,k=9'], 'protein') + factory = _signatures_for_sketch_factory(["k=10,k=9"], "protein") params_list = list(factory.get_compute_params(split_ksizes=False)) assert len(params_list) == 1 @@ -126,7 +137,7 @@ def test_factory_protein_nosplit(): def test_factory_protein_split(): - factory = _signatures_for_sketch_factory(['k=10,k=9'], 'protein') + factory = _signatures_for_sketch_factory(["k=10,k=9"], "protein") params_list = list(factory.get_compute_params(split_ksizes=True)) assert len(params_list) == 2 @@ -137,12 +148,12 @@ def test_factory_protein_split(): def test_factory_dna_equal(): - factory1 = _signatures_for_sketch_factory(['dna'], None) + factory1 = _signatures_for_sketch_factory(["dna"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory([], 'dna') + factory2 = _signatures_for_sketch_factory([], "dna") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -152,12 +163,12 @@ def test_factory_dna_equal(): def test_factory_protein_equal(): - factory1 = _signatures_for_sketch_factory(['protein'], None) + factory1 = _signatures_for_sketch_factory(["protein"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory([], 'protein') + factory2 = _signatures_for_sketch_factory([], "protein") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -167,12 +178,12 @@ def test_factory_protein_equal(): def test_factory_dna_multi_ksize_eq(): - factory1 = _signatures_for_sketch_factory(['k=21,k=31,dna'], None) + factory1 = _signatures_for_sketch_factory(["k=21,k=31,dna"], None) params_list1 = 
list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory(['k=21,k=31'], 'dna') + factory2 = _signatures_for_sketch_factory(["k=21,k=31"], "dna") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -182,12 +193,12 @@ def test_factory_dna_multi_ksize_eq(): def test_factory_protein_multi_ksize_eq(): - factory1 = _signatures_for_sketch_factory(['k=10,k=11,protein'], None) + factory1 = _signatures_for_sketch_factory(["k=10,k=11,protein"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory(['k=10,k=11'], 'protein') + factory2 = _signatures_for_sketch_factory(["k=10,k=11"], "protein") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -197,7 +208,7 @@ def test_factory_protein_multi_ksize_eq(): def test_dna_defaults(): - factory = _signatures_for_sketch_factory([], 'dna') + factory = _signatures_for_sketch_factory([], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -219,13 +230,13 @@ def test_dna_defaults(): def test_dna_multiple_ksize(): - factory = _signatures_for_sketch_factory(['k=21,k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=21,k=31,k=51"], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [21,31,51] + assert params.ksizes == [21, 31, 51] assert params.num_hashes == 0 assert params.scaled == 1000 assert not params.track_abundance @@ -246,8 +257,7 @@ def test_dna_multiple_ksize(): def test_dna_override_1(): - factory = _signatures_for_sketch_factory(['k=21,scaled=2000,abund'], - 'dna') + factory = _signatures_for_sketch_factory(["k=21,scaled=2000,abund"], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -266,48 +276,47 @@ def test_dna_override_1(): def test_scaled_param_requires_equal(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,scaled'], 'dna') + _signatures_for_sketch_factory(["k=21,scaled"], "dna") def test_k_param_requires_equal(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k'], 'dna') + _signatures_for_sketch_factory(["k"], "dna") def test_k_param_requires_equal_2(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['k='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["k="], "dna") def test_seed_param_requires_equal(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['seed='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["seed="], "dna") def test_num_param_requires_equal(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['num='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["num="], "dna") def test_dna_override_bad_1(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,scaledFOO=2000,abund'], - 'dna') + _signatures_for_sketch_factory(["k=21,scaledFOO=2000,abund"], "dna") def test_dna_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,protein'], 'dna') + _signatures_for_sketch_factory(["k=21,protein"], "dna") def test_protein_defaults(): - factory = 
_signatures_for_sketch_factory([], 'protein') + factory = _signatures_for_sketch_factory([], "protein") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [30] # x3 for now + assert params.ksizes == [30] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -320,14 +329,15 @@ def test_protein_defaults(): def test_protein_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'protein') + _signatures_for_sketch_factory(["k=21,dna"], "protein") + def test_protein_override_bad_rust_foo(): # mimic 'sourmash sketch protein -p dna' - factory = _signatures_for_sketch_factory([], 'protein') + factory = _signatures_for_sketch_factory([], "protein") # reach in and avoid error checking to construct a bad params_list. - factory.params_list = [('dna', {})] + factory.params_list = [("dna", {})] # now, get sigs... siglist = factory() @@ -335,7 +345,7 @@ def test_protein_override_bad_rust_foo(): sig = siglist[0] # try adding something - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: record = next(iter(f)) @@ -346,13 +356,13 @@ def test_protein_override_bad_rust_foo(): def test_dayhoff_defaults(): - factory = _signatures_for_sketch_factory([], 'dayhoff') + factory = _signatures_for_sketch_factory([], "dayhoff") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [48] # x3 for now + assert params.ksizes == [48] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -365,17 +375,17 @@ def test_dayhoff_defaults(): def test_dayhoff_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'dayhoff') + _signatures_for_sketch_factory(["k=21,dna"], "dayhoff") def test_hp_defaults(): - factory = _signatures_for_sketch_factory([], 'hp') + factory = _signatures_for_sketch_factory([], "hp") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [126] # x3 for now + assert params.ksizes == [126] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -388,21 +398,23 @@ def test_hp_defaults(): def test_hp_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'hp') + _signatures_for_sketch_factory(["k=21,dna"], "hp") def test_multiple_moltypes(): - params_foo = ['k=20,num=500,protein', - 'k=19,num=400,dayhoff,abund', - 'k=30,scaled=200,hp', - 'k=30,scaled=200,seed=58'] - factory = _signatures_for_sketch_factory(params_foo, 'protein') + params_foo = [ + "k=20,num=500,protein", + "k=19,num=400,dayhoff,abund", + "k=30,scaled=200,hp", + "k=30,scaled=200,seed=58", + ] + factory = _signatures_for_sketch_factory(params_foo, "protein") params_list = list(factory.get_compute_params()) assert len(params_list) == 4 params = params_list[0] - assert params.ksizes == [60] # x3, for now. + assert params.ksizes == [60] # x3, for now. assert params.num_hashes == 500 assert params.scaled == 0 assert not params.track_abundance @@ -413,7 +425,7 @@ def test_multiple_moltypes(): assert params.protein params = params_list[1] - assert params.ksizes == [57] # x3, for now. + assert params.ksizes == [57] # x3, for now. 
assert params.num_hashes == 400 assert params.scaled == 0 assert params.track_abundance @@ -424,7 +436,7 @@ def test_multiple_moltypes(): assert not params.protein params = params_list[2] - assert params.ksizes == [90] # x3, for now. + assert params.ksizes == [90] # x3, for now. assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -435,7 +447,7 @@ def test_multiple_moltypes(): assert not params.protein params = params_list[3] - assert params.ksizes == [90] # x3, for now. + assert params.ksizes == [90] # x3, for now. assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -446,16 +458,19 @@ def test_multiple_moltypes(): assert params.protein -@pytest.mark.parametrize("input_param_str, expected_output", - [('protein', 'protein,k=10,scaled=200'), - ('dna', 'dna,k=31,scaled=1000'), - ('hp', 'hp,k=42,scaled=200'), - ('dayhoff', 'dayhoff,k=16,scaled=200'), - ('dna,seed=52', 'dna,k=31,scaled=1000,seed=52'), - ('dna,num=500', 'dna,k=31,num=500'), - ('scaled=1100,dna', 'dna,k=31,scaled=1100'), - ('dna,abund', 'dna,k=31,scaled=1000,abund') - ]) +@pytest.mark.parametrize( + "input_param_str, expected_output", + [ + ("protein", "protein,k=10,scaled=200"), + ("dna", "dna,k=31,scaled=1000"), + ("hp", "hp,k=42,scaled=200"), + ("dayhoff", "dayhoff,k=16,scaled=200"), + ("dna,seed=52", "dna,k=31,scaled=1000,seed=52"), + ("dna,num=500", "dna,k=31,num=500"), + ("scaled=1100,dna", "dna,k=31,scaled=1100"), + ("dna,abund", "dna,k=31,scaled=1000,abund"), + ], +) def test_compute_parameters_to_param_str(input_param_str, expected_output): factory = _signatures_for_sketch_factory([input_param_str], None) params_list = list(factory.get_compute_params()) @@ -464,22 +479,18 @@ def test_compute_parameters_to_param_str(input_param_str, expected_output): actual_output_str = params.to_param_str() - assert actual_output_str == expected_output, (actual_output_str, - expected_output) + assert actual_output_str == expected_output, (actual_output_str, expected_output) def test_manifest_row_to_compute_parameters_1(): # test ComputeParameters.from_manifest_row with moltype 'DNA' - row = dict(moltype='DNA', - ksize=21, - num=0, scaled=1000, - with_abundance=1) + row = dict(moltype="DNA", ksize=21, num=0, scaled=1000, with_abundance=1) p = ComputeParameters.from_manifest_row(row) assert p.dna assert not p.protein assert not p.dayhoff assert not p.hp - assert p.moltype == 'DNA' + assert p.moltype == "DNA" assert p.num_hashes == 0 assert p.scaled == 1000 assert p.ksizes == [21] @@ -489,14 +500,11 @@ def test_manifest_row_to_compute_parameters_1(): def test_manifest_row_to_compute_parameters_2(): # test ComputeParameters.from_manifest_row with moltype 'protein' - row = dict(moltype='protein', - ksize=10, - num=0, scaled=200, - with_abundance=1) + row = dict(moltype="protein", ksize=10, num=0, scaled=200, with_abundance=1) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert p.protein - assert p.moltype == 'protein' + assert p.moltype == "protein" assert not p.dayhoff assert not p.hp assert p.num_hashes == 0 @@ -508,15 +516,12 @@ def test_manifest_row_to_compute_parameters_2(): def test_manifest_row_to_compute_parameters_3(): # test ComputeParameters.from_manifest_row with moltype 'dayhoff' - row = dict(moltype='dayhoff', - ksize=12, - num=0, scaled=200, - with_abundance=0) + row = dict(moltype="dayhoff", ksize=12, num=0, scaled=200, with_abundance=0) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert not p.protein assert 
p.dayhoff - assert p.moltype == 'dayhoff' + assert p.moltype == "dayhoff" assert not p.hp assert p.num_hashes == 0 assert p.scaled == 200 @@ -527,16 +532,13 @@ def test_manifest_row_to_compute_parameters_3(): def test_manifest_row_to_compute_parameters_4(): # test ComputeParameters.from_manifest_row with moltype 'hp' - row = dict(moltype='hp', - ksize=32, - num=0, scaled=200, - with_abundance=0) + row = dict(moltype="hp", ksize=32, num=0, scaled=200, with_abundance=0) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert not p.protein assert not p.dayhoff assert p.hp - assert p.moltype == 'hp' + assert p.moltype == "hp" assert p.num_hashes == 0 assert p.scaled == 200 assert p.ksizes == [96] @@ -545,8 +547,17 @@ def test_manifest_row_to_compute_parameters_4(): def test_bad_compute_parameters(): - p = ComputeParameters(ksizes=[31], seed=42, dna=0, protein=0, dayhoff=0, - hp=0, num_hashes=0, track_abundance=True, scaled=1000) + p = ComputeParameters( + ksizes=[31], + seed=42, + dna=0, + protein=0, + dayhoff=0, + hp=0, + num_hashes=0, + track_abundance=True, + scaled=1000, + ) with pytest.raises(AssertionError): p.moltype @@ -557,434 +568,484 @@ def test_bad_compute_parameters(): @utils.in_thisdir def test_do_sourmash_sketchdna_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'dna') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "dna") + assert ( + "error: no input filenames provided! nothing to do - exiting." + in c.last_result.err + ) @utils.in_thisdir def test_do_sourmash_sketchprotein_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'protein') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "protein") + assert ( + "error: no input filenames provided! nothing to do - exiting." + in c.last_result.err + ) @utils.in_thisdir def test_do_sourmash_sketchtranslate_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'translate') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "translate") + assert ( + "error: no input filenames provided! nothing to do - exiting." 
+ in c.last_result.err + ) def test_do_sourmash_sketchdna(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_check_sequence_succeed(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence') + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence") - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_check_sequence_fail(runtmp): - testdata1 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("shewanella.faa") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence") err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + assert ( + "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + ) def test_do_sourmash_sketchdna_check_sequence_fail_singleton(runtmp): - testdata1 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("shewanella.faa") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence', - '--singleton') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence", "--singleton") err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + assert ( + "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + ) def test_do_sourmash_sketchdna_from_file(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'dna', '--from-file', file_list) + runtmp.sourmash("sketch", "dna", "--from-file", file_list) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") @utils.in_tempdir def test_do_sourmash_sketchdna_noinput(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig")] c.run_sourmash(*cmd, stdin_data=data) print(c.last_result.out) print(c.last_result.err) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def 
test_do_sourmash_sketchdna_noinput_singleton(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--singleton'] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig"), "--singleton"] c.run_sourmash(*cmd, stdin_data=data) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def test_do_sourmash_sketchdna_noinput_merge(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--merge', 'name'] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig"), "--merge", "name"] c.run_sourmash(*cmd, stdin_data=data) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def test_do_sourmash_sketchdna_outdir(c): - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['sketch', 'dna', testdata1, - '--outdir', c.location]) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["sketch", "dna", testdata1, "--outdir", c.location] + ) - sigfile = os.path.join(c.location, 'short.fa.sig') + sigfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") @utils.in_tempdir def test_do_sourmash_sketchdna_output_dir(c): # test via --output-dir not --outdir - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['sketch', 'dna', testdata1, - '--output-dir', c.location]) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["sketch", "dna", testdata1, "--output-dir", c.location] + ) - sigfile = os.path.join(c.location, 'short.fa.sig') + sigfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_output_valid_file(runtmp): - """ Trigger bug #123 """ - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = runtmp.output('short.fa.sig') + """Trigger bug #123""" + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = runtmp.output("short.fa.sig") - runtmp.sourmash('sketch', 'dna', '-o', sigfile, testdata1, testdata2, testdata3) + runtmp.sourmash("sketch", "dna", "-o", sigfile, testdata1, testdata2, testdata3) assert os.path.exists(sigfile) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty # is it valid json? 
-    with open(sigfile, 'r') as f:
+    with open(sigfile) as f:
         data = json.load(f)
 
-    filesigs = [sig['filename'] for sig in data]
-    assert all(testdata in filesigs
-               for testdata in (testdata1, testdata2, testdata3))
+    filesigs = [sig["filename"] for sig in data]
+    assert all(testdata in filesigs for testdata in (testdata1, testdata2, testdata3))
 
 
 def test_do_sourmash_sketchdna_output_zipfile(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
 
-    outfile = runtmp.output('shorts.zip')
+    outfile = runtmp.output("shorts.zip")
 
-    runtmp.sourmash('sketch', 'dna', '-o', outfile, testdata1, testdata2, testdata3)
+    runtmp.sourmash("sketch", "dna", "-o", outfile, testdata1, testdata2, testdata3)
 
     assert os.path.exists(outfile)
-    assert not runtmp.last_result.out # stdout should be empty
+    assert not runtmp.last_result.out  # stdout should be empty
 
     sigs = list(sourmash.load_file_as_signatures(outfile))
     assert len(sigs) == 3
 
 
 def test_do_sourmash_sketchdna_output_stdout_valid(runtmp):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
 
-    runtmp.sourmash('sketch', 'dna', '-o', '-', testdata1, testdata2, testdata3)
+    runtmp.sourmash("sketch", "dna", "-o", "-", testdata1, testdata2, testdata3)
 
     # is it valid json?
     data = json.loads(runtmp.last_result.out)
-    filesigs = [sig['filename'] for sig in data]
-    assert all(testdata in filesigs
-               for testdata in (testdata1, testdata2, testdata3))
+    filesigs = [sig["filename"] for sig in data]
+    assert all(testdata in filesigs for testdata in (testdata1, testdata2, testdata3))
 
 
 @utils.in_tempdir
 def test_do_sourmash_sketchdna_output_and_name_valid_file(c):
     # test --merge of multiple input files
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = c.output('short.fa.sig')
-
-    c.run_sourmash('sketch', 'dna', '-p', 'num=500', '-o', sigfile, '--merge',
-                   '"name"', testdata1, testdata2, testdata3)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = c.output("short.fa.sig")
+
+    c.run_sourmash(
+        "sketch",
+        "dna",
+        "-p",
+        "num=500",
+        "-o",
+        sigfile,
+        "--merge",
+        '"name"',
+        testdata1,
+        testdata2,
+        testdata3,
+    )
 
     assert os.path.exists(sigfile)
-    assert 'calculated 1 signature for 4 sequences taken from 3 files' in c.last_result.err
+    assert (
+        "calculated 1 signature for 4 sequences taken from 3 files" in c.last_result.err
+    )
 
     # is it valid json?
- with open(sigfile, 'r') as f: + with open(sigfile) as f: data = json.load(f) assert len(data) == 1 - sigfile_merged = c.output('short.all.fa.sig') - c.run_sourmash('sketch', 'dna', '-p', 'num=500', '-o', sigfile_merged, - '--merge', '"name"', testdata1, testdata2, testdata3) - - with open(sigfile_merged, 'r') as f: + sigfile_merged = c.output("short.all.fa.sig") + c.run_sourmash( + "sketch", + "dna", + "-p", + "num=500", + "-o", + sigfile_merged, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + + with open(sigfile_merged) as f: data_merged = json.load(f) - assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins'] + assert data[0]["signatures"][0]["mins"] == data_merged[0]["signatures"][0]["mins"] @utils.in_tempdir def test_do_sourmash_sketchdna_output_and_name_valid_file_outdir(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = os.path.join(c.location, "short.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sketch', 'dna', '-o', sigfile, - '--merge', '"name"', - testdata1, testdata2, testdata3, - '--outdir', c.location) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "sketch", + "dna", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + "--outdir", + c.location, + ) errmsg = c.last_result.err assert "ERROR: --output-dir doesn't make sense with -o/--output" in errmsg def test_do_sourmash_sketchdna_singleton(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '--singleton', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "--singleton", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('shortName') + assert str(sig).endswith("shortName") def test_do_sourmash_sketchdna_name(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1, '-o', 'foo.sig') + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1, "-o", "foo.sig") - sigfile = runtmp.output('foo.sig') + sigfile = runtmp.output("foo.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'foo' - - runtmp.sourmash('sketch', 'dna', '--name', 'foo', testdata1, '-o', 'foo2.sig') + assert sig.name == "foo" - sigfile2 = runtmp.output('foo2.sig') + runtmp.sourmash("sketch", "dna", "--name", "foo", testdata1, "-o", "foo2.sig") + + sigfile2 = runtmp.output("foo2.sig") assert os.path.exists(sigfile2) sig2 = next(signature.load_signatures(sigfile)) - assert sig2.name == 'foo' + assert sig2.name == "foo" assert sig.name == sig2.name def test_do_sourmash_sketchdna_name_fail_no_output(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1) assert runtmp.last_result.status == -1 def test_do_sourmash_sketchdna_fail_no_output(runtmp): - 
testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1) assert runtmp.last_result.status == -1 with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--name', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--name", "foo", testdata1) assert runtmp.last_result.status == -1 def test_do_sourmash_sketchdna_name_from_first(runtmp): - testdata1 = utils.get_test_data('short3.fa') - runtmp.sourmash('sketch', 'dna', '--name-from-first', testdata1) + testdata1 = utils.get_test_data("short3.fa") + runtmp.sourmash("sketch", "dna", "--name-from-first", testdata1) - sigfile = runtmp.output('short3.fa.sig') + sigfile = runtmp.output("short3.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'firstname' + assert sig.name == "firstname" def test_do_sourmash_sketchdna_multik(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,k=21', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "-p", "k=31,k=21", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sourmash_sketchdna_multik_output(runtmp, sig_save_extension): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output(f'out.{sig_save_extension}') - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,k=21', testdata1, - '-o', outfile) + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output(f"out.{sig_save_extension}") + runtmp.sourmash("sketch", "dna", "-p", "k=31,k=21", testdata1, "-o", outfile) print("saved to file/path with extension:", outfile) assert os.path.exists(outfile) siglist = list(sourmash.load_file_as_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sketch_dna_override_protein_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=7,num=500,protein', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=7,num=500,protein", testdata1) assert runtmp.last_result.status != 0 - assert 'Error creating signatures: Incompatible sketch type' in runtmp.last_result.err + assert ( + "Error creating signatures: Incompatible sketch type" in runtmp.last_result.err + ) def test_do_sketch_protein_override_dna_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500,dna', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500,dna", testdata1) assert runtmp.last_result.status != 0 - assert 'Error creating signatures: Incompatible sketch type' in runtmp.last_result.err + assert ( + "Error creating signatures: Incompatible sketch type" in runtmp.last_result.err + ) def 
test_do_sketch_translate_multik_with_protein(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_translate_multik_with_protein_from_file(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--from-file', file_list) + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "--from-file", file_list + ) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_translate_multik_with_dayhoff(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--dayhoff', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "--dayhoff", testdata1 + ) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.dayhoff for x in siglist) def test_do_sketch_translate_multik_with_hp(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--hp', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", "--hp", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.hp for x in siglist) @@ -993,202 +1054,220 @@ def test_do_sketch_translate_multik_with_hp(runtmp): @utils.in_tempdir def test_do_sourmash_sketch_translate_multik_only_protein(c): # check sourmash sketch_translate with only protein, no nucl - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', - testdata1) - outfile 
= os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) + outfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sourmash_sketch_translate_bad_sequences(runtmp): """Proper error handling when Ns in dna sequence""" - testdata1 = utils.get_test_data('short.bad.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("short.bad.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('short.bad.fa.sig') + outfile = runtmp.output("short.bad.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_protein_multik_input(runtmp): - testdata1 = utils.get_test_data('ecoli.faa') - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("ecoli.faa") + runtmp.sourmash("sketch", "protein", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('ecoli.faa.sig') + outfile = runtmp.output("ecoli.faa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes - moltype = set([ x.minhash.moltype == 'protein' - for x in siglist ]) + moltype = set([x.minhash.moltype == "protein" for x in siglist]) assert len(moltype) == 1 assert True in moltype def test_do_sketch_protein_multik_input_from_file(runtmp): - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,k=10,num=500', '--from-file', file_list) + runtmp.sourmash( + "sketch", "protein", "-p", "k=7,k=10,num=500", "--from-file", file_list + ) - outfile = runtmp.output('ecoli.faa.sig') + outfile = runtmp.output("ecoli.faa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes - moltype = set([ x.minhash.moltype == 'protein' - for x in siglist ]) + moltype = set([x.minhash.moltype == "protein" for x in siglist]) assert len(moltype) == 1 assert True in moltype def test_do_sourmash_sketchdna_multik_outfile(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - 
runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31', testdata1, '-o', outfile) + runtmp.sourmash("sketch", "dna", "-p", "k=21,k=31", testdata1, "-o", outfile) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sourmash_sketchdna_with_scaled_1(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - scaled_vals = [ x.minhash.scaled for x in siglist ] + scaled_vals = [x.minhash.scaled for x in siglist] assert len(scaled_vals) == 2 - assert set(scaled_vals) == { 1 } + assert set(scaled_vals) == {1} def test_do_sourmash_sketchdna_with_scaled_2(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=2', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=2", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /2.) ]) + assert set(max_hashes) == set([int(2**64 / 2.0)]) def test_do_sourmash_sketchdna_with_scaled(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=100', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=100", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /100.) 
]) + assert set(max_hashes) == set([int(2**64 / 100.0)]) def test_do_sourmash_sketchdna_with_bad_scaled(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=-1', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=-1", testdata1, "-o", outfile + ) assert runtmp.last_result.status != 0 print(runtmp.last_result.err) - assert 'ERROR: scaled value must be positive' in runtmp.last_result.err + assert "ERROR: scaled value must be positive" in runtmp.last_result.err with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1000.5', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1000.5", testdata1, "-o", outfile + ) assert runtmp.last_result.status != 0 assert "cannot parse scaled='1000.5' as an integer" in runtmp.last_result.err - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1000000000', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1000000000", testdata1, "-o", outfile + ) assert runtmp.last_result.status == 0 - print('XXX') + print("XXX") print(runtmp.last_result.err) - assert 'WARNING: scaled value should be <= 1e6. Continuing anyway.' in runtmp.last_result.err + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sketch_with_seed(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,seed=43', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,seed=43", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - seeds = [ x.minhash.seed for x in siglist ] + seeds = [x.minhash.seed for x in siglist] assert len(seeds) == 2 - assert set(seeds) == set([ 43 ]) + assert set(seeds) == set([43]) def test_do_sourmash_check_protein_comparisons(runtmp): # this test checks 2 x 2 protein comparisons with E. coli genes. - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500", "--singleton", testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) - testdata2 = utils.get_test_data('ecoli.genes.fna') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,num=500', '--singleton', testdata2) + testdata2 = utils.get_test_data("ecoli.genes.fna") + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,num=500", "--singleton", testdata2 + ) - sig2 = runtmp.output('ecoli.genes.fna.sig') + sig2 = runtmp.output("ecoli.genes.fna.sig") assert os.path.exists(sig2) # I'm not sure why load_signatures is randomizing order, but ok. 
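The scaled-parameter tests above encode the FracMinHash cutoff directly: a sketch built with scaled=s keeps the hashes that fall below roughly 2**64 / s of the 64-bit hash space, and `minhash._max_hash` records that threshold. A minimal sketch of the same relationship, using only MinHash calls that already appear in these tests (the toy hash values are illustrative, not from the test data):

    import sourmash

    # scaled=100 keeps roughly 1/100th of all hashes; _max_hash is the cutoff
    mh = sourmash.MinHash(n=0, ksize=31, scaled=100)
    assert mh._max_hash == int(2**64 / 100.0)  # same expression the tests assert

    mh.add_hash(10)          # far below the cutoff: kept
    mh.add_hash(2**64 - 1)   # far above the cutoff: silently dropped
    assert len(mh.hashes) == 1
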
@@ -1199,13 +1278,13 @@ def test_do_sourmash_check_protein_comparisons(runtmp): sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) name1 = sig1_aa.name.split()[0] - assert name1 == 'NP_414543.1' + assert name1 == "NP_414543.1" name2 = sig2_aa.name.split()[0] - assert name2 == 'NP_414544.1' + assert name2 == "NP_414544.1" name3 = sig1_trans.name.split()[0] - assert name3 == 'gi|556503834:2801-3733' + assert name3 == "gi|556503834:2801-3733" name4 = sig2_trans.name.split()[0] - assert name4 == 'gi|556503834:337-2799' + assert name4 == "gi|556503834:337-2799" print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3)) print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3)) @@ -1222,10 +1301,9 @@ def test_do_sourmash_check_protein_comparisons(runtmp): def test_do_sourmash_check_knowngood_dna_comparisons(c): # this test checks against a known good signature calculated # by utils/compute-dna-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('sketch', 'dna', '-p', 'k=21,num=500', - '--singleton', testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("sketch", "dna", "-p", "k=21,num=500", "--singleton", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) @@ -1234,7 +1312,7 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): print(sig1.name) print(sig2.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -1243,16 +1321,15 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): @utils.in_tempdir def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): # check the rna ; otherwise identical to previous test. 
- testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('sketch', 'rna', '-p', 'k=21,num=500', '--singleton', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("sketch", "rna", "-p", "k=21,num=500", "--singleton", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1, sig2 = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -1261,17 +1338,17 @@ def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): def test_do_sourmash_check_knowngood_input_protein_comparisons(runtmp): # this test checks against a known good signature calculated # by utils/compute-input-prot-another-way.py - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500", "--singleton", testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.input_prot.sig') + knowngood = utils.get_test_data("benchmark.input_prot.sig") good_aa = list(signature.load_signatures(knowngood))[0] assert sig1_aa.similarity(good_aa) == 1.0 @@ -1280,17 +1357,19 @@ def test_do_sourmash_check_knowngood_input_protein_comparisons(runtmp): def test_do_sourmash_check_knowngood_protein_comparisons(runtmp): # this test checks against a known good signature calculated # by utils/compute-prot-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') + testdata1 = utils.get_test_data("ecoli.genes.fna") - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,num=500", "--singleton", testdata1 + ) - sig1 = runtmp.output('ecoli.genes.fna.sig') + sig1 = runtmp.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.prot.sig') + knowngood = utils.get_test_data("benchmark.prot.sig") good_trans = list(signature.load_signatures(knowngood))[0] assert sig2_trans.similarity(good_trans) == 1.0 @@ -1298,19 +1377,26 @@ def test_do_sourmash_check_knowngood_protein_comparisons(runtmp): def test_do_sourmash_singleton_multiple_files_no_out_specified(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2) + runtmp.sourmash( + "sketch", "protein", "-p", "k=7", "--singleton", testdata1, testdata2 + ) print(runtmp.last_result.err) - assert "saved 2 signature(s) to 'ecoli.faa.sig'. Note: signature license is CC0." in runtmp.last_result.err - assert "saved 2 signature(s) to 'shewanella.faa.sig'. Note: signature license is CC0." 
in runtmp.last_result.err - - sig1 = runtmp.output('ecoli.faa.sig') + assert ( + "saved 2 signature(s) to 'ecoli.faa.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) + assert ( + "saved 2 signature(s) to 'shewanella.faa.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) + + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) - sig2 = runtmp.output('shewanella.faa.sig') + sig2 = runtmp.output("shewanella.faa.sig") assert os.path.exists(sig2) x = list(signature.load_signatures(sig1)) @@ -1324,27 +1410,39 @@ def test_do_sourmash_singleton_multiple_files_no_out_specified(runtmp): assert len(x) == 2 assert len(y) == 2 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1' ]) == set(idents) + assert set(["NP_414543.1", "NP_414544.1"]) == set(idents) - idents = [ ss.name.split()[0] for ss in y ] + idents = [ss.name.split()[0] for ss in y] print(idents) - assert set(['WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set(["WP_006079348.1", "WP_006079351.1"]) == set(idents) def test_do_sourmash_singleton_multiple_files_output(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') - - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2, '-o', 'output.sig') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") + + runtmp.sourmash( + "sketch", + "protein", + "-p", + "k=7", + "--singleton", + testdata1, + testdata2, + "-o", + "output.sig", + ) print(runtmp.last_result.err) - assert "saved 4 signature(s) to 'output.sig'. Note: signature license is CC0." in runtmp.last_result.err + assert ( + "saved 4 signature(s) to 'output.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) - sig1 = runtmp.output('output.sig') + sig1 = runtmp.output("output.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) @@ -1353,23 +1451,37 @@ def test_do_sourmash_singleton_multiple_files_output(runtmp): assert len(x) == 4 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set( + ["NP_414543.1", "NP_414544.1", "WP_006079348.1", "WP_006079351.1"] + ) == set(idents) def test_do_sourmash_singleton_multiple_files_output_zip(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') - - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2, '-o', 'output.zip') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") + + runtmp.sourmash( + "sketch", + "protein", + "-p", + "k=7", + "--singleton", + testdata1, + testdata2, + "-o", + "output.zip", + ) print(runtmp.last_result.err) - assert "saved 4 signature(s) to 'output.zip'. Note: signature license is CC0." in runtmp.last_result.err + assert ( + "saved 4 signature(s) to 'output.zip'. Note: signature license is CC0." 
+ in runtmp.last_result.err + ) - sig1 = runtmp.output('output.zip') + sig1 = runtmp.output("output.zip") assert os.path.exists(sig1) x = list(sourmash.load_file_as_signatures(sig1)) @@ -1378,35 +1490,37 @@ def test_do_sourmash_singleton_multiple_files_output_zip(runtmp): assert len(x) == 4 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set( + ["NP_414543.1", "NP_414544.1", "WP_006079348.1", "WP_006079351.1"] + ) == set(idents) def test_protein_with_stop_codons(runtmp): # compare protein seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f ] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1420,7 +1534,7 @@ def test_protein_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... py_mh2 = MinHash(n=0, ksize=7, is_protein=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, is_protein=True, scaled=1) @@ -1430,7 +1544,7 @@ def test_protein_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! 
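These stop-codon tests build equivalent protein sketches by several routes -- the `sketch protein` CLI, `MinHash.add_protein()`, and `seq_to_hashes()` followed by `add_many()` -- and then compare the resulting hashes. A minimal sketch of the two programmatic routes, with a toy sequence standing in for the E. coli test data:

    from sourmash import MinHash

    seq = "MTNILKLFSRK"  # illustrative protein fragment, not from the test data

    # route 1: hash protein k-mers directly via add_protein()
    mh1 = MinHash(n=0, ksize=7, is_protein=True, scaled=1)
    mh1.add_protein(seq)

    # route 2: compute the hashes explicitly, then add them in bulk
    mh2 = MinHash(n=0, ksize=7, is_protein=True, scaled=1)
    mh2.add_many(mh2.seq_to_hashes(seq, is_protein=1))

    # both routes should yield the identical hash set
    assert set(mh1.hashes) == set(mh2.hashes)
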
@@ -1453,26 +1567,26 @@ def test_hp_with_stop_codons(runtmp): # compare hp seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f ] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,hp', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,hp", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,hp', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,hp", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1486,7 +1600,7 @@ def test_hp_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... py_mh2 = MinHash(n=0, ksize=7, hp=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, hp=True, scaled=1) @@ -1496,7 +1610,7 @@ def test_hp_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! @@ -1519,26 +1633,26 @@ def test_dayhoff_with_stop_codons(runtmp): # compare dayhoff seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,dayhoff', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,dayhoff", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,dayhoff', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,dayhoff", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1552,7 +1666,7 @@ def test_dayhoff_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... 
py_mh2 = MinHash(n=0, ksize=7, dayhoff=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, dayhoff=True, scaled=1) @@ -1562,7 +1676,7 @@ def test_dayhoff_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! @@ -1586,66 +1700,84 @@ def test_dayhoff_with_stop_codons(runtmp): def test_fromfile_dna(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_csv_gz(runtmp): # test with a gzipped csv - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # gzip the CSV file - with open(runtmp.output('sketch_fromfile/salmonella.csv'), 'rb') as infp: - with gzip.open(runtmp.output('salmonella.csv.gz'), 'w') as outfp: + with open(runtmp.output("sketch_fromfile/salmonella.csv"), "rb") as infp: + with gzip.open(runtmp.output("salmonella.csv.gz"), "w") as outfp: outfp.write(infp.read()) - runtmp.sourmash('sketch', 'fromfile', 'salmonella.csv.gz', - '-o', 'out.zip', '-p', 'dna') + runtmp.sourmash( + "sketch", "fromfile", "salmonella.csv.gz", "-o", "out.zip", "-p", "dna" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_empty(runtmp): # test what happens on empty files. 
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # zero out the file - with gzip.open(runtmp.output('sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz'), 'w') as fp: + with gzip.open( + runtmp.output("sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz"), + "w", + ): pass # now what happens? with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) err = runtmp.last_result.err @@ -1656,125 +1788,172 @@ def test_fromfile_dna_empty(runtmp): def test_fromfile_dna_check_sequence_succeed(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '--check-sequence') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "--check-sequence", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_check_sequence_fail(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-badseq.csv', - '-o', 'out.zip', '-p', 'dna', '--check-sequence') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-badseq.csv", + "-o", + "out.zip", + "-p", + "dna", + "--check-sequence", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MTNILKLFSRKAGEPLDSLAVKSVRQHLSGD" in err + assert ( + "invalid DNA character in input k-mer: MTNILKLFSRKAGEPLDSLAVKSVRQHLSGD" in err + ) def test_fromfile_dna_and_protein(runtmp): # does it run and produce DNA _and_ protein signatures? 
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 2 - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'protein' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "protein"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - dna_sig = [ ss for ss in siglist if ss.minhash.moltype == 'DNA' ] + dna_sig = [ss for ss in siglist if ss.minhash.moltype == "DNA"] assert len(dna_sig) == 1 dna_sig = dna_sig[0] - assert dna_sig.name == 'GCA_903797575 Salmonella enterica' + assert dna_sig.name == "GCA_903797575 Salmonella enterica" assert "** 2 total requested; output 2, skipped 0" in runtmp.last_result.err def test_fromfile_dna_and_protein_and_hp_and_dayhoff(runtmp): # does it run and produce DNA _and_ protein signatures? - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'dna,k=25', - '-p', 'protein', - '-p', 'hp', '-p', 'dayhoff') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "dna,k=25", + "-p", + "protein", + "-p", + "hp", + "-p", + "dayhoff", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 5 - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'protein' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "protein"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'hp' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "hp"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'dayhoff' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "dayhoff"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - 
dna_sig = [ ss for ss in siglist if ss.minhash.moltype == 'DNA' ] + dna_sig = [ss for ss in siglist if ss.minhash.moltype == "DNA"] assert len(dna_sig) == 2 dna_sig = dna_sig[0] - assert dna_sig.name == 'GCA_903797575 Salmonella enterica' + assert dna_sig.name == "GCA_903797575 Salmonella enterica" assert "** 5 total requested; output 5, skipped 0" in runtmp.last_result.err def test_fromfile_dna_and_protein_noname(runtmp): # nothing in the name column - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-noname.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-noname.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1786,14 +1965,22 @@ def test_fromfile_dna_and_protein_noname(runtmp): def test_fromfile_dna_and_protein_dup_name(runtmp): # duplicate names - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella.csv', - 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1806,15 +1993,23 @@ def test_fromfile_dna_and_protein_dup_name(runtmp): def test_fromfile_dna_and_protein_dup_name_report(runtmp): # duplicate names - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella.csv', - 'sketch_fromfile/salmonella.csv', - '--report-duplicated', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "sketch_fromfile/salmonella.csv", + "--report-duplicated", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1827,13 +2022,19 @@ def test_fromfile_dna_and_protein_dup_name_report(runtmp): def test_fromfile_dna_and_protein_missing(runtmp): # test what happens when missing protein. 
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-missing.csv', - '-o', 'out.zip', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-missing.csv", + "-o", + "out.zip", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1841,19 +2042,29 @@ def test_fromfile_dna_and_protein_missing(runtmp): print(out) print(err) - assert "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" in err + assert ( + "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" + in err + ) assert "** ERROR: we cannot build some of the requested signatures." in err assert "** 1 total signatures (for 1 names) cannot be built." in err def test_fromfile_dna_and_protein_missing_ignore(runtmp): # test what happens when missing protein + --ignore-missing - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-missing.csv', - '-o', 'out.zip', '-p', 'protein', '--ignore-missing') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-missing.csv", + "-o", + "out.zip", + "-p", + "protein", + "--ignore-missing", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1861,7 +2072,10 @@ def test_fromfile_dna_and_protein_missing_ignore(runtmp): print(out) print(err) - assert "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" in err + assert ( + "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" + in err + ) assert "** ERROR: we cannot build some of the requested signatures." in err assert "** 1 total signatures (for 1 names) cannot be built." 
in err @@ -1872,21 +2086,35 @@ def test_fromfile_dna_and_protein_missing_ignore(runtmp): def test_fromfile_no_overwrite(runtmp): # test --force-output-already-exists - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) # now run again; will fail since already exists - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'protein') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "protein", + ) err = runtmp.last_result.err @@ -1896,55 +2124,81 @@ def test_fromfile_no_overwrite(runtmp): def test_fromfile_force_overwrite(runtmp): # test --force-output-already-exists - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) # now run again, with --force - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'protein', '--force-output') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "protein", + "--force-output", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 2 - names = list(set([ ss.name for ss in siglist ])) - assert names[0] == 'GCA_903797575 Salmonella enterica' + names = list(set([ss.name for ss in siglist])) + assert names[0] == "GCA_903797575 Salmonella enterica" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_need_params(runtmp): # check that we need a -p - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip') + runtmp.sourmash( + "sketch", "fromfile", "sketch_fromfile/salmonella.csv", "-o", "out.zip" + ) print(str(exc)) - assert "Error creating signatures: No default moltype and none specified in param string" in str(exc) 
+ assert ( + "Error creating signatures: No default moltype and none specified in param string" + in str(exc) + ) def test_fromfile_seed_not_allowed(runtmp): # check that we cannot adjust 'seed' - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna,seed=43') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna,seed=43", + ) print(str(exc)) assert "ERROR: cannot set 'seed' in 'sketch fromfile'" in str(exc) @@ -1952,32 +2206,49 @@ def test_fromfile_seed_not_allowed(runtmp): def test_fromfile_license_not_allowed(runtmp): # check that license is CC0 - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', - '--license', 'BSD') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "--license", + "BSD", + ) print(str(exc)) - assert 'sourmash only supports CC0-licensed signatures' in str(exc) + assert "sourmash only supports CC0-licensed signatures" in str(exc) def test_fromfile_dna_and_protein_csv_output(runtmp): # does it run and produce DNA _and_ protein signatures? - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '--output-csv', 'out.csv', '-p', 'dna', '-p', 'protein') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "--output-csv", + "out.csv", + "-p", + "dna", + "-p", + "protein", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.csv')) + assert os.path.exists(runtmp.output("out.csv")) - with open(runtmp.output('out.csv'), newline='') as fp: + with open(runtmp.output("out.csv"), newline="") as fp: r = csv.DictReader(fp) # filename,sketchtype,output_index,name,param_strs @@ -1985,88 +2256,120 @@ def test_fromfile_dna_and_protein_csv_output(runtmp): for row in r: x.append(row) - x.sort(key=lambda x: x['filename']) + x.sort(key=lambda x: x["filename"]) assert len(x) == 2 - assert x[0]['sketchtype'] == 'dna' - assert x[0]['param_strs'] == '-p dna,k=31,scaled=1000' - assert x[0]['filename'] == 'sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz' - - assert x[1]['sketchtype'] == 'protein' - assert x[1]['param_strs'] == '-p protein,k=10,scaled=200' - assert x[1]['filename'] == 'sketch_fromfile/GCA_903797575.1_PARATYPHIC668_protein.faa.gz' + assert x[0]["sketchtype"] == "dna" + assert x[0]["param_strs"] == "-p dna,k=31,scaled=1000" + assert ( + x[0]["filename"] + == "sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz" + ) + + assert x[1]["sketchtype"] == "protein" + assert x[1]["param_strs"] == "-p protein,k=10,scaled=200" + assert ( + x[1]["filename"] + == 
"sketch_fromfile/GCA_903797575.1_PARATYPHIC668_protein.faa.gz" + ) # same name... - assert x[0]['name'] == x[1]['name'] == "GCA_903797575 Salmonella enterica" + assert x[0]["name"] == x[1]["name"] == "GCA_903797575 Salmonella enterica" # ...different output index. - assert x[1]['output_index'] != x[0]['output_index'] + assert x[1]["output_index"] != x[0]["output_index"] def test_fromfile_dna_and_protein_already_exists(runtmp): # does it properly ignore existing (--already-done) sigs? - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', already_done, - '--output-manifest', 'matching.csv') + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + already_done, + "--output-manifest", + "matching.csv", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 1 pre-existing names from manifest(s)' in err - assert 'Read 1 rows, requesting that 2 signatures be built.' in err - assert '** 0 new signatures to build from 0 files;' in err - assert '** Nothing to build. Exiting!' in err + assert "Loaded 1 pre-existing names from manifest(s)" in err + assert "Read 1 rows, requesting that 2 signatures be built." in err + assert "** 0 new signatures to build from 0 files;" in err + assert "** Nothing to build. Exiting!" in err - assert "output 2 already-done signatures to 'matching.csv' in manifest format." in err - mf = manifest.CollectionManifest.load_from_filename(runtmp.output('matching.csv')) + assert ( + "output 2 already-done signatures to 'matching.csv' in manifest format." in err + ) + mf = manifest.CollectionManifest.load_from_filename(runtmp.output("matching.csv")) assert len(mf) == 2 def test_fromfile_dna_and_protein_partly_already_exists(runtmp): # does it properly ignore existing (--already-done) sigs? - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella-mult.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', already_done) + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-mult.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + already_done, + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 1 pre-existing names from manifest(s)' in err - assert 'Read 2 rows, requesting that 4 signatures be built.' in err - assert '** 2 new signatures to build from 2 files;' in err + assert "Loaded 1 pre-existing names from manifest(s)" in err + assert "Read 2 rows, requesting that 4 signatures be built." in err + assert "** 2 new signatures to build from 2 files;" in err assert "** 2 already exist, so skipping those." 
in err assert "** 4 total requested; output 2, skipped 2" in err def test_fromfile_dna_and_protein_already_exists_noname(runtmp): # check that no name in already_exists is handled - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # run rename to get rid of names - runtmp.sourmash('sig', 'rename', already_done, '', - '-o', 'already-done.zip') - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', 'already-done.zip') + runtmp.sourmash("sig", "rename", already_done, "", "-o", "already-done.zip") + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + "already-done.zip", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 0 pre-existing names from manifest(s)' in err - assert 'Read 1 rows, requesting that 2 signatures be built.' in err - assert '** 2 new signatures to build from 2 files;' in err - assert '** 2 total requested; output 2, skipped 0' in err + assert "Loaded 0 pre-existing names from manifest(s)" in err + assert "Read 1 rows, requesting that 2 signatures be built." in err + assert "** 2 new signatures to build from 2 files;" in err + assert "** 2 total requested; output 2, skipped 0" in err diff --git a/tests/test_sqlite_index.py b/tests/test_sqlite_index.py index 74c4692c06..816719e602 100644 --- a/tests/test_sqlite_index.py +++ b/tests/test_sqlite_index.py @@ -6,9 +6,12 @@ import sourmash from sourmash.exceptions import IndexNotSupported -from sourmash.index.sqlite_index import (SqliteIndex, load_sqlite_index, - SqliteCollectionManifest, - LCA_SqliteDatabase) +from sourmash.index.sqlite_index import ( + SqliteIndex, + load_sqlite_index, + SqliteCollectionManifest, + LCA_SqliteDatabase, +) from sourmash.index import StandaloneManifestIndex from sourmash import load_one_signature, SourmashSignature @@ -23,7 +26,7 @@ def test_sqlite_index_prefetch_empty(): # check that an exception is raised upon for an empty database - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) sqlidx = SqliteIndex.create(":memory:") @@ -41,26 +44,27 @@ def test_sqlite_index_bad_version(runtmp): # create a sqlite database with a bad index version in the # sourmash_internal table, see what happens :) - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() SqliteIndex._create_tables(c) # 0.9 doesn't exist/is bad version - c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', - ('0.9', 'SqliteIndex')) + c.execute( + "UPDATE sourmash_internal SET value=? 
WHERE key=?", ("0.9", "SqliteIndex") + ) conn.commit() with pytest.raises(IndexNotSupported): - idx = sourmash.load_file_as_index(dbfile) + sourmash.load_file_as_index(dbfile) def test_sqlite_index_bad_version_unique(runtmp): # try to insert duplicate sqlite index info into sourmash_internal; fail - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() @@ -68,13 +72,17 @@ def test_sqlite_index_bad_version_unique(runtmp): # can't insert duplicate key with pytest.raises(sqlite3.IntegrityError): - c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', - ('1.1', 'SqliteIndex')) + c.execute( + "INSERT INTO sourmash_internal (value, key) VALUES (?, ?)", + ("1.1", "SqliteIndex"), + ) def test_index_search_subj_scaled_is_lower(): # check that subject sketches are appropriately downsampled - sigfile = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') + sigfile = utils.get_test_data( + "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" + ) ss = sourmash.load_one_signature(sigfile) # double check :) @@ -95,15 +103,15 @@ def test_index_search_subj_scaled_is_lower(): def test_sqlite_index_save_load(runtmp): - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") sqlidx = SqliteIndex.create(filename) sqlidx.insert(ss2) sqlidx.insert(ss47) @@ -122,7 +130,7 @@ def test_sqlite_index_save_load(runtmp): def test_sqlite_index_multik_select(): # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -130,11 +138,11 @@ def test_sqlite_index_multik_select(): sqlidx.insert(ss) # select most specifically - sqlidx2 = sqlidx.select(ksize=31, moltype='DNA') + sqlidx2 = sqlidx.select(ksize=31, moltype="DNA") assert len(sqlidx2) == 1 # all are DNA: - sqlidx2 = sqlidx.select(moltype='DNA') + sqlidx2 = sqlidx.select(moltype="DNA") assert len(sqlidx2) == 3 @@ -156,7 +164,7 @@ def test_sqlite_index_insert_num_fail(): # cannot insert 'num' signatures sqlidx = SqliteIndex.create(":memory:") - sig47 = utils.get_test_data('num/47.fa.sig') + sig47 = utils.get_test_data("num/47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) assert ss47.minhash.num != 0 @@ -170,7 +178,7 @@ def test_sqlite_index_insert_abund_fail(): # cannot insert 'num' signatures sqlidx = SqliteIndex.create(":memory:") - sig47 = utils.get_test_data('track_abund/47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) with pytest.raises(ValueError) as exc: @@ -183,7 +191,7 @@ def test_sqlite_index_moltype_multi_fail(): # check that we cannot store sigs with multiple scaled values. 
# this loads multiple ksizes (19, 31) and moltypes (DNA, protein, hp, etc) - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") siglist = sourmash.load_file_as_signatures(filename) siglist = list(siglist) @@ -203,7 +211,7 @@ def test_sqlite_index_picklist_select(): # test select with a picklist # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -211,22 +219,22 @@ def test_sqlite_index_picklist_select(): sqlidx.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["f3a90d4e"]) # select on picklist sqlidx2 = sqlidx.select(picklist=picklist) assert len(sqlidx2) == 1 ss = list(sqlidx2.signatures())[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('f3a90d4e55') + assert ss.md5sum().startswith("f3a90d4e55") def test_sqlite_index_picklist_select_exclude(): # test select with a picklist, but exclude # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -234,8 +242,8 @@ def test_sqlite_index_picklist_select_exclude(): sqlidx.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["f3a90d4e"]) # select on picklist sqlidx2 = sqlidx.select(picklist=picklist) @@ -245,8 +253,10 @@ def test_sqlite_index_picklist_select_exclude(): for ss in list(sqlidx2.signatures()): md5s.add(ss.md5sum()) ksizes.add(ss.minhash.ksize) - assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0']) - assert ksizes == set([21,51]) + assert md5s == set( + ["f372e47893edd349e5956f8b0d8dcbf7", "43f3b48e59443092850964d355a20ac0"] + ) + assert ksizes == set([21, 51]) def test_sqlite_jaccard_ordering(): @@ -265,10 +275,10 @@ def test_sqlite_jaccard_ordering(): def _intersect(x, y): return x.intersection_and_union_size(y)[0] - print('a intersect b:', _intersect(a, b)) - print('a intersect c:', _intersect(a, c)) - print('a jaccard b:', a.jaccard(b)) - print('a jaccard c:', a.jaccard(c)) + print("a intersect b:", _intersect(a, b)) + print("a intersect c:", _intersect(a, c)) + print("a jaccard b:", a.jaccard(b)) + print("a jaccard c:", a.jaccard(c)) assert _intersect(a, b) > _intersect(a, c) assert a.jaccard(b) < a.jaccard(c) @@ -277,9 +287,9 @@ def _intersect(x, y): assert a.jaccard(c) > 0.15 # now - make signatures, try out :) - ss_a = sourmash.SourmashSignature(a, name='A') - ss_b = sourmash.SourmashSignature(b, name='B') - ss_c = sourmash.SourmashSignature(c, name='C') + ss_a = sourmash.SourmashSignature(a, name="A") + ss_b = sourmash.SourmashSignature(b, name="B") + ss_c = sourmash.SourmashSignature(c, name="C") sqlidx = SqliteIndex.create(":memory:") sqlidx.insert(ss_a) @@ -303,7 +313,7 @@ def test_sqlite_index_scaled1(): mh1.add_hash(2**64 - 1) mh1.add_hash(2**64 - 2) mh1.add_hash(2**64 - 3) - ss1 = sourmash.SourmashSignature(mh1, name='ss 1') + ss1 = sourmash.SourmashSignature(mh1, name="ss 1") mh2 = sourmash.MinHash(0, 31, scaled=1) mh2.add_hash(2**64 - 1) @@ -312,7 +322,7 @@ def test_sqlite_index_scaled1(): mh2.add_hash(0) 
mh2.add_hash(1) mh2.add_hash(2) - ss2 = sourmash.SourmashSignature(mh2, name='ss 2') + ss2 = sourmash.SourmashSignature(mh2, name="ss 2") sqlidx.insert(ss1) sqlidx.insert(ss2) @@ -340,7 +350,7 @@ def test_sqlite_index_scaled1(): def test_sqlite_index_load_existing(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -350,11 +360,11 @@ def test_sqlite_index_load_existing(): def test_sqlite_index_create_load_existing(runtmp): # try creating then loading an existing sqlite index; create from CLI - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -365,12 +375,12 @@ def test_sqlite_index_create_load_existing(runtmp): def test_sqlite_index_create_load_insert_existing(runtmp): # try creating, loading, inserting into an existing sqlite index - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') - sig3 = utils.get_test_data('2.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") + sig3 = utils.get_test_data("2.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -382,7 +392,7 @@ def test_sqlite_index_create_load_insert_existing(runtmp): sqlidx.insert(ss3) sqlidx.commit() - runtmp.sourmash('sig', 'describe', filename) + runtmp.sourmash("sig", "describe", filename) print(runtmp.last_result.out) assert "md5: f3a90d4e5528864a5bcc8434b0d0c3b1" in runtmp.last_result.out @@ -390,12 +400,12 @@ def test_sqlite_index_create_load_insert_existing(runtmp): def test_sqlite_index_create_load_insert_existing_cli(runtmp): # try creating, loading, inserting into an existing sqlite index from cli # (aka "append" to existing database) - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') - sig3 = utils.get_test_data('2.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") + sig3 = utils.get_test_data("2.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -404,7 +414,7 @@ def test_sqlite_index_create_load_insert_existing_cli(runtmp): assert len(siglist) == 2 # add a third - runtmp.sourmash('sig', 'cat', sig3, '-o', filename, '-k', '31') + runtmp.sourmash("sig", "cat", sig3, "-o", filename, "-k", "31") siglist = list(sqlidx.signatures()) assert len(siglist) == 3 @@ -414,26 +424,27 @@ def test_sqlite_manifest_bad_version(runtmp): # create a sqlite database with a bad manifest version in the # sourmash_internal table, see what happens :) - dbfile = runtmp.output('xyz.sqlmf') + dbfile = runtmp.output("xyz.sqlmf") conn = 
sqlite3.connect(dbfile) c = conn.cursor() SqliteCollectionManifest._create_tables(c) # 0.9 doesn't exist/bad version - c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', - ('0.9', 'SqliteManifest')) + c.execute( + "UPDATE sourmash_internal SET value=? WHERE key=?", ("0.9", "SqliteManifest") + ) conn.commit() with pytest.raises(IndexNotSupported): - mf = CollectionManifest.load_from_filename(dbfile) + CollectionManifest.load_from_filename(dbfile) def test_sqlite_manifest_bad_version_unique(runtmp): # try to insert duplicate sqlite manifest info into sourmash_internal; fail - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() @@ -441,15 +452,17 @@ def test_sqlite_manifest_bad_version_unique(runtmp): # can't insert duplicate key with pytest.raises(sqlite3.IntegrityError): - c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', - ('1.1', 'SqliteManifest')) + c.execute( + "INSERT INTO sourmash_internal (value, key) VALUES (?, ?)", + ("1.1", "SqliteManifest"), + ) def test_sqlite_manifest_basic(): # test some features of the SQLite-based manifest. - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) sqlidx = SqliteIndex.create(":memory:") @@ -479,15 +492,17 @@ def test_sqlite_manifest_basic(): def test_sqlite_manifest_round_trip(): # check that we can go from regular mf -> sqlite mf -> regular again. 
- sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) rows = [] - rows.append(CollectionManifest.make_manifest_row(sig47, None, - include_signature=False)) - rows.append(CollectionManifest.make_manifest_row(sig63, None, - include_signature=False)) + rows.append( + CollectionManifest.make_manifest_row(sig47, None, include_signature=False) + ) + rows.append( + CollectionManifest.make_manifest_row(sig63, None, include_signature=False) + ) nosql_mf = CollectionManifest(rows) sqlite_mf = SqliteCollectionManifest.load_from_manifest(nosql_mf) @@ -507,13 +522,12 @@ def test_sqlite_manifest_round_trip(): def test_sqlite_manifest_create(runtmp): # test creation and summarization of a manifest of prot.zip - zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "mf.sqlmf") - sqlmf = runtmp.output('mf.sqlmf') + sqlmf = runtmp.output("mf.sqlmf") assert os.path.exists(sqlmf) # verify it's loadable as the right type @@ -521,7 +535,7 @@ def test_sqlite_manifest_create(runtmp): assert isinstance(idx, StandaloneManifestIndex) # summarize - runtmp.sourmash('sig', 'fileinfo', 'mf.sqlmf') + runtmp.sourmash("sig", "fileinfo", "mf.sqlmf") out = runtmp.last_result.out print(out) @@ -540,41 +554,38 @@ def test_sqlite_manifest_create(runtmp): def test_sqlite_manifest_create_noload_sigs(runtmp): # sigs should not be loadable from manifest this way... 
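# Why resolution fails here: the manifest rows record internal_location
# values relative to the zip's contents, so 'sig describe' run on the bare
# sqlmf has nothing it can open ("prefix is wrong", per the comment below);
# the companion _yesload_ test copies the prot/ files next to the manifest
# so the same rows do resolve.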
- zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "mf.sqlmf") # 'describe' should not be able to load the sqlmf b/c prefix is wrong with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'describe', 'mf.sqlmf') + runtmp.sourmash("sig", "describe", "mf.sqlmf") def test_sqlite_manifest_create_yesload_sigs(runtmp): # should be able to load after copying files - zipfile = utils.get_test_data('prot/all.zip') - shutil.copytree(utils.get_test_data('prot'), runtmp.output('prot')) + zipfile = utils.get_test_data("prot/all.zip") + shutil.copytree(utils.get_test_data("prot"), runtmp.output("prot")) # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'prot/mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "prot/mf.sqlmf") # 'describe' should now be able to load the sqlmf, which is cool - runtmp.sourmash('sig', 'describe', 'prot/mf.sqlmf') + runtmp.sourmash("sig", "describe", "prot/mf.sqlmf") print(runtmp.last_result.out) def test_sqlite_manifest_num(runtmp): # should be able to produce sql manifests with 'num' sketches in them - numsig = utils.get_test_data('num/47.fa.sig') + numsig = utils.get_test_data("num/47.fa.sig") # create mf - runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", numsig, "-o", "mf.sqlmf") # do summarize: - runtmp.sourmash('sig', 'summarize', 'mf.sqlmf') + runtmp.sourmash("sig", "summarize", "mf.sqlmf") out = runtmp.last_result.out print(out) @@ -586,14 +597,13 @@ def test_sqlite_manifest_num(runtmp): def test_sqlite_manifest_num_select(runtmp): # should be able to _select_ sql manifests with 'num' sketches in them - numsig = utils.get_test_data('num/47.fa.sig') + numsig = utils.get_test_data("num/47.fa.sig") # create mf - runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", numsig, "-o", "mf.sqlmf") # load as index - idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + idx = sourmash.load_file_as_index(runtmp.output("mf.sqlmf")) # select print(list(idx.manifest.rows)) @@ -604,25 +614,24 @@ def test_sqlite_manifest_num_select(runtmp): def test_sqlite_manifest_locations(runtmp): # check what locations returns... may return too many, that's ok. - prot = utils.get_test_data('prot') + prot = utils.get_test_data("prot") - runtmp.sourmash('sig', 'manifest', '-F', 'sql', prot, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", prot, "-o", "mf.sqlmf") # load as index - idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + idx = sourmash.load_file_as_index(runtmp.output("mf.sqlmf")) - picklist = SignaturePicklist('identprefix') - picklist.pickset = set(['GCA_001593925']) + picklist = SignaturePicklist("identprefix") + picklist.pickset = set(["GCA_001593925"]) idx = idx.select(picklist=picklist) sql_locations = set(idx.manifest.locations()) - row_locations = set(row['internal_location'] for row in idx.manifest.rows) + row_locations = set(row["internal_location"] for row in idx.manifest.rows) assert sql_locations.issuperset(row_locations) - assert 'dna-sig.sig.gz' in sql_locations # this is unnecessary... 
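# The contrast between these two asserts is the point of the test:
# locations() on the picklist-filtered manifest may legitimately
# over-report (the "may return too many, that's ok" note above), while the
# internal_location values on the filtered rows reflect only what the
# picklist actually selected.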
- assert 'dna-sig.sig.gz' not in row_locations # ...this is correct :) + assert "dna-sig.sig.gz" in sql_locations # this is unnecessary... + assert "dna-sig.sig.gz" not in row_locations # ...this is correct :) def test_sqlite_manifest_create_insert(runtmp): @@ -631,126 +640,125 @@ def test_sqlite_manifest_create_insert(runtmp): mfname = runtmp.output("some.sqlmf") mf = SqliteCollectionManifest.create(mfname) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # copy sig in since we want it to resolve... - shutil.copyfile(sigfile, runtmp.output('some.sig')) + shutil.copyfile(sigfile, runtmp.output("some.sig")) # 'describe' should work here, to resolve actual sigs. - runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) - assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + assert "md5: 09a08691ce52952152f0e866a59f6261" in runtmp.last_result.out def test_sqlite_manifest_create_insert_2(runtmp): # try out creating a sqlite manifest from cli and then _insert_row into it # copy sig in since we want it to resolve... - sigfile = utils.get_test_data('47.fa.sig') - shutil.copyfile(sigfile, runtmp.output('some.sig')) + sigfile = utils.get_test_data("47.fa.sig") + shutil.copyfile(sigfile, runtmp.output("some.sig")) - runtmp.sourmash('sig', 'manifest', 'some.sig', '-F', 'sql', - '-o', 'some.sqlmf') + runtmp.sourmash("sig", "manifest", "some.sig", "-F", "sql", "-o", "some.sqlmf") mfname = runtmp.output("some.sqlmf") mf = CollectionManifest.load_from_filename(mfname) - ss = sourmash.load_one_signature(runtmp.output('some.sig')) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + ss = sourmash.load_one_signature(runtmp.output("some.sig")) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # 'describe' should work here, to resolve actual sigs. 
- runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) - assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + assert "md5: 09a08691ce52952152f0e866a59f6261" in runtmp.last_result.out def test_sqlite_manifest_existing(runtmp): # try out an existing sqlite manifest - prefix = runtmp.output('protdir') - mf = runtmp.output('protdir/prot.sqlmf') - shutil.copytree(utils.get_test_data('prot'), prefix) - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + prefix = runtmp.output("protdir") + mf = runtmp.output("protdir/prot.sqlmf") + shutil.copytree(utils.get_test_data("prot"), prefix) + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mf) - runtmp.sourmash('sig', 'describe', mf) + runtmp.sourmash("sig", "describe", mf) print(runtmp.last_result.out) def test_sqlite_manifest_existing_insert(runtmp): # try out an existing sqlite manifest - insert into it - prefix = runtmp.output('protdir') - shutil.copytree(utils.get_test_data('prot'), prefix) + prefix = runtmp.output("protdir") + shutil.copytree(utils.get_test_data("prot"), prefix) - mfname = runtmp.output('protdir/prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mfname = runtmp.output("protdir/prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mfname) mf = CollectionManifest.load_from_filename(mfname) assert isinstance(mf, SqliteCollectionManifest) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # copy sig in since we want it to resolve... - shutil.copyfile(sigfile, runtmp.output('protdir/some.sig')) + shutil.copyfile(sigfile, runtmp.output("protdir/some.sig")) # 'describe' should work here. - runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) def test_sqlite_manifest_existing_mf_only(runtmp): # try out an existing sqlite manifest, but without underlying files -> fail - mf = runtmp.output('prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + mf = runtmp.output("prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mf) # 'fileinfo' should work... - runtmp.sourmash('sig', 'fileinfo', mf) + runtmp.sourmash("sig", "fileinfo", mf) print(runtmp.last_result.out) - assert 'num signatures: 7' in runtmp.last_result.out + assert "num signatures: 7" in runtmp.last_result.out # ...but 'describe' should fail, since it needs actual sigs. 
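# The same metadata-vs-signatures split, sketched at the API level
# (CollectionManifest from sourmash.manifest, as assumed above; the
# filename is illustrative):
#
from sourmash.manifest import CollectionManifest

mf = CollectionManifest.load_from_filename("prot.sqlmf")
print(len(mf))   # row count needs only the manifest table ('sig fileinfo')
# materializing signatures ('sig describe') additionally requires the files
# named in each row's internal_location to exist on disk.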
- with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'describe', mf) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sig", "describe", mf) print(runtmp.last_result.err) - assert 'ERROR: Error while reading signatures from' in runtmp.last_result.err + assert "ERROR: Error while reading signatures from" in runtmp.last_result.err def test_sqlite_manifest_existing_mfonly_insert(runtmp): # try out an existing sqlite manifest - insert into it, but fail describe - mfname = runtmp.output('prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mfname = runtmp.output("prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mfname) mf = CollectionManifest.load_from_filename(mfname) assert isinstance(mf, SqliteCollectionManifest) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, sigfile)) mf.conn.commit() # 'fileinfo' should work... - runtmp.sourmash('sig', 'fileinfo', mfname) + runtmp.sourmash("sig", "fileinfo", mfname) print(runtmp.last_result.out) - assert 'num signatures: 8' in runtmp.last_result.out + assert "num signatures: 8" in runtmp.last_result.out # ...but 'describe' should fail, since it needs actual sigs. - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'describe', mfname) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sig", "describe", mfname) def test_sqlite_manifest_load_existing_index(): # try loading an existing sqlite index as a manifest - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") mf = CollectionManifest.load_from_filename(filename) assert isinstance(mf, SqliteCollectionManifest) @@ -759,14 +767,14 @@ def test_sqlite_manifest_load_existing_index(): def test_sqlite_manifest_load_existing_index_insert_fail(): # try loading an existing sqlite index as a manifest; insert should fail - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") mf = CollectionManifest.load_from_filename(filename) assert isinstance(mf, SqliteCollectionManifest) assert len(mf) == 2 # try insert - should fail - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) with pytest.raises(Exception) as exc: @@ -787,7 +795,7 @@ def test_sqlite_manifest_create_load_empty(runtmp): def test_sqlite_lca_db_load_existing(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/lca.sqldb') + filename = utils.get_test_data("sqlite/lca.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) @@ -797,27 +805,26 @@ def test_sqlite_lca_db_load_existing(): def test_sqlite_lca_db_select(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/lca.sqldb') + filename = utils.get_test_data("sqlite/lca.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) sqlidx2 = sqlidx.select(ksize=31) - x = list(sqlidx2.hashvals) # only on LCA_SqliteDatabase + list(sqlidx2.hashvals) # only on LCA_SqliteDatabase assert isinstance(sqlidx2, LCA_SqliteDatabase) def test_sqlite_lca_db_create_load_existing(runtmp): # try creating (from CLI) then loading (from API) an LCA db - filename = runtmp.output('lca.sqldb') - sig1 = 
utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') + filename = runtmp.output("lca.sqldb") + sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") - runtmp.sourmash('sig', 'flatten', sig1, sig2, '-o', filename, '-k', '31') + runtmp.sourmash("sig", "flatten", sig1, sig2, "-o", filename, "-k", "31") # load tax - tax_csv = utils.get_test_data('sqlite/delmont-6.csv') - runtmp.sourmash('tax', 'prepare', '-t', tax_csv, - '-o', filename, '-F', 'sql') + tax_csv = utils.get_test_data("sqlite/delmont-6.csv") + runtmp.sourmash("tax", "prepare", "-t", tax_csv, "-o", filename, "-F", "sql") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) @@ -829,63 +836,62 @@ def test_sqlite_lca_db_create_load_existing(runtmp): def test_sqlite_lca_db_load_empty(runtmp): # try creating then loading an _empty_ LCA_SqliteDatabase - dbname = runtmp.output('empty.sqldb') + dbname = runtmp.output("empty.sqldb") # create empty SqliteIndex... - runtmp.sourmash('sig', 'cat', '-o', dbname) + runtmp.sourmash("sig", "cat", "-o", dbname) assert os.path.exists(dbname) # ...and create empty sourmash_taxonomy tables in there... - empty_tax = utils.get_test_data('scaled/empty-lineage.csv') - runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, - '-o', dbname) + empty_tax = utils.get_test_data("scaled/empty-lineage.csv") + runtmp.sourmash("tax", "prepare", "-F", "sql", "-t", empty_tax, "-o", dbname) - runtmp.sourmash('sig', 'describe', dbname) - assert 'loaded 0 signatures' in runtmp.last_result.err + runtmp.sourmash("sig", "describe", dbname) + assert "loaded 0 signatures" in runtmp.last_result.err def test_sqlite_lca_db_create_readonly(runtmp): # try running 'prepare' on a read-only sqlite db, check error message. - dbname = runtmp.output('empty.sqldb') + dbname = runtmp.output("empty.sqldb") # create empty SqliteIndex... - runtmp.sourmash('sig', 'cat', '-o', dbname) + runtmp.sourmash("sig", "cat", "-o", dbname) assert os.path.exists(dbname) # make it read only... from stat import S_IREAD, S_IRGRP, S_IROTH - os.chmod(dbname, S_IREAD|S_IRGRP|S_IROTH) + + os.chmod(dbname, S_IREAD | S_IRGRP | S_IROTH) # ...and try creating empty sourmash_taxonomy tables in there... - empty_tax = utils.get_test_data('scaled/empty-lineage.csv') + empty_tax = utils.get_test_data("scaled/empty-lineage.csv") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, - '-o', dbname) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("tax", "prepare", "-F", "sql", "-t", empty_tax, "-o", dbname) err = runtmp.last_result.err print(err) - assert not "taxonomy table already exists in" in err + assert "taxonomy table already exists in" not in err assert "attempt to write a readonly database" in err def test_sqlite_lca_db_try_load_sqlite_index(): # try loading a SqliteIndex with no tax tables from .load classmethod - dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") with pytest.raises(ValueError) as exc: - db = LCA_SqliteDatabase.load(dbname) + LCA_SqliteDatabase.load(dbname) assert "not a taxonomy database" in str(exc) def test_sqlite_lca_db_supply_lineage_db(): # try creating an LCA_SqliteDatabase object with a separate lineage DB. 
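# A minimal sketch of pairing an index with a separate lineage database,
# assuming LCA_SqliteDatabase lives in sourmash.index.sqlite_index and
# MultiLineageDB in sourmash.tax.tax_utils (paths and filenames are
# illustrative):
#
from sourmash.index.sqlite_index import LCA_SqliteDatabase
from sourmash.tax.tax_utils import MultiLineageDB

lineages = MultiLineageDB.load(["shewanella-lineage.csv"])
db = LCA_SqliteDatabase("index.sqldb", lineage_db=lineages)

hashval = next(iter(db.hashvals))              # any indexed hash value
for lineage in db.get_lineage_assignments(hashval):
    print(lineage[0].rank, lineage[0].name)    # e.g. superkingdom d__Bacteria
# aside: the final two asserts in the body below re-check lineages[0][-1];
# lineages[1][-1] looks like the intended target.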
- dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") - tax_csv = utils.get_test_data('sqlite/shewanella-lineage.csv') + tax_csv = utils.get_test_data("sqlite/shewanella-lineage.csv") lineage_db = MultiLineageDB.load([tax_csv]) db = LCA_SqliteDatabase(dbname, lineage_db=lineage_db) @@ -893,21 +899,21 @@ def test_sqlite_lca_db_supply_lineage_db(): hashval = next(iter(db.hashvals)) lineages = db.get_lineage_assignments(hashval) print(lineages) - assert lineages[0][0].rank == 'superkingdom' - assert lineages[0][0].name == 'd__Bacteria' - assert lineages[0][-1].rank == 'species' - assert lineages[0][-1].name == 's__Shewanella baltica' - assert lineages[1][0].rank == 'superkingdom' - assert lineages[1][0].name == 'd__Bacteria' - assert lineages[0][-1].rank == 'species' - assert lineages[0][-1].name == 's__Shewanella baltica' + assert lineages[0][0].rank == "superkingdom" + assert lineages[0][0].name == "d__Bacteria" + assert lineages[0][-1].rank == "species" + assert lineages[0][-1].name == "s__Shewanella baltica" + assert lineages[1][0].rank == "superkingdom" + assert lineages[1][0].name == "d__Bacteria" + assert lineages[0][-1].rank == "species" + assert lineages[0][-1].name == "s__Shewanella baltica" def test_bad_sqlite_internal_version(): # check get_sourmash_internal - dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") conn = sqlite_utils.open_sqlite_db(dbname) c = conn.cursor() with pytest.raises(Exception): - sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', '0.9') + sqlite_utils.add_sourmash_internal(c, "SqliteIndex", "0.9") diff --git a/tests/test_tax.py b/tests/test_tax.py index b37e8eaf6f..3f766f5e37 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -18,99 +18,249 @@ from sourmash.exceptions import IndexNotSupported from sourmash import sourmash_args + ## command line tests def test_run_sourmash_tax(): - status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["tax"], fail_ok=True) + assert status != 0 # no args provided, ok ;) def test_metagenome_stdout_0(runtmp): # test basic metagenome c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out - assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000' in c.last_result.out - assert 
'test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000' in c.last_result.out - assert 'test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000' in c.last_result.out - assert 'test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + 
"test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000" + in c.last_result.out + ) + assert ( + "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000" + in c.last_result.out + ) + assert ( + "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) def test_metagenome_stdout_0_db(runtmp): # test basic metagenome with sqlite database c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.db') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.db") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out - assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 
'test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000' in c.last_result.out - assert 'test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000' in c.last_result.out - assert 'test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + 
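# For reference, the csv_summary columns asserted throughout these tax
# tests: query_name, rank, fraction (unweighted fraction of the query
# matched at that rank), lineage, query_md5, query_filename,
# f_weighted_at_rank (abundance-weighted fraction), bp_match_at_rank
# (base pairs assigned at that rank). A small stdlib sketch of consuming
# the summary file; the path is illustrative:
#
import csv

with open("out.summarized.csv") as fp:
    for row in csv.DictReader(fp):
        if row["rank"] == "species" and row["lineage"] != "unclassified":
            print(row["lineage"], row["f_weighted_at_rank"])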
assert ( + "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000" + in c.last_result.out + ) + assert ( + "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000" + in c.last_result.out + ) + assert ( + "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) def test_metagenome_summary_csv_out(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".summarized.csv" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir) + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -121,62 +271,164 @@ def test_metagenome_summary_csv_out(runtmp): sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in sum_gather_results[0] - assert 'test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000' in sum_gather_results[1] - assert 'test1,superkingdom,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[2] - assert 'test1,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[3] - assert 'test1,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[4] - 
assert 'test1,phylum,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[5] - assert 'test1,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[6] - assert 'test1,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[7] - assert 'test1,class,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[8] - assert 'test1,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[9] - assert 'test1,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[10] - assert 'test1,order,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[11] - assert 'test1,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[12] - assert 'test1,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[13] - assert 'test1,family,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[14] - assert 'test1,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.05701254275940707,444000' in sum_gather_results[15] - assert 'test1,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[16] - assert 'test1,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.015637726014008795,138000' in sum_gather_results[17] - assert 'test1,genus,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[18] - assert 'test1,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000' in sum_gather_results[19] - assert 'test1,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[20] - assert 'test1,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.015637726014008795,138000' in sum_gather_results[21] - assert 'test1,species,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[22] + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in sum_gather_results[0] + ) + assert ( + "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000" + in sum_gather_results[1] + ) + assert ( + "test1,superkingdom,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[2] + ) + assert ( + 
"test1,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.07265026877341586,582000" + in sum_gather_results[3] + ) + assert ( + "test1,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[4] + ) + assert ( + "test1,phylum,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[5] + ) + assert ( + "test1,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.07265026877341586,582000" + in sum_gather_results[6] + ) + assert ( + "test1,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[7] + ) + assert ( + "test1,class,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[8] + ) + assert ( + "test1,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.07265026877341586,582000" + in sum_gather_results[9] + ) + assert ( + "test1,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[10] + ) + assert ( + "test1,order,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[11] + ) + assert ( + "test1,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.07265026877341586,582000" + in sum_gather_results[12] + ) + assert ( + "test1,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[13] + ) + assert ( + "test1,family,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[14] + ) + assert ( + "test1,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.05701254275940707,444000" + in sum_gather_results[15] + ) + assert ( + "test1,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[16] + ) + assert ( + "test1,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.015637726014008795,138000" + in sum_gather_results[17] + ) + assert ( + "test1,genus,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in sum_gather_results[18] + ) + assert ( + "test1,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000" + in sum_gather_results[19] + ) + assert ( + "test1,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.05815279361459521,442000" + in sum_gather_results[20] + ) + assert ( + "test1,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.015637726014008795,138000" + in sum_gather_results[21] + ) + assert ( + "test1,species,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000" + in 
sum_gather_results[22] + ) def test_metagenome_summary_csv_out_empty_gather_force(runtmp): # test multiple -g, empty -g file, and --force - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".summarized.csv" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - gather_empty = runtmp.output('g.csv') + gather_empty = runtmp.output("g.csv") with open(gather_empty, "w") as fp: fp.write("") print("g_csv: ", gather_empty) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '-g', gather_empty, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-f') + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "-g", + gather_empty, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + "-f", + ) sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in sum_gather_results[0] - assert 'test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000' in sum_gather_results[1] + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in sum_gather_results[0] + ) + assert ( + "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000" + in sum_gather_results[1] + ) def test_metagenome_kreport_out(runtmp): # test 'kreport' kraken output format - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".kreport.txt" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + "-F", + "kreport", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -185,37 +437,89 @@ def test_metagenome_kreport_out(runtmp): assert runtmp.last_result.status == 0 assert os.path.exists(csvout) - kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + kreport_results = [ + x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines() + ] assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err print(kreport_results) - assert ['13.08', '1605999', '0', 'D', '', 'd__Bacteria'] == kreport_results[0] - assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1] - assert ['7.27', '892000', '0', 'P', '', 'p__Bacteroidota'] == kreport_results[2] - assert ['5.82', '714000', '0', 'P', '', 'p__Proteobacteria'] == kreport_results[3] - assert ['7.27', '892000', '0', 'C', '', 'c__Bacteroidia'] == kreport_results[4] - assert ['5.82', '714000', '0', 'C', '', 'c__Gammaproteobacteria'] == kreport_results[5] - assert ['7.27', '892000', '0', 'O', '', 'o__Bacteroidales'] == kreport_results[6] - assert ['5.82', '714000', '0', 'O', '', 'o__Enterobacterales'] == 
kreport_results[7] - assert ['7.27', '892000', '0', 'F', '', 'f__Bacteroidaceae'] == kreport_results[8] - assert ['5.82', '714000', '0', 'F', '', 'f__Enterobacteriaceae'] == kreport_results[9] - assert ['5.70', '700000', '0', 'G', '', 'g__Prevotella'] == kreport_results[10] - assert ['5.82', '714000', '0', 'G', '', 'g__Escherichia'] == kreport_results[11] - assert ['1.56', '192000', '0', 'G', '', 'g__Phocaeicola'] == kreport_results[12] - assert ['5.70', '700000', '700000', 'S', '', 's__Prevotella copri'] == kreport_results[13] - assert ['5.82', '714000', '714000', 'S', '', 's__Escherichia coli']== kreport_results[14] - assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15] + assert ["13.08", "1605999", "0", "D", "", "d__Bacteria"] == kreport_results[0] + assert [ + "86.92", + "10672000", + "10672000", + "U", + "", + "unclassified", + ] == kreport_results[1] + assert ["7.27", "892000", "0", "P", "", "p__Bacteroidota"] == kreport_results[2] + assert ["5.82", "714000", "0", "P", "", "p__Proteobacteria"] == kreport_results[3] + assert ["7.27", "892000", "0", "C", "", "c__Bacteroidia"] == kreport_results[4] + assert [ + "5.82", + "714000", + "0", + "C", + "", + "c__Gammaproteobacteria", + ] == kreport_results[5] + assert ["7.27", "892000", "0", "O", "", "o__Bacteroidales"] == kreport_results[6] + assert ["5.82", "714000", "0", "O", "", "o__Enterobacterales"] == kreport_results[7] + assert ["7.27", "892000", "0", "F", "", "f__Bacteroidaceae"] == kreport_results[8] + assert ["5.82", "714000", "0", "F", "", "f__Enterobacteriaceae"] == kreport_results[ + 9 + ] + assert ["5.70", "700000", "0", "G", "", "g__Prevotella"] == kreport_results[10] + assert ["5.82", "714000", "0", "G", "", "g__Escherichia"] == kreport_results[11] + assert ["1.56", "192000", "0", "G", "", "g__Phocaeicola"] == kreport_results[12] + assert [ + "5.70", + "700000", + "700000", + "S", + "", + "s__Prevotella copri", + ] == kreport_results[13] + assert [ + "5.82", + "714000", + "714000", + "S", + "", + "s__Escherichia coli", + ] == kreport_results[14] + assert [ + "1.56", + "192000", + "192000", + "S", + "", + "s__Phocaeicola vulgatus", + ] == kreport_results[15] def test_metagenome_kreport_ncbi_taxid_out(runtmp): # test NCBI taxid output from kreport - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".kreport.txt" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + "-F", + "kreport", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -224,38 +528,94 @@ def test_metagenome_kreport_ncbi_taxid_out(runtmp): assert runtmp.last_result.status == 0 assert os.path.exists(csvout) - kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + kreport_results = [ + x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines() + ] assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err print(kreport_results) - assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0] - assert ['86.92', 
'10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1] - assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2] - assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3] - assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4] - assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5] - assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6] - assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7] - assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8] - assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9] - assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10] - assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11] - assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12] - assert ['1.56', '192000', '0', 'G', '909656', 'Phocaeicola'] == kreport_results[13] - assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14] - assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15] - assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16] + assert ["13.08", "1605999", "0", "D", "2", "Bacteria"] == kreport_results[0] + assert [ + "86.92", + "10672000", + "10672000", + "U", + "", + "unclassified", + ] == kreport_results[1] + assert ["7.27", "892000", "0", "P", "976", "Bacteroidota"] == kreport_results[2] + assert ["5.82", "714000", "0", "P", "1224", "Pseudomonadota"] == kreport_results[3] + assert ["7.27", "892000", "0", "C", "200643", "Bacteroidia"] == kreport_results[4] + assert [ + "5.82", + "714000", + "0", + "C", + "1236", + "Gammaproteobacteria", + ] == kreport_results[5] + assert ["7.27", "892000", "0", "O", "171549", "Bacteroidales"] == kreport_results[6] + assert ["5.82", "714000", "0", "O", "91347", "Enterobacterales"] == kreport_results[ + 7 + ] + assert ["5.70", "700000", "0", "F", "171552", "Prevotellaceae"] == kreport_results[ + 8 + ] + assert ["5.82", "714000", "0", "F", "543", "Enterobacteriaceae"] == kreport_results[ + 9 + ] + assert ["1.56", "192000", "0", "F", "815", "Bacteroidaceae"] == kreport_results[10] + assert ["5.70", "700000", "0", "G", "838", "Prevotella"] == kreport_results[11] + assert ["5.82", "714000", "0", "G", "561", "Escherichia"] == kreport_results[12] + assert ["1.56", "192000", "0", "G", "909656", "Phocaeicola"] == kreport_results[13] + assert [ + "5.70", + "700000", + "700000", + "S", + "165179", + "Prevotella copri", + ] == kreport_results[14] + assert [ + "5.82", + "714000", + "714000", + "S", + "562", + "Escherichia coli", + ] == kreport_results[15] + assert [ + "1.56", + "192000", + "192000", + "S", + "821", + "Phocaeicola vulgatus", + ] == kreport_results[16] def test_metagenome_kreport_out_lemonade(runtmp): # test 'kreport' kraken output format against lemonade output - g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv') - tax = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') + g_csv = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.csv") + tax = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") csv_base = "out" sum_csv = csv_base + ".kreport.txt" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, 
'--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + "-F", + "kreport", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -264,44 +624,80 @@ def test_metagenome_kreport_out_lemonade(runtmp): assert runtmp.last_result.status == 0 assert os.path.exists(csvout) - kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + kreport_results = [ + x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines() + ] assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err print(kreport_results) - assert ['5.35', '116000', '0', 'D', '', 'd__Bacteria'] == kreport_results[0] - assert ['94.65', '2054000', '2054000', 'U', '', 'unclassified'] == kreport_results[1] - assert ['5.35', '116000', '0', 'P', '', 'p__Bacteroidota'] == kreport_results[2] - assert ['5.35', '116000', '0', 'C', '', 'c__Chlorobia'] == kreport_results[3] - assert ['5.35', '116000', '0', 'O', '', 'o__Chlorobiales'] == kreport_results[4] - assert ['5.35', '116000', '0', 'F', '', 'f__Chlorobiaceae'] == kreport_results[5] - assert ['5.35', '116000', '0', 'G', '', 'g__Prosthecochloris'] == kreport_results[6] - assert ['5.35', '116000', '116000', 'S', '', 's__Prosthecochloris vibrioformis'] == kreport_results[7] + assert ["5.35", "116000", "0", "D", "", "d__Bacteria"] == kreport_results[0] + assert ["94.65", "2054000", "2054000", "U", "", "unclassified"] == kreport_results[ + 1 + ] + assert ["5.35", "116000", "0", "P", "", "p__Bacteroidota"] == kreport_results[2] + assert ["5.35", "116000", "0", "C", "", "c__Chlorobia"] == kreport_results[3] + assert ["5.35", "116000", "0", "O", "", "o__Chlorobiales"] == kreport_results[4] + assert ["5.35", "116000", "0", "F", "", "f__Chlorobiaceae"] == kreport_results[5] + assert ["5.35", "116000", "0", "G", "", "g__Prosthecochloris"] == kreport_results[6] + assert [ + "5.35", + "116000", + "116000", + "S", + "", + "s__Prosthecochloris vibrioformis", + ] == kreport_results[7] def test_metagenome_kreport_out_fail(runtmp): # kreport cannot be generated with gather results from < v4.5.0 - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".kreport.txt" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-dir", + outdir, + "-F", + "kreport", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0" in runtmp.last_result.err + assert ( + "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0" + in runtmp.last_result.err + ) def test_metagenome_bioboxes_stdout(runtmp): # test CAMI bioboxes format output - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') - - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', 
g_csv, '--taxonomy-csv', tax, '-F', "bioboxes") + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv") + + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-F", + "bioboxes", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -312,36 +708,97 @@ def test_metagenome_bioboxes_stdout(runtmp): assert "# Taxonomic Profiling Output" in runtmp.last_result.out assert "@SampleID:test1" in runtmp.last_result.out assert "@Version:0.10.0" in runtmp.last_result.out - assert "@Ranks:superkingdom|phylum|class|order|family|genus|species|strain" in runtmp.last_result.out + assert ( + "@Ranks:superkingdom|phylum|class|order|family|genus|species|strain" + in runtmp.last_result.out + ) assert "@__program__:sourmash" in runtmp.last_result.out assert "2 superkingdom 2 Bacteria 13.08" in runtmp.last_result.out - assert "976 phylum 2|976 Bacteria|Bacteroidota 7.27" in runtmp.last_result.out - assert "1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82" in runtmp.last_result.out - assert "200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27" in runtmp.last_result.out - assert "1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82" in runtmp.last_result.out - assert "171549 order 2|976|200643|171549 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27" in runtmp.last_result.out - assert "91347 order 2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82" in runtmp.last_result.out - assert "171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70" in runtmp.last_result.out - assert "543 family 2|1224|1236|91347|543 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82" in runtmp.last_result.out - assert "815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56" in runtmp.last_result.out - assert "838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70" in runtmp.last_result.out - assert "561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82" in runtmp.last_result.out - assert "909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56" in runtmp.last_result.out - assert "165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70" in runtmp.last_result.out - assert "562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82" in runtmp.last_result.out - assert "821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56" in runtmp.last_result.out + assert ( + "976 phylum 2|976 Bacteria|Bacteroidota 7.27" + in runtmp.last_result.out + ) + assert ( + "1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82" + in runtmp.last_result.out + ) + assert ( + "200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27" + in runtmp.last_result.out + ) + assert ( + "1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82" + in runtmp.last_result.out + ) + assert ( + "171549 order 2|976|200643|171549 
Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27" + in runtmp.last_result.out + ) + assert ( + "91347 order 2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82" + in runtmp.last_result.out + ) + assert ( + "171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70" + in runtmp.last_result.out + ) + assert ( + "543 family 2|1224|1236|91347|543 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82" + in runtmp.last_result.out + ) + assert ( + "815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56" + in runtmp.last_result.out + ) + assert ( + "838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70" + in runtmp.last_result.out + ) + assert ( + "561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82" + in runtmp.last_result.out + ) + assert ( + "909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56" + in runtmp.last_result.out + ) + assert ( + "165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70" + in runtmp.last_result.out + ) + assert ( + "562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82" + in runtmp.last_result.out + ) + assert ( + "821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56" + in runtmp.last_result.out + ) def test_metagenome_bioboxes_outfile(runtmp): # test CAMI bioboxes format output - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv") csv_base = "out" sum_csv = csv_base + ".bioboxes.profile" csvout = runtmp.output(sum_csv) outdir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-F', "bioboxes", '-o', csv_base, '--output-dir', outdir,) + runtmp.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-F", + "bioboxes", + "-o", + csv_base, + "--output-dir", + outdir, + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -349,26 +806,46 @@ def test_metagenome_bioboxes_outfile(runtmp): assert runtmp.last_result.status == 0 - bb_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + bb_results = [x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()] assert f"saving 'bioboxes' output to '{csvout}'" in runtmp.last_result.err print(bb_results) - assert ['# Taxonomic Profiling Output'] == bb_results[0] - assert ['@SampleID:test1'] == bb_results[1] - assert ['2', 'superkingdom', '2', 'Bacteria', '13.08'] == bb_results[6] - assert ['838', 'genus', '2|976|200643|171549|171552|838', 'Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella', '5.70'] == bb_results[16] + assert ["# Taxonomic Profiling Output"] == bb_results[0] + assert ["@SampleID:test1"] == bb_results[1] + assert ["2", "superkingdom", "2", "Bacteria", "13.08"] == bb_results[6] + assert [ + "838", + 
"genus", + "2|976|200643|171549|171552|838", + "Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella", + "5.70", + ] == bb_results[16] def test_metagenome_krona_tsv_out(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" kr_csv = csv_base + ".krona.tsv" csvout = runtmp.output(kr_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, - '--output-format', 'krona', '--rank', 'genus', '--output-dir', outdir) + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "krona", + "--rank", + "genus", + "--output-dir", + outdir, + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -378,27 +855,82 @@ def test_metagenome_krona_tsv_out(runtmp): assert os.path.exists(csvout) assert f"saving 'krona' output to '{csvout}'" in runtmp.last_result.err - gn_krona_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + gn_krona_results = [ + x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines() + ] print("species krona results: \n", gn_krona_results) - assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus'] == gn_krona_results[0] - assert ['0.0885520542481053', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[1] - assert ['0.08815317112086159', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] == gn_krona_results[2] - assert ['0.027522935779816515', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] - assert ['0.7957718388512166', 'unclassified', 'unclassified', 'unclassified', 'unclassified', 'unclassified', 'unclassified'] == gn_krona_results[4] + assert [ + "fraction", + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + ] == gn_krona_results[0] + assert [ + "0.0885520542481053", + "d__Bacteria", + "p__Bacteroidota", + "c__Bacteroidia", + "o__Bacteroidales", + "f__Bacteroidaceae", + "g__Prevotella", + ] == gn_krona_results[1] + assert [ + "0.08815317112086159", + "d__Bacteria", + "p__Proteobacteria", + "c__Gammaproteobacteria", + "o__Enterobacterales", + "f__Enterobacteriaceae", + "g__Escherichia", + ] == gn_krona_results[2] + assert [ + "0.027522935779816515", + "d__Bacteria", + "p__Bacteroidota", + "c__Bacteroidia", + "o__Bacteroidales", + "f__Bacteroidaceae", + "g__Phocaeicola", + ] == gn_krona_results[3] + assert [ + "0.7957718388512166", + "unclassified", + "unclassified", + "unclassified", + "unclassified", + "unclassified", + "unclassified", + ] == gn_krona_results[4] def test_metagenome_lineage_summary_out(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" lin_csv = csv_base + ".lineage_summary.tsv" csvout = runtmp.output(lin_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '-o', csv_base, 
'--output-format', 'lineage_summary', '--rank', - 'genus', '--output-dir', outdir) + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "lineage_summary", + "--rank", + "genus", + "--output-dir", + outdir, + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -408,26 +940,50 @@ def test_metagenome_lineage_summary_out(runtmp): assert os.path.exists(csvout) assert f"saving 'lineage_summary' output to '{csvout}'" in runtmp.last_result.err - gn_lineage_summary = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + gn_lineage_summary = [ + x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines() + ] print("species lineage summary results: \n", gn_lineage_summary) - assert ['lineage', 'test1'] == gn_lineage_summary[0] - assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola', '0.027522935779816515'] == gn_lineage_summary[1] - assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella', '0.0885520542481053'] == gn_lineage_summary[2] - assert ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia', '0.08815317112086159'] == gn_lineage_summary[3] - assert ['unclassified', '0.7957718388512166'] == gn_lineage_summary[4] + assert ["lineage", "test1"] == gn_lineage_summary[0] + assert [ + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola", + "0.027522935779816515", + ] == gn_lineage_summary[1] + assert [ + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella", + "0.0885520542481053", + ] == gn_lineage_summary[2] + assert [ + "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia", + "0.08815317112086159", + ] == gn_lineage_summary[3] + assert ["unclassified", "0.7957718388512166"] == gn_lineage_summary[4] def test_metagenome_human_format_out(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" - csvout = runtmp.output(csv_base + '.human.txt') + csvout = runtmp.output(csv_base + ".human.txt") outdir = os.path.dirname(csvout) print("csvout: ", csvout) - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '-o', csv_base, '--output-format', 'human', '--rank', - 'genus', '--output-dir', outdir) + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "human", + "--rank", + "genus", + "--output-dir", + outdir, + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -441,104 +997,192 @@ def test_metagenome_human_format_out(runtmp): outp = fp.readlines() assert len(outp) == 6 - outp = [ x.strip() for x in outp ] + outp = [x.strip() for x in outp] print(outp) - assert outp[0] == 'sample name proportion cANI lineage' - assert outp[1] == '----------- ---------- ---- -------' - assert outp[2] == 'test1 86.9% - unclassified' - assert outp[3] == 'test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia' - assert outp[4] == 'test1 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella' - 
assert outp[5] == 'test1 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola' + assert outp[0] == "sample name proportion cANI lineage" + assert outp[1] == "----------- ---------- ---- -------" + assert outp[2] == "test1 86.9% - unclassified" + assert ( + outp[3] + == "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" + ) + assert ( + outp[4] + == "test1 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" + ) + assert ( + outp[5] + == "test1 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" + ) def test_metagenome_no_taxonomy_fail(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '-g', g_csv) - assert "error: the following arguments are required: -t/--taxonomy-csv" in str(exc.value) + c.run_sourmash("tax", "metagenome", "-g", g_csv) + assert "error: the following arguments are required: -t/--taxonomy-csv" in str( + exc.value + ) def test_metagenome_no_rank_lineage_summary(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary') + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "lineage_summary", + ) print(str(exc.value)) - assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value) + assert ( + "Rank (--rank) is required for krona, lineage_summary output formats." + in str(exc.value) + ) def test_metagenome_no_rank_krona(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "krona", + ) print(str(exc.value)) - assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value) + assert ( + "Rank (--rank) is required for krona, lineage_summary output formats." 
+ in str(exc.value) + ) def test_metagenome_bad_rank_krona(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', 'NotARank') + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "krona", + "--rank", + "NotARank", + ) print(str(exc.value)) - assert "Invalid '--rank'/'--position' input: 'NotARank'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err + assert ( + "Invalid '--rank'/'--position' input: 'NotARank'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" + in runtmp.last_result.err + ) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', '5') + runtmp.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "krona", + "--rank", + "5", + ) print(str(exc.value)) - assert "Invalid '--rank'/'--position' input: '5'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err + assert ( + "Invalid '--rank'/'--position' input: '5'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" + in runtmp.last_result.err + ) def test_genome_no_rank_krona(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') + runtmp.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-o", + csv_base, + "--output-format", + "krona", + ) assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value) def test_metagenome_rank_not_available(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'strain') + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--rank", "strain" + ) print(str(exc.value)) assert c.last_result.status == -1 - assert "No taxonomic information provided for rank strain: cannot summarize at this rank" in str(exc.value) + assert ( + "No taxonomic information provided for rank strain: cannot summarize at this rank" + in str(exc.value) + ) def test_metagenome_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = 
runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1] + 'FOO') # add first tax_assign again + tax.append(tax[1] + "FOO") # add first tax_assign again dup.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv) + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", duplicated_csv + ) assert "cannot read taxonomy" in str(exc.value) assert "multiple lineages for identifier GCF_001881345" in str(exc.value) @@ -547,16 +1191,18 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp): def test_metagenome_duplicated_taxonomy_force(runtmp): c = runtmp # write temp taxonomy with duplicates - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1]) # add first tax_assign again dup.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, '--force') + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", duplicated_csv, "--force" + ) print(c.last_result.status) print(c.last_result.out) @@ -564,55 +1210,105 @@ def test_metagenome_duplicated_taxonomy_force(runtmp): # same as stdout test - just check the first few lines assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out - assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000" + in c.last_result.out + ) def test_metagenome_missing_taxonomy(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with 
open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] subset.write("\n".join(tax[:4])) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv) + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", subset_csv) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_003471795" in c.last_result.err - - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.193,d__Bacteria,md5,test1.sig,0.124,970000'in c.last_result.out - assert 'test1,superkingdom,0.807,unclassified,md5,test1.sig,0.876,4044000' in c.last_result.out - assert 'test1,phylum,0.105,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.066,528000' in c.last_result.out - assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,phylum,0.807,unclassified,md5,test1.sig,0.876,4044000' in c.last_result.out - assert 'test1,class,0.105,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.066,528000' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_003471795" + in c.last_result.err + ) + + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.193,d__Bacteria,md5,test1.sig,0.124,970000" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.807,unclassified,md5,test1.sig,0.876,4044000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.105,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.066,528000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000" + in c.last_result.out + ) + assert ( + "test1,phylum,0.807,unclassified,md5,test1.sig,0.876,4044000" + in c.last_result.out + ) + assert ( + "test1,class,0.105,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.066,528000" + in c.last_result.out + ) def test_metagenome_missing_fail_taxonomy(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] subset.write("\n".join(tax[:4])) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy') + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "--fail-on-missing-taxonomy", + ) print(str(exc.value)) @@ -624,162 +1320,315 @@ def test_metagenome_missing_fail_taxonomy(runtmp): def test_metagenome_multiple_taxonomy_files_missing(runtmp): c = runtmp # write temp taxonomy with duplicates - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") # gather against mult 
databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '--force') + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", taxonomy_csv, "--force" + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert "of 6 gather results, lineage assignments for 2 results were missed" in c.last_result.err - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,d__Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out - assert 'multtest,phylum,0.116,d__Bacteria;p__Bacteroidota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,d__Bacteria;p__Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out - assert 'multtest,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,class,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out + assert ( + "of 6 gather results, lineage assignments for 2 results were missed" + in c.last_result.err + ) + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,d__Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.116,d__Bacteria;p__Bacteroidota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,d__Bacteria;p__Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000" + in c.last_result.out + ) + assert ( + "multtest,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,class,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000" + in c.last_result.out + ) def test_metagenome_multiple_taxonomy_files(runtmp): c = runtmp # write temp taxonomy with duplicates - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - protozoa_genbank = 
utils.get_test_data('tax/protozoa_genbank_lineage.csv') - bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv") # gather against mult databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, protozoa_genbank, bacteria_refseq) + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + taxonomy_csv, + protozoa_genbank, + bacteria_refseq, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args(runtmp): c = runtmp # pass in 
mult tax files using mult tax arguments - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv") # gather against mult databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq) + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + taxonomy_csv, + "-t", + protozoa_genbank, + "-t", + bacteria_refseq, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + 
"multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args_empty_force(runtmp): # pass in mult tax files using mult tax arguments, with one empty, # and use --force c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv") - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) # gather against mult databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq, '-t', tax_empty, '--force') + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + taxonomy_csv, + "-t", + protozoa_genbank, + "-t", + bacteria_refseq, + "-t", + tax_empty, + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + 
"multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) def test_metagenome_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") - #creates empty gather result - g_csv = runtmp.output('g.csv') + # creates empty gather result + g_csv = runtmp.output("g.csv") with open(g_csv, "w") as fp: fp.write("") print("g_csv: ", g_csv) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax) - assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value) + assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str( + exc.value + ) assert runtmp.last_result.status == -1 def test_metagenome_bad_gather_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #creates bad gather result - bad_g = [x.replace("query_bp", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # creates bad gather result + bad_g = [ + x.replace("query_bp", "nope") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", bad_g_csv, "--taxonomy-csv", tax) print(str(exc.value)) - assert 'is missing columns needed for taxonomic summarization.' in str(exc.value) + assert "is missing columns needed for taxonomic summarization." 
in str(exc.value) assert runtmp.last_result.status == -1 def test_metagenome_empty_tax_lineage_input(runtmp): # test an empty tax CSV - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax_empty + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -791,16 +1640,17 @@ def test_metagenome_empty_tax_lineage_input(runtmp): def test_metagenome_empty_tax_lineage_input_force(runtmp): # test an empty tax CSV with --force - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty, '--force') + runtmp.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax_empty, "--force" + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -811,17 +1661,17 @@ def test_metagenome_empty_tax_lineage_input_force(runtmp): def test_metagenome_perfect_match_warning(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - perfect_g_csv = runtmp.output('g.csv') + perfect_g_csv = runtmp.output("g.csv") - #create a perfect gather result - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp, delimiter=',') + # create a perfect gather result + with open(g_csv) as fp: + r = csv.DictReader(fp, delimiter=",") header = r.fieldnames print(header) - with open(perfect_g_csv, 'w') as out_fp: + with open(perfect_g_csv, "w") as out_fp: w = csv.DictWriter(out_fp, header) w.writeheader() for n, row in enumerate(r): @@ -834,28 +1684,31 @@ def test_metagenome_perfect_match_warning(runtmp): w.writerow(row) print(row) - runtmp.run_sourmash('tax', 'metagenome', '-g', perfect_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", perfect_g_csv, "--taxonomy-csv", tax) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == 0 - assert "WARNING: 100% match! Is query 'test1' identical to its database match, 'GCF_001881345'?" in runtmp.last_result.err + assert ( + "WARNING: 100% match! Is query 'test1' identical to its database match, 'GCF_001881345'?" 
+ in runtmp.last_result.err + ) def test_metagenome_over100percent_error(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - perfect_g_csv = runtmp.output('g.csv') + perfect_g_csv = runtmp.output("g.csv") - #create a perfect gather result - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp, delimiter=',') + # create a perfect gather result + with open(g_csv) as fp: + r = csv.DictReader(fp, delimiter=",") header = r.fieldnames print(header) - with open(perfect_g_csv, 'w') as out_fp: + with open(perfect_g_csv, "w") as out_fp: w = csv.DictWriter(out_fp, header) w.writeheader() for n, row in enumerate(r): @@ -866,49 +1719,72 @@ def test_metagenome_over100percent_error(runtmp): print(row) with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'metagenome', '-g', perfect_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash( + "tax", "metagenome", "-g", perfect_g_csv, "--taxonomy-csv", tax + ) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert "fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err + assert ( + "fraction is > 100% of the query! This should not be possible." + in runtmp.last_result.err + ) def test_metagenome_gather_duplicate_query(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv) + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + ) assert c.last_result.status == -1 print(str(exc.value)) - assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value) + assert ( + "Gather query test1 was found in more than one CSV. Cannot load from " + in str(exc.value) + ) def test_metagenome_gather_duplicate_query_force(runtmp): # do not load same query from multiple files. 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '--force') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--force", + ) print(c.last_result.status) print(c.last_result.out) @@ -923,18 +1799,27 @@ def test_metagenome_gather_duplicate_query_force(runtmp): def test_metagenome_two_queries_human_output(runtmp): # do not load same query from multiple files. c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "human") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "human", + ) print(c.last_result.status) print(c.last_result.out) @@ -942,9 +1827,15 @@ def test_metagenome_two_queries_human_output(runtmp): assert c.last_result.status == 0 assert "test1 86.9% - unclassified" in c.last_result.out - assert "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert ( + "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in c.last_result.out + ) assert "test2 86.9% - unclassified" in c.last_result.out - assert "test2 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert ( + "test2 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in c.last_result.out + ) assert "test2 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" assert "test2 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" @@ -952,22 +1843,36 @@ def test_metagenome_two_queries_human_output(runtmp): def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): # fail on multiple queries with single query output formats c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = 
utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) csv_summary_out = runtmp.output("tst.summarized.csv") kreport_out = runtmp.output("tst.kreport.txt") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "csv_summary", "kreport", "--rank", "phylum", "-o", "tst") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "csv_summary", + "kreport", + "--rank", + "phylum", + "-o", + "tst", + ) print(str(exc.value)) assert not os.path.exists(csv_summary_out) @@ -975,29 +1880,47 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): assert c.last_result.status == -1 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err - assert "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" in c.last_result.err + assert ( + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + in c.last_result.err + ) assert "ERROR: No output formats remaining." in c.last_result.err def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): # remove single-query outputs when working with multiple queries c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) csv_summary_out = runtmp.output("tst.summarized.csv") kreport_out = runtmp.output("tst.kreport.txt") lineage_summary_out = runtmp.output("tst.lineage_summary.tsv") - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "csv_summary", "kreport", "lineage_summary", "--rank", "phylum", "-o", "tst") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "csv_summary", + "kreport", + "lineage_summary", + "--rank", + "phylum", + "-o", + "tst", + ) assert not os.path.exists(csv_summary_out) assert not os.path.exists(kreport_out) @@ -1005,32 +1928,52 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): assert c.last_result.status == 0 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err - assert "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" in c.last_result.err + assert ( + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + in c.last_result.err + ) def test_metagenome_two_queries_krona(runtmp): # for now, we enable multi-query krona. Is this desired? 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "krona", '--rank', 'superkingdom') + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "krona", + "--rank", + "superkingdom", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: results from more than one query found. Krona summarization not recommended." in c.last_result.err - assert "Percentage assignment will be normalized by the number of queries to maintain range 0-100%" in c.last_result.err + assert ( + "WARNING: results from more than one query found. Krona summarization not recommended." + in c.last_result.err + ) + assert ( + "Percentage assignment will be normalized by the number of queries to maintain range 0-100%" + in c.last_result.err + ) assert "fraction superkingdom" in c.last_result.out assert "0.2042281611487834 d__Bacteria" in c.last_result.out assert "0.7957718388512166 unclassified" in c.last_result.out @@ -1040,108 +1983,150 @@ def test_metagenome_gather_duplicate_filename(runtmp): # test that a duplicate filename is properly flagged, when passed in # twice to a single -g argument. 
    c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res, '--taxonomy-csv', taxonomy_csv)
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_res,
+        g_res,
+        "--taxonomy-csv",
+        taxonomy_csv,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}'
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )


 def test_metagenome_gather_duplicate_filename_2(runtmp):
     # test that a duplicate filename is properly flagged, with -g a -g b
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv)
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_res,
+        "-g",
+        g_res,
+        "--taxonomy-csv",
+        taxonomy_csv,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}'
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )


 def test_metagenome_gather_duplicate_filename_from_file(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
     g_from_file = runtmp.output("tmp-from-file.txt")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{g_res}\n")
         f_csv.write(f"{g_res}\n")

-    c.run_sourmash('tax', 'metagenome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv)
+    c.run_sourmash(
+        "tax", "metagenome", "--from-file", g_from_file, "--taxonomy-csv", taxonomy_csv
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}'
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )


 def test_genome_empty_gather_results(runtmp):
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

-    #creates empty gather result
-    g_csv = runtmp.output('g.csv')
+    # creates empty gather result
+    g_csv = runtmp.output("g.csv")
     with open(g_csv, "w") as fp:
         fp.write("")
     print("g_csv: ", g_csv)

     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)
+        runtmp.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax)

     assert runtmp.last_result.status == -1
     print(runtmp.last_result.err)
     print(runtmp.last_result.out)

-    assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value)
+    assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(
+        exc.value
+    )


 def test_genome_bad_gather_header(runtmp):
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

-    bad_g_csv = runtmp.output('g.csv')
+    bad_g_csv = runtmp.output("g.csv")

-    #creates bad gather result
-    bad_g = [x.replace("f_unique_to_query", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()]
-    with open(bad_g_csv, 'w') as fp:
+    # creates bad gather result
+    bad_g = [
+        x.replace("f_unique_to_query", "nope") + "\n"
+        for x in Path(g_csv).read_text().splitlines()
+    ]
+    with open(bad_g_csv, "w") as fp:
         fp.writelines(bad_g)
     print("bad_gather_results: \n", bad_g)

     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'genome', '-g', bad_g_csv, '--taxonomy-csv', tax)
+        runtmp.run_sourmash("tax", "genome", "-g", bad_g_csv, "--taxonomy-csv", tax)

-    assert 'is missing columns needed for taxonomic summarization.' in str(exc.value)
+    assert "is missing columns needed for taxonomic summarization." 
in str(exc.value) assert runtmp.last_result.status == -1 def test_genome_empty_tax_lineage_input(runtmp): # test an empty tax csv - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax_empty) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -1155,66 +2140,124 @@ def test_genome_rank_stdout_0(runtmp): # test basic genome c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_rank_stdout_0_db(runtmp): # test basic genome with sqlite database c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.db') - - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', - tax, '--rank', 'species', '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.db") + + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) # too stringent of containment threshold: - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, 
'--taxonomy-csv', - tax, '--rank', 'species', '--containment-threshold', '1.0') + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "1.0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000," in c.last_result.out + assert ( + "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000," + in c.last_result.out + ) def test_genome_rank_csv_0(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" cl_csv = csv_base + ".classifications.csv" csvout = runtmp.output(cl_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1223,25 +2266,46 @@ def test_genome_rank_csv_0(runtmp): assert f"saving 'classification' output to '{csvout}'" in runtmp.last_result.err assert c.last_result.status == 0 cl_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in cl_results[0] - assert 'test1,match,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000' in cl_results[1] + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in cl_results[0] + ) + assert ( + "test1,match,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000" + in cl_results[1] + ) def test_genome_rank_krona(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" cl_csv = csv_base + ".krona.tsv" csvout = runtmp.output(cl_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'krona', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + "krona", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1249,26 +2313,59 @@ def 
test_genome_rank_krona(runtmp): assert f"saving 'krona' output to '{csvout}'" in runtmp.last_result.err assert c.last_result.status == 0 - kr_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + kr_results = [x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()] print(kr_results) - assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] == kr_results[0] - assert ['0.0885520542481053', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella', 's__Prevotella copri'] == kr_results[1] + assert [ + "fraction", + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] == kr_results[0] + assert [ + "0.0885520542481053", + "d__Bacteria", + "p__Bacteroidota", + "c__Bacteroidia", + "o__Bacteroidales", + "f__Bacteroidaceae", + "g__Prevotella", + "s__Prevotella copri", + ] == kr_results[1] def test_genome_rank_human_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" - csvout = runtmp.output(csv_base + '.human.txt') + csvout = runtmp.output(csv_base + ".human.txt") outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'human', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + "human", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1282,27 +2379,45 @@ def test_genome_rank_human_output(runtmp): print(outp) assert len(outp) == 3 - outp = [ x.strip() for x in outp ] + outp = [x.strip() for x in outp] - assert outp[0] == 'sample name status proportion cANI lineage' - assert outp[1] == '----------- ------ ---------- ---- -------' - assert outp[2] == 'test1 match 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri' + assert outp[0] == "sample name status proportion cANI lineage" + assert outp[1] == "----------- ------ ---------- ---- -------" + assert ( + outp[2] + == "test1 match 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" + ) def test_genome_rank_lineage_csv_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" - csvout = runtmp.output(csv_base + '.lineage.csv') + csvout = runtmp.output(csv_base + ".lineage.csv") outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'lineage_csv', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + 
"lineage_csv", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1314,169 +2429,291 @@ def test_genome_rank_lineage_csv_output(runtmp): outp = fp.readlines() assert len(outp) == 2 - outp = [ x.strip() for x in outp ] + outp = [x.strip() for x in outp] - assert outp[0] == 'ident,superkingdom,phylum,class,order,family,genus,species' - assert outp[1] == 'test1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri' + assert outp[0] == "ident,superkingdom,phylum,class,order,family,genus,species" + assert ( + outp[1] + == "test1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri" + ) def test_genome_gather_from_file_rank(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_two_files(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make test2 results (identical to test1 except query_name and filename) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) - c.run_sourmash('tax', 'genome', '-g', g_res, g_res2, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out 
- assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_two_files_empty_force(runtmp): # make test2 results (identical to test1 except query_name and filename) # add an empty file too, with --force -> should work c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") - g_empty_csv = runtmp.output('g_empty.csv') + g_empty_csv = runtmp.output("g_empty.csv") with open(g_empty_csv, "w") as fp: fp.write("") print("g_csv: ", g_empty_csv) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) - c.run_sourmash('tax', 'genome', '-g', g_res, g_res2, '-g', g_empty_csv, - '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0', - '--force') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "-g", + g_empty_csv, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_duplicate_filename(runtmp): c = runtmp - taxonomy_csv = 
utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'genome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv,
-                   '--rank', 'species', '--containment-threshold', '0')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "--gather-csv",
+        g_res,
+        "-g",
+        g_res,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--rank",
+        "species",
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}'
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_gather_from_file_duplicate_filename(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
     g_from_file = runtmp.output("tmp-from-file.txt")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{g_res}\n")
         f_csv.write(f"{g_res}\n")

-    c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
-                   '--rank', 'species', '--containment-threshold', '0')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "--from-file",
+        g_from_file,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--rank",
+        "species",
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}'
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_gather_from_file_duplicate_query(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")

     # different filename, contents identical to test1
     g_res2 =
runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res2}\n") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) assert c.last_result.status == -1 print(str(exc.value)) - assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value) + assert ( + "Gather query test1 was found in more than one CSV. Cannot load from " + in str(exc.value) + ) def test_genome_gather_from_file_duplicate_query_force(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res2}\n") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0', '--force') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) print(c.last_result.status) print(c.last_result.out) @@ -1490,70 +2727,119 @@ def test_genome_gather_from_file_duplicate_query_force(runtmp): def test_genome_gather_cli_and_from_file(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") # make test2 results (identical to test1 except query_name) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) # write test2 csv to a text file for input g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res2}\n") - c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) 
print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_cli_and_from_file_duplicate_filename(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") # also write test1 csv to a text file for input g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_from_file_below_threshold(runtmp): # What do we want the results from this to be? I think I initially thought we shouldn't report anything, # but wouldn't a "below_threshold" + superkingdom result (here, 0.204) be helpful information? 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--containment-threshold', '1') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "1", + ) print(c.last_result.status) print(c.last_result.out) @@ -1565,53 +2851,75 @@ def test_genome_gather_from_file_below_threshold(runtmp): def test_genome_gather_two_queries(runtmp): - ''' + """ This checks for initial bug where classification would only happen for one genome per rank when doing --containment-threshold classification - ''' + """ c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") # split 47+63 into two fake queries: q47, q63 g_res2 = runtmp.output("two-queries.gather.csv") q2_results = [x + "\n" for x in Path(g_res).read_text().splitlines()] # rename queries - q2_results[1] = q2_results[1].replace('47+63', 'q47') - q2_results[2] = q2_results[2].replace('47+63', 'q63') - with open(g_res2, 'w') as fp: + q2_results[1] = q2_results[1].replace("47+63", "q47") + q2_results[2] = q2_results[2].replace("47+63", "q63") + with open(g_res2, "w") as fp: for line in q2_results: print(line) fp.write(line) - c.run_sourmash('tax', 'genome', '-g', g_res2, '--taxonomy-csv', taxonomy_csv, - '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 assert "query_name,status,rank,fraction,lineage" in c.last_result.out - assert "q63,match,species,0.336,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,491c0a81," in c.last_result.out - assert "q47,match,species,0.664,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica," in c.last_result.out + assert ( + "q63,match,species,0.336,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,491c0a81," + in c.last_result.out + ) + assert ( + "q47,match,species,0.664,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica," + in c.last_result.out + ) def test_genome_rank_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1] + 'FOO') # add first tax_assign again + tax.append(tax[1] + "FOO") # add first 
tax_assign again dup.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, - '--rank', 'species') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + duplicated_csv, + "--rank", + "species", + ) assert "cannot read taxonomy assignments" in str(exc.value) assert "multiple lineages for identifier GCF_001881345" in str(exc.value) @@ -1620,16 +2928,16 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): # write temp taxonomy with duplicates => lineages-style file c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") taxdb = tax_utils.LineageDB.load(taxonomy_csv) for k, v in taxdb.items(): print(k, v) - lineage_csv = runtmp.output('lin.csv') - with open(lineage_csv, 'w', newline="") as fp: + lineage_csv = runtmp.output("lin.csv") + with open(lineage_csv, "w", newline="") as fp: w = csv.writer(fp) - w.writerow(['name', 'lineage']) + w.writerow(["name", "lineage"]) for k, v in taxdb.items(): linstr = lca_utils.display_lineage(v) w.writerow([k, linstr]) @@ -1640,7 +2948,7 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): w.writerow([k, linstr]) with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'summarize', lineage_csv) + c.run_sourmash("tax", "summarize", lineage_csv) print(c.last_result.out) print(c.last_result.err) @@ -1651,174 +2959,292 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): def test_genome_rank_duplicated_taxonomy_force(runtmp): # write temp taxonomy with duplicates c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1]) # add first tax_assign again dup.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, - '--rank', 'species', '--force', '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + duplicated_csv, + "--rank", + "species", + "--force", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_missing_taxonomy_ignore_threshold(runtmp): c = runtmp # write 
temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + in c.last_result.err + ) + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_missing_taxonomy_recover_with_second_tax_file(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "-t", + taxonomy_csv, + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + not in c.last_result.err + ) + 
assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_missing_taxonomy_ignore_rank(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species') + c.run_sourmash( + "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, "--rank", "species" + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + in c.last_result.err + ) + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_multiple_taxonomy_files(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") # using mult -t args - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv) + c.run_sourmash( + "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, "-t", taxonomy_csv + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - 
assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + not in c.last_result.err + ) + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000," + in c.last_result.out + ) # using single -t arg - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, taxonomy_csv) + c.run_sourmash( + "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, taxonomy_csv + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + not in c.last_result.err + ) + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000," + in c.last_result.out + ) def test_genome_multiple_taxonomy_files_empty_force(runtmp): c = runtmp # write temp taxonomy with missing entry, as well as an empty file, # and use force - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") - empty_tax = runtmp.output('tax_empty.txt') + empty_tax = runtmp.output("tax_empty.txt") with open(empty_tax, "w") as fp: fp.write("") - + # using mult -t args - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '-t', empty_tax, '--force') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "-t", + taxonomy_csv, + "-t", + empty_tax, + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out + assert ( + "The following are missing from the taxonomy information: GCF_001881345" + not in 
c.last_result.err + ) + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000," + in c.last_result.out + ) def test_genome_missing_taxonomy_fail_threshold(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, - '--fail-on-missing-taxonomy', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "--fail-on-missing-taxonomy", + "--containment-threshold", + "0", + ) print(str(exc.value)) print(c.last_result.status) @@ -1833,18 +3259,27 @@ def test_genome_missing_taxonomy_fail_threshold(runtmp): def test_genome_missing_taxonomy_fail_rank(runtmp): c = runtmp # write temp taxonomy with missing entry - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") subset_csv = runtmp.output("subset_taxonomy.csv") - with open(subset_csv, 'w') as subset: + with open(subset_csv, "w") as subset: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) subset.write("\n".join(tax)) - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, - '--fail-on-missing-taxonomy', '--rank', 'species') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + subset_csv, + "--fail-on-missing-taxonomy", + "--rank", + "species", + ) print(str(exc.value)) print(c.last_result.status) @@ -1859,12 +3294,22 @@ def test_genome_missing_taxonomy_fail_rank(runtmp): def test_genome_rank_not_available(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'strain', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "strain", + "--containment-threshold", + "0", + ) print(str(exc.value)) print(c.last_result.status) @@ -1872,22 +3317,32 @@ def test_genome_rank_not_available(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert "No taxonomic information provided for rank strain: cannot classify at this rank" in 
str(exc.value) + assert ( + "No taxonomic information provided for rank strain: cannot classify at this rank" + in str(exc.value) + ) def test_genome_empty_gather_results_with_header_single(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") gather_results = [x for x in Path(g_csv).read_text().splitlines()] - empty_gather_with_header = runtmp.output('g_header.csv') + empty_gather_with_header = runtmp.output("g_header.csv") # write temp empty gather results (header only) with open(empty_gather_with_header, "w") as fp: fp.write(gather_results[0]) with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', empty_gather_with_header, '--taxonomy-csv', taxonomy_csv) + c.run_sourmash( + "tax", + "genome", + "-g", + empty_gather_with_header, + "--taxonomy-csv", + taxonomy_csv, + ) print(str(exc.value)) print(c.last_result.status) @@ -1895,44 +3350,48 @@ def test_genome_empty_gather_results_with_header_single(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert f'No gather results loaded from {empty_gather_with_header}.' in str(exc.value) - assert 'Exiting.' in str(exc.value) + assert f"No gather results loaded from {empty_gather_with_header}." in str( + exc.value + ) + assert "Exiting." in str(exc.value) def test_genome_empty_gather_results_single(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") # write temp empty gather results - empty_tax = runtmp.output('tax_header.csv') + empty_tax = runtmp.output("tax_header.csv") with open(empty_tax, "w") as fp: fp.write("") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv) - + c.run_sourmash("tax", "genome", "-g", empty_tax, "--taxonomy-csv", taxonomy_csv) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == -1 - assert f"Cannot read gather results from '{empty_tax}'. Is file empty?" in str(exc.value) - assert 'Exiting.' in c.last_result.err + assert f"Cannot read gather results from '{empty_tax}'. Is file empty?" in str( + exc.value + ) + assert "Exiting." in c.last_result.err def test_genome_empty_gather_results_single_force(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") # write temp empty gather results (header only) - empty_tax = runtmp.output('tax_header.csv') + empty_tax = runtmp.output("tax_header.csv") with open(empty_tax, "w") as fp: fp.write("") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, - '--force') + c.run_sourmash( + "tax", "genome", "-g", empty_tax, "--taxonomy-csv", taxonomy_csv, "--force" + ) print(str(exc.value)) print(c.last_result.status) @@ -1940,26 +3399,40 @@ def test_genome_empty_gather_results_single_force(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value) - assert 'No results for classification. Exiting.' in str(exc.value) + assert ( + "--force is set. Attempting to continue to next set of gather results." 
+ in str(exc.value) + ) + assert "No results for classification. Exiting." in str(exc.value) def test_genome_empty_gather_results_with_empty_csv_force(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") # write temp empty gather results - empty_tax = runtmp.output('tax_empty.txt') + empty_tax = runtmp.output("tax_empty.txt") with open(empty_tax, "w") as fp: fp.write("") g_from_file = runtmp.output("tmp-from-csv.csv") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{empty_tax}\n") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file, - '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') + c.run_sourmash( + "tax", + "genome", + "-g", + empty_tax, + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--force", + ) print(str(exc.value)) print(c.last_result.status) @@ -1967,48 +3440,80 @@ def test_genome_empty_gather_results_with_empty_csv_force(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value) - assert 'No results for classification. Exiting.' in str(exc.value) + assert ( + "--force is set. Attempting to continue to next set of gather results." + in str(exc.value) + ) + assert "No results for classification. Exiting." in str(exc.value) def test_genome_empty_gather_results_with_csv_force(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") - g_res = utils.get_test_data('tax/test1.gather.csv') + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") # write temp empty gather results - empty_tax = runtmp.output('tax_empty.csv') + empty_tax = runtmp.output("tax_empty.csv") with open(empty_tax, "w") as fp: fp.write("") - c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file, - '--taxonomy-csv', taxonomy_csv, '--rank', 'species', - '--containment-threshold', '0', '--force') + c.run_sourmash( + "tax", + "genome", + "-g", + empty_tax, + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert '--force is set. Attempting to continue to next set of gather results.' in c.last_result.err - assert 'loaded results for 1 queries from 1 gather CSVs' in c.last_result.err - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "--force is set. Attempting to continue to next set of gather results." 
+ in c.last_result.err + ) + assert "loaded results for 1 queries from 1 gather CSVs" in c.last_result.err + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_containment_threshold_bounds(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") below_threshold = "-1" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax, - '--containment-threshold', below_threshold) + c.run_sourmash( + "tax", + "genome", + "-g", + tax, + "--taxonomy-csv", + tax, + "--containment-threshold", + below_threshold, + ) print(c.last_result.status) print(c.last_result.out) @@ -2017,8 +3522,16 @@ def test_genome_containment_threshold_bounds(runtmp): above_threshold = "1.1" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--containment-threshold', above_threshold) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--containment-threshold", + above_threshold, + ) print(c.last_result.status) print(c.last_result.out) @@ -2028,13 +3541,21 @@ def test_genome_containment_threshold_bounds(runtmp): def test_genome_containment_threshold_type(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") not_a_float = "str" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--containment-threshold', not_a_float) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--containment-threshold", + not_a_float, + ) print(c.last_result.status) print(c.last_result.out) @@ -2043,17 +3564,17 @@ def test_genome_containment_threshold_type(runtmp): def test_genome_over100percent_error(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - perfect_g_csv = runtmp.output('g.csv') + perfect_g_csv = runtmp.output("g.csv") - #create an impossible gather result - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp, delimiter=',') + # create an impossible gather result + with open(g_csv) as fp: + r = csv.DictReader(fp, delimiter=",") header = r.fieldnames print(header) - with open(perfect_g_csv, 'w') as out_fp: + with open(perfect_g_csv, "w") as out_fp: w = csv.DictWriter(out_fp, header) w.writeheader() for n, row in enumerate(r): @@ -2063,25 +3584,36 @@ def test_genome_over100percent_error(runtmp): print(row) with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'genome', '-g', perfect_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "genome", "-g", perfect_g_csv, "--taxonomy-csv", tax) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert 
"fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err + assert ( + "fraction is > 100% of the query! This should not be possible." + in runtmp.last_result.err + ) def test_genome_ani_threshold_input_errors(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_old.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather_old.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") below_threshold = "-1" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax, - '--ani-threshold', below_threshold) + c.run_sourmash( + "tax", + "genome", + "-g", + tax, + "--taxonomy-csv", + tax, + "--ani-threshold", + below_threshold, + ) print(c.last_result.status) print(c.last_result.out) @@ -2090,8 +3622,16 @@ def test_genome_ani_threshold_input_errors(runtmp): above_threshold = "1.1" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--ani-threshold', above_threshold) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--ani-threshold", + above_threshold, + ) print(c.last_result.status) print(c.last_result.out) @@ -2101,8 +3641,16 @@ def test_genome_ani_threshold_input_errors(runtmp): not_a_float = "str" with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--ani-threshold', not_a_float) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--ani-threshold", + not_a_float, + ) print(c.last_result.status) print(c.last_result.out) @@ -2112,49 +3660,76 @@ def test_genome_ani_threshold_input_errors(runtmp): def test_genome_ani_threshold(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--ani-threshold', "0.93") # note: I think this was previously a bug, if 0.95 produced the result below... + c.run_sourmash( + "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--ani-threshold", "0.93" + ) # note: I think this was previously a bug, if 0.95 produced the result below... 
print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,0.93' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,0.93" + in c.last_result.out + ) # more lax threshold - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--ani-threshold', "0.9") + c.run_sourmash( + "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--ani-threshold", "0.9" + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) # too stringent of threshold (using rank) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--ani-threshold', "1.0", '--rank', 'species') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--ani-threshold", + "1.0", + "--rank", + "species", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92" in c.last_result.out + assert ( + "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92" + in c.last_result.out + ) def test_genome_ani_oldgather(runtmp): # now fail if using gather <4.4 c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_old.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather_old.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) - assert "is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." in str(exc.value) + c.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax) + assert ( + "is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." 
+ in str(exc.value) + ) assert c.last_result.status == -1 @@ -2164,11 +3739,10 @@ def test_genome_ani_lemonade_classify(runtmp): c = runtmp ## first run gather - genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz') - matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip') + genome = utils.get_test_data("tax/lemonade-MAG3.sig.gz") + matches = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.zip") - c.run_sourmash('gather', genome, matches, - '--threshold-bp=5000', '-o', 'gather.csv') + c.run_sourmash("gather", genome, matches, "--threshold-bp=5000", "-o", "gather.csv") print(c.last_result.status) print(c.last_result.out) @@ -2176,29 +3750,55 @@ def test_genome_ani_lemonade_classify(runtmp): assert c.last_result.status == 0 - this_gather_file = c.output('gather.csv') + this_gather_file = c.output("gather.csv") this_gather = Path(this_gather_file).read_text().splitlines() assert len(this_gather) == 4 ## now run 'tax genome' with human output - taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') - c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file, - '--ani', '0.8', '-F', 'human') + taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") + c.run_sourmash( + "tax", + "genome", + "-g", + this_gather_file, + "-t", + taxonomy_file, + "--ani", + "0.8", + "-F", + "human", + ) output = c.last_result.out - assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output + assert ( + "MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis" + in output + ) # aaand classify to lineage_csv - c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file, - '--ani', '0.8', '-F', 'lineage_csv') + c.run_sourmash( + "tax", + "genome", + "-g", + this_gather_file, + "-t", + taxonomy_file, + "--ani", + "0.8", + "-F", + "lineage_csv", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) output = c.last_result.out - assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output - assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output + assert "ident,superkingdom,phylum,class,order,family,genus,species" in output + assert ( + "MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis" + in output + ) def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp): @@ -2207,11 +3807,18 @@ def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp): c = runtmp ## first run gather - genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz') - matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip') - - c.run_sourmash('gather', genome, matches, - '--threshold-bp=5000', '-o', 'gather.csv', '--estimate-ani') + genome = utils.get_test_data("tax/lemonade-MAG3.sig.gz") + matches = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.zip") + + c.run_sourmash( + "gather", + genome, + matches, + "--threshold-bp=5000", + "-o", + "gather.csv", + "--estimate-ani", + ) print(c.last_result.status) print(c.last_result.out) @@ -2219,36 +3826,62 @@ def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp): assert c.last_result.status == 0 - this_gather_file = c.output('gather.csv') + 
this_gather_file = c.output("gather.csv") this_gather = Path(this_gather_file).read_text().splitlines() assert len(this_gather) == 4 ## now run 'tax genome' with human output - taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') - c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file, - '--ani', '0.8', '-F', 'human') + taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") + c.run_sourmash( + "tax", + "genome", + "-g", + this_gather_file, + "-t", + taxonomy_file, + "--ani", + "0.8", + "-F", + "human", + ) output = c.last_result.out - assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output + assert ( + "MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis" + in output + ) # aaand classify to lineage_csv - c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file, - '--ani', '0.8', '-F', 'lineage_csv') + c.run_sourmash( + "tax", + "genome", + "-g", + this_gather_file, + "-t", + taxonomy_file, + "--ani", + "0.8", + "-F", + "lineage_csv", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) output = c.last_result.out - assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output - assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output + assert "ident,superkingdom,phylum,class,order,family,genus,species" in output + assert ( + "MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis" + in output + ) def test_metagenome_no_gather_csv(runtmp): # test tax metagenome with no -g - taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-t', taxonomy_file) + taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("tax", "metagenome", "-t", taxonomy_file) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2257,9 +3890,9 @@ def test_metagenome_no_gather_csv(runtmp): def test_genome_no_gather_csv(runtmp): # test tax genome with no -g - taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-t', taxonomy_file) + taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("tax", "genome", "-t", taxonomy_file) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2268,9 +3901,9 @@ def test_genome_no_gather_csv(runtmp): def test_annotate_no_gather_csv(runtmp): # test tax annotate with no -g - taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv') - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-t', taxonomy_file) + taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv") + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("tax", "annotate", "-t", taxonomy_file) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2281,89 +3914,165 @@ 
def test_genome_LIN(runtmp):
     # test basic genome with LIN taxonomy
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.93')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--ani-threshold",
+        "0.93",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.924')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--ani-threshold",
+        "0.924",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--rank', '4')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--lins", "--rank", "4"
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )


 def test_genome_LIN_lingroups(runtmp):
     # test basic genome with LIN taxonomy and lingroups
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")

     lg_file = runtmp.output("test.lg.csv")
-    with open(lg_file, 'w') as out:
-        out.write('lin,name\n')
-        out.write('0;0;0,lg1\n')
-        out.write('1;0;0,lg2\n')
-        out.write('2;0;0,lg3\n')
-        out.write('1;0;1,lg3\n')
+    with open(lg_file, "w") as out:
+        out.write("lin,name\n")
+        out.write("0;0;0,lg1\n")
+        out.write("1;0;0,lg2\n")
+        out.write("2;0;0,lg3\n")
+        out.write("1;0;1,lg3\n")
         # write a 19 so we can check the end
-        out.write('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file)
+        out.write("0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--lingroup",
+        lg_file,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file, '--ani-threshold', '0.924')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--lingroup",
+        lg_file,
+        "--ani-threshold",
+        "0.924",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )


 def test_annotate_0(runtmp):
     # test annotate basics
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+    c.run_sourmash(
+        "tax", "annotate", "--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2377,27 +4086,48 @@ def test_annotate_0(runtmp):
     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err

     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[2]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
+        in lin_gather_results[3]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[4]
+    )


 def test_annotate_gzipped_gather(runtmp):
     # test annotate with a gzipped gather CSV
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     # rewrite gather_csv as gzipped csv
-    gz_gather = runtmp.output('test1.gather.csv.gz')
-    with open(g_csv, 'rb') as f_in, gzip.open(gz_gather, 'wb') as f_out:
+    gz_gather = runtmp.output("test1.gather.csv.gz")
+    with open(g_csv, "rb") as f_in, gzip.open(gz_gather, "wb") as f_out:
         f_out.writelines(f_in)

-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', gz_gather, '--taxonomy-csv', tax, '-o', out_dir)
+    c.run_sourmash(
+        "tax",
+        "annotate",
+        "--gather-csv",
+        gz_gather,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        out_dir,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2411,22 +4141,44 @@ def test_annotate_gzipped_gather(runtmp):
     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err

     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[2]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
+        in lin_gather_results[3]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[4]
+    )


 def test_annotate_0_LIN(runtmp):
     # test annotate basics with LIN taxonomy
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir, "--lins")
+
c.run_sourmash( + "tax", + "annotate", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + out_dir, + "--lins", + ) print(c.last_result.status) print(c.last_result.out) @@ -2451,19 +4203,29 @@ def test_annotate_gather_argparse(runtmp): # this tests argparse handling w/extend. c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) - g_empty_csv = runtmp.output('g_empty.csv') + g_empty_csv = runtmp.output("g_empty.csv") with open(g_empty_csv, "w") as fp: fp.write("") print("g_csv: ", g_empty_csv) - c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, - '-g', g_empty_csv, '--taxonomy-csv', tax, '-o', out_dir, - '--force') + c.run_sourmash( + "tax", + "annotate", + "--gather-csv", + g_csv, + "-g", + g_empty_csv, + "--taxonomy-csv", + tax, + "-o", + out_dir, + "--force", + ) print(c.last_result.status) print(c.last_result.out) @@ -2477,19 +4239,24 @@ def test_annotate_gather_argparse(runtmp): assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err assert "lineage" in lin_gather_results[0] - assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1] + assert ( + "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in lin_gather_results[1] + ) def test_annotate_0_db(runtmp): # test annotate with sqlite db c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.db') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.db") csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) - c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + c.run_sourmash( + "tax", "annotate", "--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir + ) print(c.last_result.status) print(c.last_result.out) @@ -2502,105 +4269,134 @@ def test_annotate_0_db(runtmp): assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err assert "lineage" in lin_gather_results[0] - assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1] - assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2] - assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] - assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] + assert ( + "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in lin_gather_results[1] + ) + assert ( + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" + in lin_gather_results[2] + ) + assert ( + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" + in lin_gather_results[3] + ) 
+ assert ( + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" + in lin_gather_results[4] + ) def test_annotate_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") - #creates empty gather result - g_csv = runtmp.output('g.csv') + # creates empty gather result + g_csv = runtmp.output("g.csv") with open(g_csv, "w") as fp: fp.write("") print("g_csv: ", g_csv) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", g_csv, "--taxonomy-csv", tax) assert f"Cannot read from '{g_csv}'. Is file empty?" in str(exc.value) assert runtmp.last_result.status == -1 def test_annotate_prefetch_or_other_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') - - alt_csv = runtmp.output('g.csv') - for alt_col in ['match_name', 'ident', 'accession']: - #modify 'name' to other acceptable id_columns result - alt_g = [x.replace("name", alt_col) + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(alt_csv, 'w') as fp: + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") + + alt_csv = runtmp.output("g.csv") + for alt_col in ["match_name", "ident", "accession"]: + # modify 'name' to other acceptable id_columns result + alt_g = [ + x.replace("name", alt_col) + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(alt_csv, "w") as fp: fp.writelines(alt_g) - runtmp.run_sourmash('tax', 'annotate', '-g', alt_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", alt_csv, "--taxonomy-csv", tax) assert runtmp.last_result.status == 0 print(runtmp.last_result.out) print(runtmp.last_result.err) - assert f"Starting annotation on '{alt_csv}'. Using ID column: '{alt_col}'" in runtmp.last_result.err + assert ( + f"Starting annotation on '{alt_csv}'. Using ID column: '{alt_col}'" + in runtmp.last_result.err + ) assert f"Annotated 4 of 4 total rows from '{alt_csv}'" in runtmp.last_result.err def test_annotate_bad_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #creates bad gather result - bad_g = [x.replace("name", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # creates bad gather result + bad_g = [ + x.replace("name", "nope") + "\n" for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) # print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) - assert f"ERROR: Cannot find taxonomic identifier column in '{bad_g_csv}'. Tried: name, match_name, ident, accession" in str(exc.value) + assert ( + f"ERROR: Cannot find taxonomic identifier column in '{bad_g_csv}'. 
Tried: name, match_name, ident, accession" + in str(exc.value) + ) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) def test_annotate_no_tax_matches(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #mess up tax idents - bad_g = [x.replace("GCF_", "GGG_") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # mess up tax idents + bad_g = [ + x.replace("GCF_", "GGG_") + "\n" for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) # print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) assert f"ERROR: Could not annotate any rows from '{bad_g_csv}'" in str(exc.value) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax, '--force') + runtmp.run_sourmash( + "tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax, "--force" + ) assert runtmp.last_result.status == 0 assert f"Could not annotate any rows from '{bad_g_csv}'" in runtmp.last_result.err - assert f"--force is set. Attempting to continue to next file." in runtmp.last_result.err + assert ( + "--force is set. Attempting to continue to next file." in runtmp.last_result.err + ) print(runtmp.last_result.out) print(runtmp.last_result.err) def test_annotate_missed_tax_matches(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - with open(g_csv, 'r') as gather_lines, open(bad_g_csv, 'w') as fp: + with open(g_csv) as gather_lines, open(bad_g_csv, "w") as fp: for n, line in enumerate(gather_lines): if n > 2: # mess up tax idents of lines 3, 4 @@ -2608,7 +4404,7 @@ def test_annotate_missed_tax_matches(runtmp): fp.write(line) # print("bad_gather_results: \n", bad_g) - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -2618,16 +4414,15 @@ def test_annotate_missed_tax_matches(runtmp): def test_annotate_empty_tax_lineage_input(runtmp): - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash("tax", "annotate", "-g", g_csv, "--taxonomy-csv", tax_empty) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2638,15 +4433,25 @@ def test_annotate_empty_tax_lineage_input(runtmp): def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile(runtmp): - tax_empty = 
runtmp.output('t.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '-t', tax_empty, '--taxonomy-csv', tax, '--force') + runtmp.run_sourmash( + "tax", + "annotate", + "-g", + g_csv, + "-t", + tax_empty, + "--taxonomy-csv", + tax, + "--force", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2657,16 +4462,25 @@ def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile(runtmp): def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile_2(runtmp): # test with empty tax second, to check on argparse handling - tax_empty = runtmp.output('t.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, - '--taxonomy-csv', tax, '-t', tax_empty, '--force') + runtmp.run_sourmash( + "tax", + "annotate", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-t", + tax_empty, + "--force", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2677,29 +4491,31 @@ def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile_2(runtmp): def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments - tax = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2708,13 +4524,12 @@ def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): def test_tax_prepare_1_combine_csv(runtmp): # multiple CSVs to a single combined CSV - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'prepare', '-t', tax1, 
tax2, '-F', 'csv', - '-o', taxout) + runtmp.sourmash("tax", "prepare", "-t", tax1, tax2, "-F", "csv", "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2728,29 +4543,31 @@ def test_tax_prepare_1_combine_csv(runtmp): def test_tax_prepare_1_csv_to_csv_empty_ranks(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments, even when trailing ranks are empty - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2760,9 +4577,9 @@ def test_tax_prepare_1_csv_to_csv_empty_ranks(runtmp, keep_identifiers, keep_ver def test_tax_prepare_1_csv_to_csv_empty_file(runtmp, keep_identifiers, keep_versions): # CSV -> CSV with an empty input file and --force # tests argparse extend - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - tax_empty = runtmp.output('t.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + tax_empty = runtmp.output("t.csv") + taxout = runtmp.output("out.csv") with open(tax_empty, "w") as fp: fp.write("") @@ -2770,86 +4587,109 @@ def test_tax_prepare_1_csv_to_csv_empty_file(runtmp, keep_identifiers, keep_vers args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-t', tax_empty, '-o', - taxout, '-F', 'csv', *args, '--force') + runtmp.run_sourmash( + "tax", + "prepare", + "-t", + tax, + "-t", + tax_empty, + "-o", + taxout, + "-F", + "csv", + *args, + "--force", + ) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) 
-def test_tax_prepare_1_csv_to_csv_empty_ranks_2(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_1_csv_to_csv_empty_ranks_2( + runtmp, keep_identifiers, keep_versions +): # CSV -> CSV; same assignments for situations with empty internal ranks - tax = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) -def test_tax_prepare_1_csv_to_csv_empty_ranks_3(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_1_csv_to_csv_empty_ranks_3( + runtmp, keep_identifiers, keep_versions +): # CSV -> CSV; same assignments for situations with empty internal ranks - tax = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2858,65 +4698,70 @@ def test_tax_prepare_1_csv_to_csv_empty_ranks_3(runtmp, keep_identifiers, keep_v def test_tax_prepare_2_csv_to_sql(runtmp, keep_identifiers, keep_versions): # CSV -> SQL; same assignments? 
- tax = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) # cannot overwrite - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) - assert 'taxonomy table already exists' in str(exc.value) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) + assert "taxonomy table already exists" in str(exc.value) def test_tax_prepare_2_csv_to_sql_empty_ranks(runtmp, keep_identifiers, keep_versions): # CSV -> SQL with some empty ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) @@ -2924,107 +4769,113 @@ def test_tax_prepare_2_csv_to_sql_empty_ranks(runtmp, keep_identifiers, keep_ver def test_tax_prepare_3_db_to_csv(runtmp): # SQL -> CSV; same assignments - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxdb = utils.get_test_data('tax/test.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxdb = utils.get_test_data("tax/test.taxonomy.db") + taxout = 
runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_gz(runtmp): # SQL -> CSV; same assignments - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxdb = utils.get_test_data('tax/test.taxonomy.db') - taxout = runtmp.output('out.csv.gz') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxdb = utils.get_test_data("tax/test.taxonomy.db") + taxout = runtmp.output("out.csv.gz") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) - with gzip.open(taxout, 'rt') as fp: + with gzip.open(taxout, "rt") as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) -def test_tax_prepare_2_csv_to_sql_empty_ranks_2(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_2_csv_to_sql_empty_ranks_2( + runtmp, keep_identifiers, keep_versions +): # CSV -> SQL with some empty internal ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = 
tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) -def test_tax_prepare_2_csv_to_sql_empty_ranks_3(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_2_csv_to_sql_empty_ranks_3( + runtmp, keep_identifiers, keep_versions +): # CSV -> SQL with some empty internal ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) @@ -3032,83 +4883,79 @@ def test_tax_prepare_2_csv_to_sql_empty_ranks_3(runtmp, keep_identifiers, keep_v def test_tax_prepare_3_db_to_csv_empty_ranks(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_empty_ranks_2(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, 
"-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_sqlite_lineage_version(runtmp): # test bad sourmash_internals version for SqliteLineage - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.db') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.db") - runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv, - '-o', taxout, '-F', 'sql') + runtmp.run_sourmash("tax", "prepare", "-t", taxcsv, "-o", taxout, "-F", "sql") assert os.path.exists(taxout) # set bad version @@ -3120,206 +4967,208 @@ def test_tax_prepare_sqlite_lineage_version(runtmp): conn.close() with pytest.raises(IndexNotSupported): - db = tax_utils.MultiLineageDB.load([taxout]) + tax_utils.MultiLineageDB.load([taxout]) def test_tax_prepare_sqlite_no_lineage(): # no lineage table at all - sqldb = utils.get_test_data('sqlite/index.sqldb') + sqldb = utils.get_test_data("sqlite/index.sqldb") with pytest.raises(ValueError): - db = tax_utils.MultiLineageDB.load([sqldb]) + tax_utils.MultiLineageDB.load([sqldb]) def test_tax_grep_exists(runtmp): # test that 'tax grep' exists with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'grep') + runtmp.sourmash("tax", "grep") err = runtmp.last_result.err - assert 'usage:' in err + assert "usage:" in err def test_tax_grep_search_shew(runtmp): # test 'tax grep Shew' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile) + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile) out = runtmp.last_result.out err = runtmp.last_result.err - lines = [ x.strip() for x in 
out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_out(runtmp): # test 'tax grep Shew', save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_sqldb_out(runtmp): # test 'tax grep Shew' on a sqldb, save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.db') + taxfile = utils.get_test_data("tax/test.taxonomy.db") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325' - assert lines[2][0] == 'GCF_000021665' + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325" + assert lines[2][0] == "GCF_000021665" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_lowercase(runtmp): # test 'tax grep shew' (lowercase), save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err assert "searching 1 taxonomy files for 'shew'" in err - assert 'found 0 matches; saved identifiers to picklist' in err + assert "found 0 matches; saved identifiers to picklist" in err - runtmp.sourmash('tax', 'grep', '-i', 'shew', - '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "-i", "shew", "-t", taxfile, "-o", "pick.csv") err = 
runtmp.last_result.err assert "searching 1 taxonomy files for 'shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err - - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + assert "found 2 matches; saved identifiers to picklist" in err + + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 def test_tax_grep_search_shew_out_use_picklist(runtmp): # test 'tax grep Shew', output to a picklist, use picklist - taxfile = utils.get_test_data('tax/test.taxonomy.csv') - dbfile = utils.get_test_data('tax/gtdb-tax-grep.sigs.zip') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") + dbfile = utils.get_test_data("tax/gtdb-tax-grep.sigs.zip") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") - runtmp.sourmash('sig', 'cat', dbfile, '--picklist', - 'pick.csv:ident:ident', '-o', 'pick-out.zip') + runtmp.sourmash( + "sig", "cat", dbfile, "--picklist", "pick.csv:ident:ident", "-o", "pick-out.zip" + ) all_sigs = sourmash.load_file_as_index(dbfile) assert len(all_sigs) == 3 - pick_sigs = sourmash.load_file_as_index(runtmp.output('pick-out.zip')) + pick_sigs = sourmash.load_file_as_index(runtmp.output("pick-out.zip")) assert len(pick_sigs) == 2 - names = [ ss.name.split()[0] for ss in pick_sigs.signatures() ] + names = [ss.name.split()[0] for ss in pick_sigs.signatures()] assert len(names) == 2 - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names def test_tax_grep_search_shew_invert(runtmp): # test 'tax grep -v Shew' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile) + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile) out = runtmp.last_result.out err = runtmp.last_result.err - assert "-v/--invert-match specified; returning only lineages that do not match." in err + assert ( + "-v/--invert-match specified; returning only lineages that do not match." 
in err + ) - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_001881345.1' - assert lines[2][0] == 'GCF_003471795.1' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_001881345.1" + assert lines[2][0] == "GCF_003471795.1" assert len(lines) == 5 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 4 matches; saved identifiers to picklist' in err + assert "found 4 matches; saved identifiers to picklist" in err - all_names = set([ x[0] for x in lines ]) - assert 'GCF_000017325.1' not in all_names - assert 'GCF_000021665.1' not in all_names + all_names = set([x[0] for x in lines]) + assert "GCF_000017325.1" not in all_names + assert "GCF_000021665.1" not in all_names def test_tax_grep_search_shew_invert_select_phylum(runtmp): # test 'tax grep -v Shew -r phylum' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile, '-r', 'phylum') + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile, "-r", "phylum") out = runtmp.last_result.out err = runtmp.last_result.err - assert "-v/--invert-match specified; returning only lineages that do not match." in err + assert ( + "-v/--invert-match specified; returning only lineages that do not match." in err + ) assert "limiting matches to phylum" - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" assert len(lines) == 7 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 6 matches; saved identifiers to picklist' in err + assert "found 6 matches; saved identifiers to picklist" in err - all_names = set([ x[0] for x in lines ]) - assert 'GCF_000017325.1' in all_names - assert 'GCF_000021665.1' in all_names + all_names = set([x[0] for x in lines]) + assert "GCF_000017325.1" in all_names + assert "GCF_000021665.1" in all_names def test_tax_grep_search_shew_invert_select_bad_rank(runtmp): # test 'tax grep -v Shew -r badrank' - should fail - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile, - '-r', 'badrank') + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile, "-r", "badrank") - out = runtmp.last_result.out err = runtmp.last_result.err print(err) - assert 'error: argument -r/--rank: invalid choice:' in err + assert "error: argument -r/--rank: invalid choice:" in err def test_tax_grep_search_shew_count(runtmp): # test 'tax grep Shew --count' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-c') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-c") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3327,19 +5176,17 @@ def test_tax_grep_search_shew_count(runtmp): assert not out.strip() assert "searching 1 taxonomy files for 'Shew'" in err - assert not 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" not in err def test_tax_grep_multiple_csv(runtmp): # 
grep on multiple CSVs - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'grep', "Toxo|Gamma", - '-t', tax1, tax2, - '-o', taxout) + runtmp.sourmash("tax", "grep", "Toxo|Gamma", "-t", tax1, tax2, "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3350,27 +5197,37 @@ def test_tax_grep_multiple_csv(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 5 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCA_000256725' in names - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCA_000256725" in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_grep_multiple_csv_empty_force(runtmp): # grep on multiple CSVs, one empty, with --force - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - tax_empty = runtmp.output('t.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + tax_empty = runtmp.output("t.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.sourmash('tax', 'grep', "Toxo|Gamma", - '-t', tax1, tax2, '-t', tax_empty, - '-o', taxout, '--force') + runtmp.sourmash( + "tax", + "grep", + "Toxo|Gamma", + "-t", + tax1, + tax2, + "-t", + tax_empty, + "-o", + taxout, + "--force", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3381,22 +5238,20 @@ def test_tax_grep_multiple_csv_empty_force(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 5 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCA_000256725' in names - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCA_000256725" in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_grep_duplicate_csv(runtmp): # grep on duplicates => should collapse to uniques on identifiers - tax1 = utils.get_test_data('tax/test.taxonomy.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'grep', "Gamma", - '-t', tax1, tax1, - '-o', taxout) + runtmp.sourmash("tax", "grep", "Gamma", "-t", tax1, tax1, "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3407,20 +5262,19 @@ def test_tax_grep_duplicate_csv(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 4 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_summarize(runtmp): # test basic operation with summarize - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = 
utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 1 distinct taxonomic lineages" in out @@ -3434,13 +5288,12 @@ def test_tax_summarize(runtmp): def test_tax_summarize_multiple(runtmp): # test basic operation with summarize on multiple files - tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/bacteria_refseq_lineage.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - runtmp.sourmash('tax', 'summarize', tax1, tax2) + runtmp.sourmash("tax", "summarize", tax1, tax2) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 2 distinct taxonomic lineages" in out @@ -3454,12 +5307,11 @@ def test_tax_summarize_multiple(runtmp): def test_tax_summarize_empty_line(runtmp): # test basic operation with summarize on a file w/empty line - taxfile = utils.get_test_data('tax/test-empty-line.taxonomy.csv') + taxfile = utils.get_test_data("tax/test-empty-line.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 1 distinct taxonomic lineages" in out @@ -3473,21 +5325,20 @@ def test_tax_summarize_empty_line(runtmp): def test_tax_summarize_empty(runtmp): # test failure on empty file - taxfile = runtmp.output('no-exist') + taxfile = runtmp.output("no-exist") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) - out = runtmp.last_result.out err = runtmp.last_result.err assert "ERROR while loading taxonomies" in err def test_tax_summarize_csv(runtmp): # test basic operation w/csv output - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", taxfile, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3495,28 +5346,30 @@ def test_tax_summarize_csv(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 18 lineage counts to 'ranks.csv'" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 - assert c['3'] == 7 - assert c['2'] == 5 - assert c['1'] == 5 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["1"] == 5 def test_tax_summarize_on_annotate(runtmp): # test summarize on output of annotate basics - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + runtmp.run_sourmash( + "tax", "annotate", 
"--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -3527,7 +5380,7 @@ def test_tax_summarize_on_annotate(runtmp): # so far so good - now see if we can run summarize! - runtmp.run_sourmash('tax', 'summarize', csvout) + runtmp.run_sourmash("tax", "summarize", csvout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3546,9 +5399,9 @@ def test_tax_summarize_on_annotate(runtmp): def test_tax_summarize_strain_csv(runtmp): # test basic operation w/csv output on taxonomy with strains - taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + taxfile = utils.get_test_data("tax/test-strain.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", taxfile, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3556,38 +5409,38 @@ def test_tax_summarize_strain_csv(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to 'ranks.csv'" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['3'] == 7 - assert c['2'] == 5 - assert c['6'] == 1 - assert c['1'] == 11 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["6"] == 1 + assert c["1"] == 11 def test_tax_summarize_strain_csv_with_lineages(runtmp): # test basic operation w/csv output on lineages-style file w/strain csv - taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') - lineage_csv = runtmp.output('lin-with-strains.csv') + taxfile = utils.get_test_data("tax/test-strain.taxonomy.csv") + lineage_csv = runtmp.output("lin-with-strains.csv") taxdb = tax_utils.LineageDB.load(taxfile) - with open(lineage_csv, 'w', newline="") as fp: + with open(lineage_csv, "w", newline="") as fp: w = csv.writer(fp) - w.writerow(['name', 'lineage']) + w.writerow(["name", "lineage"]) for k, v in taxdb.items(): linstr = lca_utils.display_lineage(v) w.writerow([k, linstr]) - runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3595,40 +5448,40 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['3'] == 7 - assert c['2'] == 5 - assert c['6'] == 1 - assert c['1'] == 11 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["6"] == 1 + assert c["1"] == 11 def test_tax_summarize_LINS(runtmp): # test basic operation w/LINs - taxfile = utils.get_test_data('tax/test.LIN-taxonomy.csv') - lineage_csv = runtmp.output('annotated-lin.csv') + taxfile = utils.get_test_data("tax/test.LIN-taxonomy.csv") + lineage_csv = runtmp.output("annotated-lin.csv") taxdb = tax_utils.LineageDB.load(taxfile, lins=True) - with open(lineage_csv, 'w', newline="") as fp: + with open(lineage_csv, "w", newline="") as fp: w 
= csv.writer(fp) - w.writerow(['name', 'lineage']) + w.writerow(["name", "lineage"]) for k, v in taxdb.items(): lin = tax_utils.LINLineageInfo(lineage=v) linstr = lin.display_lineage(truncate_empty=False) print(linstr) w.writerow([k, linstr]) - runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv', '--lins') + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv", "--lins") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3639,137 +5492,209 @@ def test_tax_summarize_LINS(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 91 lineage counts to" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: - # count number across ranks as a cheap consistency check + # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['1'] == 77 - assert c['2'] == 1 - assert c['3'] == 11 - assert c['4'] == 2 + assert c["1"] == 77 + assert c["2"] == 1 + assert c["3"] == 11 + assert c["4"] == 2 def test_metagenome_LIN(runtmp): # test basic metagenome with LIN taxonomy c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins') + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--lins") print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) # 0th rank/position assert "test1,0,0.089,1,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out assert "test1,0,0.088,0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,0,0.028,2,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,0,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,0,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 1st rank/position assert "test1,1,0.089,1;0,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out assert "test1,1,0.088,0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,1,0.028,2;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,1,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,1,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 2nd rank/position assert "test1,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,2,0.078,1;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out assert "test1,2,0.028,2;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out assert "test1,2,0.011,1;0;1,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out - assert "test1,2,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,2,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 19th rank/position - assert 
"test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out - assert "test1,19,0.078,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out - assert "test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out - assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,19,0.078,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) def test_metagenome_LIN_lingroups(runtmp): # test lingroups output c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') - out.write('0;0;0,lg1\n') - out.write('1;0;0,lg2\n') - out.write('2;0;0,lg3\n') - out.write('1;0;1,lg3\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") + out.write("0;0;0,lg1\n") + out.write("1;0;0,lg2\n") + out.write("2;0;0,lg3\n") + out.write("1;0;1,lg3\n") # write a 19 so we can check the end - out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroup', lg_file) + out.write("1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "--lingroup", + lg_file, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err - assert "Read 5 lingroup rows and found 5 distinct lingroup prefixes." in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) + assert ( + "Read 5 lingroup rows and found 5 distinct lingroup prefixes." 
+ in c.last_result.err + ) assert "name lin percent_containment num_bp_contained" in c.last_result.out assert "lg1 0;0;0 5.82 714000" in c.last_result.out assert "lg2 1;0;0 5.05 620000" in c.last_result.out assert "lg3 2;0;0 1.56 192000" in c.last_result.out assert "lg3 1;0;1 0.65 80000" in c.last_result.out - assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" in c.last_result.out + assert ( + "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" + in c.last_result.out + ) def test_metagenome_LIN_human_summary_no_lin_position(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "human") + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--lins", "-F", "human" + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out - assert "test1 5.8% 92.5% 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out - assert "test1 5.0% 92.1% 1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out - assert "test1 1.6% 89.1% 2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out - assert "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out + assert ( + "test1 5.8% 92.5% 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" + in c.last_result.out + ) + assert ( + "test1 5.0% 92.1% 1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" + in c.last_result.out + ) + assert ( + "test1 1.6% 89.1% 2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" + in c.last_result.out + ) + assert ( + "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" + in c.last_result.out + ) def test_metagenome_LIN_human_summary_lin_position_5(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "human", '--lin-position', '5') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "-F", + "human", + "--lin-position", + "5", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) assert "sample name proportion cANI lineage" in c.last_result.out assert "----------- ---------- ---- -------" in c.last_result.out assert "test1 86.9% - unclassified" in c.last_result.out @@ -3782,155 +5707,274 @@ def 
test_metagenome_LIN_human_summary_lin_position_5(runtmp): def test_metagenome_LIN_krona_lin_position_5(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "krona", '--lin-position', '5') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "-F", + "krona", + "--lin-position", + "5", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) assert "fraction 0 1 2 3 4 5" in c.last_result.out assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out assert "0.027522935779816515 2 0 0 0 0 0" in c.last_result.out assert "0.010769844435580374 1 0 1 0 0 0" in c.last_result.out - assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out + assert ( + "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" + in c.last_result.out + ) def test_metagenome_LIN_krona_bad_rank(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "krona", '--lin-position', 'strain') + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "-F", + "krona", + "--lin-position", + "strain", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "Invalid '--rank'/'--position' input: 'strain'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." in c.last_result.err - + assert ( + "Invalid '--rank'/'--position' input: 'strain'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." 
+ in c.last_result.err + ) def test_metagenome_LIN_lingroups_empty_lg_file(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: + with open(lg_file, "w") as out: out.write("") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroup', lg_file) + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "--lingroup", + lg_file, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err - assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) + assert ( + f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err + ) def test_metagenome_LIN_lingroups_bad_cli_inputs(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: + with open(lg_file, "w") as out: out.write("") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "lingroup") + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "-F", + "lingroup", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must provide lingroup csv via '--lingroup' in order to output a lingroup report." in c.last_result.err + assert ( + "Must provide lingroup csv via '--lingroup' in order to output a lingroup report." + in c.last_result.err + ) with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "lingroup") + c.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "-F", "lingroup" + ) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err + assert ( + "Must enable LIN taxonomy via '--lins' in order to use lingroups." 
+ in c.last_result.err + ) with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroup', lg_file) + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lingroup", + lg_file, + ) print(c.last_result.err) assert c.last_result.status != 0 with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '-F', 'bioboxes') + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "-F", + "bioboxes", + ) print(c.last_result.err) assert c.last_result.status != 0 - assert "ERROR: The following outputs are incompatible with '--lins': : bioboxes, kreport" in c.last_result.err + assert ( + "ERROR: The following outputs are incompatible with '--lins': : bioboxes, kreport" + in c.last_result.err + ) def test_metagenome_mult_outputs_stdout_fail(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '-F', "kreport", 'csv_summary') + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-F", + "kreport", + "csv_summary", + ) print(c.last_result.err) assert c.last_result.status != 0 - assert f"Writing to stdout is incompatible with multiple output formats ['kreport', 'csv_summary']" in c.last_result.err + assert ( + "Writing to stdout is incompatible with multiple output formats ['kreport', 'csv_summary']" + in c.last_result.err + ) def test_genome_mult_outputs_stdout_fail(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '-F', "lineage_csv", 'csv_summary') + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-F", + "lineage_csv", + "csv_summary", + ) print(c.last_result.err) assert c.last_result.status != 0 - assert f"Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" in c.last_result.err + assert ( + "Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" + in c.last_result.err + ) def test_metagenome_LIN_lingroups_lg_only_header(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroup', lg_file) + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "--lingroup", + lg_file, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert 
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) assert f"No lingroups loaded from {lg_file}" in c.last_result.err diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 00344ec0d0..a362984532 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -11,20 +11,40 @@ import sourmash_tst_utils as utils -from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, - collect_gather_csvs, check_and_load_gather_csvs, - LineagePair, QueryInfo, GatherRow, TaxResult, QueryTaxResult, - SummarizedGatherResult, ClassificationResult, AnnotateTaxResult, - BaseLineageInfo, RankLineageInfo, LINLineageInfo, - aggregate_by_lineage_at_rank, format_for_krona, - write_krona, write_lineage_sample_frac, read_lingroups, - LineageTree, LineageDB, LineageDB_Sqlite, MultiLineageDB) +from sourmash.tax.tax_utils import ( + ascending_taxlist, + get_ident, + load_gather_results, + collect_gather_csvs, + check_and_load_gather_csvs, + LineagePair, + QueryInfo, + GatherRow, + TaxResult, + QueryTaxResult, + SummarizedGatherResult, + ClassificationResult, + AnnotateTaxResult, + BaseLineageInfo, + RankLineageInfo, + LINLineageInfo, + aggregate_by_lineage_at_rank, + format_for_krona, + write_krona, + write_lineage_sample_frac, + read_lingroups, + LineageTree, + LineageDB, + LineageDB_Sqlite, + MultiLineageDB, +) + # utility functions for testing def make_mini_taxonomy(tax_info, LIN=False): - #pass in list of tuples: (name, lineage) + # pass in list of tuples: (name, lineage) taxD = {} - for (name, lin) in tax_info: + for name, lin in tax_info: if LIN: lineage = LINLineageInfo(lineage_str=lin) else: @@ -32,15 +52,16 @@ def make_mini_taxonomy(tax_info, LIN=False): taxD[name] = lineage.filled_lineage return taxD + def make_mini_taxonomy_with_taxids(tax_info, LIN=False): taxD = {} - for (name, lin, taxids) in tax_info: + for name, lin, taxids in tax_info: if LIN: lineage = LINLineageInfo(lineage_str=lin) else: ranks = RankLineageInfo.ranks - txs = taxids.split(';') - lns = lin.split(';') + txs = taxids.split(";") + lns = lin.split(";") lineage_tups = [] for n, taxname in enumerate(lns): rk = ranks[n] @@ -51,20 +72,23 @@ def make_mini_taxonomy_with_taxids(tax_info, LIN=False): taxD[name] = lineage.filled_lineage return taxD + def make_GatherRow(gather_dict=None, exclude_cols=[]): """Load artificial gather row (dict) into GatherRow class""" # default contains just the essential cols - gatherD = {'query_name': 'q1', - 'query_md5': 'md5', - 'query_filename': 'query_fn', - 'name': 'gA', - 'f_unique_weighted': 0.2, - 'f_unique_to_query': 0.1, - 'query_bp':100, - 'unique_intersect_bp': 20, - 'remaining_bp': 1, - 'ksize': 31, - 'scaled': 1} + gatherD = { + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.1, + "query_bp": 100, + "unique_intersect_bp": 20, + "remaining_bp": 1, + "ksize": 31, + "scaled": 1, + } if gather_dict is not None: gatherD.update(gather_dict) for col in exclude_cols: @@ -73,39 +97,73 @@ def make_GatherRow(gather_dict=None, exclude_cols=[]): return gatherRaw -def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None, LIN=False): +def make_TaxResult( + gather_dict=None, + taxD=None, + 
keep_full_ident=False, + keep_ident_version=False, + skip_idents=None, + LIN=False, +): """Make TaxResult from artificial gather row (dict)""" gRow = make_GatherRow(gather_dict) - taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, - keep_identifier_versions=keep_ident_version, lins=LIN) + taxres = TaxResult( + raw=gRow, + keep_full_identifiers=keep_full_ident, + keep_identifier_versions=keep_ident_version, + lins=LIN, + ) if taxD is not None: taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents) return taxres -def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_ident=False, keep_ident_version=False, - skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None, - LIN=False): +def make_QueryTaxResults( + gather_info, + taxD=None, + single_query=False, + keep_full_ident=False, + keep_ident_version=False, + skip_idents=None, + summarize=False, + classify=False, + classify_rank=None, + c_thresh=0.1, + ani_thresh=None, + LIN=False, +): """Make QueryTaxResult(s) from artificial gather information, formatted as list of gather rows (dicts)""" gather_results = {} this_querytaxres = None for gather_infoD in gather_info: - taxres = make_TaxResult(gather_infoD, taxD=taxD, keep_full_ident=keep_full_ident, - keep_ident_version=keep_ident_version, skip_idents=skip_idents, LIN=LIN) + taxres = make_TaxResult( + gather_infoD, + taxD=taxD, + keep_full_ident=keep_full_ident, + keep_ident_version=keep_ident_version, + skip_idents=skip_idents, + LIN=LIN, + ) query_name = taxres.query_name # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info, lins=LIN)) + this_querytaxres = gather_results.get( + query_name, QueryTaxResult(taxres.query_info, lins=LIN) + ) this_querytaxres.add_taxresult(taxres) -# print('missed_ident?', taxres.missed_ident) + # print('missed_ident?', taxres.missed_ident) gather_results[query_name] = this_querytaxres if summarize: for query_name, qres in gather_results.items(): qres.build_summarized_result() if classify: for query_name, qres in gather_results.items(): - qres.build_classification_result(rank=classify_rank, containment_threshold=c_thresh, ani_threshold=ani_thresh) + qres.build_classification_result( + rank=classify_rank, + containment_threshold=c_thresh, + ani_threshold=ani_thresh, + ) # for convenience: If working with single query, just return that QueryTaxResult. 
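     # (illustrative sketch only, not part of this patch) Putting the helpers
     # above together, a typical single-query setup in these tests looks like:
     #
     #   taxD = make_mini_taxonomy([("gA", "a;b;c")])
     #   q_res = make_QueryTaxResults(
     #       [{"name": "gA"}], taxD=taxD, single_query=True, summarize=True
     #   )
     #
     # Here "gA" matches make_GatherRow's default match name, summarize=True
     # triggers build_summarized_result() as above, and single_query=True
     # takes the convenience branch below.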
if single_query: if len(gather_results.keys()) > 1: @@ -117,17 +175,43 @@ def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_i ## tests def test_ascending_taxlist_1(): - assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + assert list(ascending_taxlist()) == [ + "strain", + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] def test_ascending_taxlist_2(): - assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + assert list(ascending_taxlist(include_strain=False)) == [ + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] def test_QueryInfo_basic(): "basic functionality of QueryInfo dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - assert qInf.query_name == 'q1' + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + assert qInf.query_name == "q1" assert isinstance(qInf.query_n_hashes, int) assert isinstance(qInf.ksize, int) assert isinstance(qInf.scaled, int) @@ -137,8 +221,15 @@ def test_QueryInfo_basic(): def test_QueryInfo_no_hash_info(): "QueryInfo dataclass for older gather results without query_n_hashes or total_weighted_hashes" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',ksize=31,scaled=10) - assert qInf.query_name == 'q1' + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + ksize=31, + scaled=10, + ) + assert qInf.query_name == "q1" assert qInf.query_n_hashes == 0 assert qInf.total_weighted_hashes == 0 assert qInf.total_weighted_bp == 0 @@ -147,89 +238,213 @@ def test_QueryInfo_no_hash_info(): def test_QueryInfo_missing(): "check that required args" with pytest.raises(TypeError) as exc: - QueryInfo(query_name='q1', query_filename='f1',query_bp='100',query_n_hashes='10',ksize=31,scaled=10, total_weighted_hashes=200) + QueryInfo( + query_name="q1", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize=31, + scaled=10, + total_weighted_hashes=200, + ) print(str(exc)) assert "missing 1 required positional argument: 'query_md5'" in str(exc) def test_SummarizedGatherResult(): "basic functionality of SummarizedGatherResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(sgr) - assert sgr.rank=='phylum' + assert sgr.rank == "phylum" sumD = sgr.as_summary_dict(query_info=qInf) print(sumD) - assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3", - 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', 
- 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert sumD == { + "rank": "phylum", + "fraction": "0.2", + "lineage": "a;b", + "f_weighted_at_rank": "0.3", + "bp_match_at_rank": "30", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } hD = sgr.as_human_friendly_dict(query_info=qInf) print(hD) - assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%', - 'bp_match_at_rank': "30", 'query_ani_at_rank': '- ', 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert hD == { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "30.0%", + "bp_match_at_rank": "30", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } krD = sgr.as_kreport_dict(query_info=qInf) print(krD) - assert krD == {'ncbi_taxid': None, 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0", - 'percent_containment': '30.00', 'num_bp_contained': "600"} - lD = sgr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf) + assert krD == { + "ncbi_taxid": None, + "sci_name": "b", + "rank_code": "P", + "num_bp_assigned": "0", + "percent_containment": "30.00", + "num_bp_contained": "600", + } + lD = sgr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf) print(lD) - assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', - 'family': '', 'genus': '', 'species': '', 'strain': ''} + assert lD == { + "ident": "q1", + "superkingdom": "a", + "phylum": "b", + "class": "", + "order": "", + "family": "", + "genus": "", + "species": "", + "strain": "", + } cami = sgr.as_cami_bioboxes() print(cami) - assert cami == [None, 'phylum', None, 'a|b', '30.00'] + assert cami == [None, "phylum", None, "a|b", "30.00"] def test_SummarizedGatherResult_withtaxids(): "basic functionality of SummarizedGatherResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - lin = [LineagePair(rank='superkingdom', name='a', taxid='1'), LineagePair(rank='phylum', name='b', taxid=2)] - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage=lin), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + lin = [ + LineagePair(rank="superkingdom", name="a", taxid="1"), + LineagePair(rank="phylum", name="b", taxid=2), + ] + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage=lin), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(sgr) - assert sgr.rank=='phylum' + assert sgr.rank == "phylum" sumD = sgr.as_summary_dict(query_info=qInf) print(sumD) - assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3", - 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert sumD == { + "rank": "phylum", + "fraction": "0.2", + "lineage": "a;b", + "f_weighted_at_rank": "0.3", + "bp_match_at_rank": "30", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + 
"query_filename": "f1", + "total_weighted_hashes": "200", + } hD = sgr.as_human_friendly_dict(query_info=qInf) print(hD) - assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%', - 'bp_match_at_rank': "30", 'query_ani_at_rank': '- ', 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert hD == { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "30.0%", + "bp_match_at_rank": "30", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } krD = sgr.as_kreport_dict(query_info=qInf) print(krD) - assert krD == {'ncbi_taxid': '2', 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0", - 'percent_containment': '30.00', 'num_bp_contained': "600"} - lD = sgr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf) + assert krD == { + "ncbi_taxid": "2", + "sci_name": "b", + "rank_code": "P", + "num_bp_assigned": "0", + "percent_containment": "30.00", + "num_bp_contained": "600", + } + lD = sgr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf) print(lD) - assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', - 'family': '', 'genus': '', 'species': '', 'strain': ''} + assert lD == { + "ident": "q1", + "superkingdom": "a", + "phylum": "b", + "class": "", + "order": "", + "family": "", + "genus": "", + "species": "", + "strain": "", + } cami = sgr.as_cami_bioboxes() print(cami) - assert cami == ['2', 'phylum', '1|2', 'a|b', '30.00'] + assert cami == ["2", "phylum", "1|2", "a|b", "30.00"] def test_SummarizedGatherResult_LINs(): "SummarizedGatherResult with LINs" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=LINLineageInfo(lineage_str="0;0;1"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'name': "lg_name", "lin": "0;0;1", - 'percent_containment': '30.00', 'num_bp_contained': "600"} + assert lgD == { + "name": "lg_name", + "lin": "0;0;1", + "percent_containment": "30.00", + "num_bp_contained": "600", + } lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'name': "lg_name", "lin": "0;0;1", - 'percent_containment': '30.00', 'num_bp_contained': "600"} + assert lgD == { + "name": "lg_name", + "lin": "0;0;1", + "percent_containment": "30.00", + "num_bp_contained": "600", + } with pytest.raises(ValueError) as exc: sgr.as_kreport_dict(query_info=qInf) print(str(exc)) @@ -242,164 +457,344 @@ def test_SummarizedGatherResult_LINs(): def test_SummarizedGatherResult_set_query_ani(): "Check ANI estimation within SummarizedGatherResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, 
lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) sgr.set_query_ani(query_info=qInf) print(sgr.query_ani_at_rank) - assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) # ANI can be calculated with query_bp OR query_n_hashes. Remove each and check the results are identical - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes=0,ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes=0, + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) sgr.set_query_ani(query_info=qInf) print(sgr.query_ani_at_rank) - assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) # try without query_bp - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp=0, - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp=0, + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) sgr.set_query_ani(query_info=qInf) print(sgr.query_ani_at_rank) - assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) def test_SummarizedGatherResult_greater_than_1(): "basic functionality of SummarizedGatherResult dataclass" # fraction > 1 with pytest.raises(ValueError) as exc: - SummarizedGatherResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=1.2, bp_match_at_rank=30) + SummarizedGatherResult( + rank="phylum", + fraction=0.3, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=1.2, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" in str(exc) # f_weighted > 1 with pytest.raises(ValueError) as exc: - SummarizedGatherResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + SummarizedGatherResult( + rank="phylum", + fraction=1.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" 
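The 0.949 expectation in these ANI checks follows from the standard containment-to-ANI point estimate, ANI ≈ containment ** (1/ksize): with fraction 0.2 and ksize 31, 0.2 ** (1/31) ≈ 0.9494. A one-line sanity check:

# containment-to-ANI point estimate at k=31
assert abs(0.2 ** (1 / 31) - 0.949) < 1e-3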
in str(exc) def test_SummarizedGatherResult_0_fraction(): with pytest.raises(ValueError) as exc: - SummarizedGatherResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + SummarizedGatherResult( + rank="phylum", + fraction=-0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) err_msg = "Summarized fraction is <=0% of the query! This should not occur." assert err_msg in str(exc) - #assert cr.status == 'nomatch' - + # assert cr.status == 'nomatch' + with pytest.raises(ValueError) as exc: - SummarizedGatherResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0, bp_match_at_rank=30) + SummarizedGatherResult( + rank="phylum", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0, + bp_match_at_rank=30, + ) print(str(exc)) assert err_msg in str(exc) def test_SummarizedGatherResult_species_kreport(): "basic functionality of SummarizedGatherResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="species", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="species", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(sgr) - assert sgr.rank=='species' + assert sgr.rank == "species" krD = sgr.as_kreport_dict(query_info=qInf) print(krD) - assert krD == {'ncbi_taxid': None, 'sci_name': 'g', 'rank_code': 'S', 'num_bp_assigned': "600", - 'percent_containment': '30.00', 'num_bp_contained': "600"} + assert krD == { + "ncbi_taxid": None, + "sci_name": "g", + "rank_code": "S", + "num_bp_assigned": "600", + "percent_containment": "30.00", + "num_bp_contained": "600", + } def test_SummarizedGatherResult_summary_dict_limit_float(): "basic functionality of SummarizedGatherResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - sgr = SummarizedGatherResult(rank="phylum", fraction=0.123456, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.345678, bp_match_at_rank=30) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + sgr = SummarizedGatherResult( + rank="phylum", + fraction=0.123456, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.345678, + bp_match_at_rank=30, + ) print(sgr) - assert sgr.rank=='phylum' + assert sgr.rank == "phylum" sumD = sgr.as_summary_dict(query_info=qInf) print(sumD) - assert sumD == {'rank': 'phylum', 'fraction': "0.123456", 'lineage': 'a;b', 'f_weighted_at_rank': "0.345678", - 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} - + assert sumD == { + "rank": "phylum", + "fraction": "0.123456", + "lineage": "a;b", + "f_weighted_at_rank": "0.345678", + "bp_match_at_rank": "30", 
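`limit_float=True`, asserted just below, in effect rounds the fractional fields to three decimal places while leaving the other stringified fields untouched; the `round()` calls here only mirror that observed behavior:

# 0.123456 -> "0.123", 0.345678 -> "0.346"
assert str(round(0.123456, 3)) == "0.123"
assert str(round(0.345678, 3)) == "0.346"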
+ "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } + sumD = sgr.as_summary_dict(query_info=qInf, limit_float=True) print(sumD) - assert sumD == {'rank': 'phylum', 'fraction': "0.123", 'lineage': 'a;b', 'f_weighted_at_rank': "0.346", - 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert sumD == { + "rank": "phylum", + "fraction": "0.123", + "lineage": "a;b", + "f_weighted_at_rank": "0.346", + "bp_match_at_rank": "30", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } def test_ClassificationResult(): "basic functionality of ClassificationResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) cr.set_status(query_info=qInf, containment_threshold=0.1) - assert cr.status == 'match' + assert cr.status == "match" print(cr.query_ani_at_rank) - assert cr.query_ani_at_rank == approx(0.949, rel=1e-3) + assert cr.query_ani_at_rank == approx(0.949, rel=1e-3) cr.set_status(query_info=qInf, containment_threshold=0.35) - assert cr.status == 'below_threshold' - lD = cr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf) + assert cr.status == "below_threshold" + lD = cr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf) print(lD) - assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', - 'family': '', 'genus': '', 'species': '', 'strain': ''} + assert lD == { + "ident": "q1", + "superkingdom": "a", + "phylum": "b", + "class": "", + "order": "", + "family": "", + "genus": "", + "species": "", + "strain": "", + } def test_ClassificationResult_greater_than_1(): "basic functionality of SummarizedGatherResult dataclass" # fraction > 1 with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=1.2, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=0.3, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=1.2, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" in str(exc) # f_weighted > 1 with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=1.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" 
in str(exc) def test_ClassificationResult_0_fraction(): with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=-0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) err_msg = "Summarized fraction is <=0% of the query! This should not occur." assert err_msg in str(exc) - #assert cr.status == 'nomatch' - + # assert cr.status == 'nomatch' + with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0, + bp_match_at_rank=30, + ) print(str(exc)) assert err_msg in str(exc) def test_ClassificationResult_build_krona_result(): - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) - #cr.set_status(query_info=qInf, rank='phylum') - kr, ukr = cr.build_krona_result(rank='phylum') + QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) + # cr.set_status(query_info=qInf, rank='phylum') + kr, ukr = cr.build_krona_result(rank="phylum") print(kr) - assert kr == (0.2, 'a', 'b') + assert kr == (0.2, "a", "b") print(ukr) - assert ukr == (0.8, 'unclassified', 'unclassified') + assert ukr == (0.8, "unclassified", "unclassified") def test_ClassificationResult_build_krona_result_no_rank(): - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) cr.set_status(query_info=qInf, containment_threshold=0.1) @@ -407,7 +802,7 @@ def test_GatherRow_old_gather(): # gather does not contain query_name column gA = {"name": "gA.1 name"} with pytest.raises(TypeError) as exc: - make_GatherRow(gA, exclude_cols=['query_bp']) + make_GatherRow(gA, exclude_cols=["query_bp"]) print(str(exc)) assert "__init__() missing 1 required positional argument: 'query_bp'" in str(exc) @@ -433,7 +828,12 @@ def test_AnnotateTaxResult_get_ident_default(): def test_AnnotateTaxResult_get_ident_idcol(): - gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + gA = { + "name": "n1", + "match_name": 
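`build_krona_result()` returns a classified row plus the unclassified remainder, and the remainder uses `1 - fraction`. A compact sketch mirroring the asserts in this hunk (same `sourmash.tax.tax_utils` import assumption as above):

from sourmash.tax.tax_utils import ClassificationResult, RankLineageInfo

cr = ClassificationResult(rank="phylum", fraction=0.2,
                          lineage=RankLineageInfo(lineage_str="a;b"),
                          f_weighted_at_rank=0.3, bp_match_at_rank=30,
                          query_ani_at_rank=0.97)
kr, ukr = cr.build_krona_result(rank="phylum")
assert kr == (0.2, "a", "b")                          # fraction + lineage names
assert ukr == (0.8, "unclassified", "unclassified")   # 1 - 0.2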
"n2", + "ident": "n3", + "accession": "n4", + } # gather result with match name as GCF_001881345.1 taxres = AnnotateTaxResult(raw=gA) print(taxres.match_ident) assert taxres.match_ident == "n1" @@ -449,7 +849,12 @@ def test_AnnotateTaxResult_get_ident_idcol(): def test_AnnotateTaxResult_get_ident_idcol_fail(): - gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + gA = { + "name": "n1", + "match_name": "n2", + "ident": "n3", + "accession": "n4", + } # gather result with match name as GCF_001881345.1 with pytest.raises(ValueError) as exc: AnnotateTaxResult(raw=gA, id_col="NotACol") print(str(exc)) @@ -467,7 +872,7 @@ def test_TaxResult_get_ident_split_but_keep_version(): taxres = make_TaxResult(gA, keep_ident_version=True) print("raw ident: ", taxres.raw.name) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1" @@ -475,9 +880,9 @@ def test_TaxResult_get_ident_split_but_keep_version(): def test_AnnotateTaxResult_get_ident_split_but_keep_version(): gA = {"name": "GCF_001881345.1 secondname"} taxres = AnnotateTaxResult(gA, keep_identifier_versions=True) - print("raw ident: ", taxres.raw['name']) + print("raw ident: ", taxres.raw["name"]) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1" @@ -493,7 +898,7 @@ def test_TaxResult_get_ident_keep_full(): taxres = make_TaxResult(gA, keep_full_ident=True) print("raw ident: ", taxres.raw.name) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1 secondname" @@ -501,32 +906,32 @@ def test_TaxResult_get_ident_keep_full(): def test_AnnotateTaxResult_get_ident_keep_full(): gA = {"name": "GCF_001881345.1 secondname"} taxres = AnnotateTaxResult(gA, keep_full_identifiers=True) - print("raw ident: ", taxres.raw['name']) + print("raw ident: ", taxres.raw["name"]) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1 secondname" def test_collect_gather_csvs(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") from_file = runtmp.output("tmp-from-file.txt") - with open(from_file, 'w') as fp: + with open(from_file, "w") as fp: fp.write(f"{g_csv}\n") gather_files = collect_gather_csvs([g_csv], from_file=from_file) print("gather_files: ", gather_files) assert len(gather_files) == 1 - assert basename(gather_files[0]) == 'test1.gather.csv' + assert basename(gather_files[0]) == "test1.gather.csv" def test_check_and_load_gather_csvs_empty(runtmp): - g_res = runtmp.output('empty.gather.csv') - with open(g_res, 'w') as fp: + g_res = runtmp.output("empty.gather.csv") + with open(g_res, "w") as fp: fp.write("") csvs = [g_res] # load taxonomy csv - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = 
utils.get_test_data("tax/test.taxonomy.csv") tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1) print(tax_assign) @@ -537,24 +942,27 @@ def test_check_and_load_gather_csvs_empty(runtmp): def test_check_and_load_gather_csvs_with_empty_force(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign - g_res2 = runtmp.output('gA.gather.csv') - g_results = [x.replace("GCF_001881345.1", "gA") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(g_res2, 'w') as fp: + g_res2 = runtmp.output("gA.gather.csv") + g_results = [ + x.replace("GCF_001881345.1", "gA") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(g_results) # make empty gather results - g_res3 = runtmp.output('empty.gather.csv') - with open(g_res3, 'w') as fp: + g_res3 = runtmp.output("empty.gather.csv") + with open(g_res3, "w") as fp: fp.write("") csvs = [g_res2, g_res3] # load taxonomy csv - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) print(tax_assign) # check gather results and missing ids gather_results = check_and_load_gather_csvs(csvs, tax_assign, force=True) @@ -562,214 +970,269 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): q_res = gather_results[0] assert len(q_res.raw_taxresults) == 4 assert q_res.n_missed == 1 - assert 'gA' in q_res.missed_idents + assert "gA" in q_res.missed_idents assert q_res.n_skipped == 0 def test_check_and_load_gather_lineage_csvs_empty(runtmp): # try loading an empty annotated gather file - g_res = runtmp.output('empty.gather-tax.csv') - with open(g_res, 'w') as fp: + g_res = runtmp.output("empty.gather-tax.csv") + with open(g_res, "w") as fp: fp.write("") with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + LineageDB.load_from_gather_with_lineages(g_res) assert "cannot read taxonomy assignments" in str(exc.value) def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): # test on file with wrong headers - g_res = runtmp.output('empty.gather-tax.csv') - with open(g_res, 'w', newline="") as fp: + g_res = runtmp.output("empty.gather-tax.csv") + with open(g_res, "w", newline="") as fp: fp.write("x,y,z") with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) - assert "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value) + LineageDB.load_from_gather_with_lineages(g_res) + assert ( + "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" 
+ in str(exc.value) + ) def test_check_and_load_gather_lineage_csvs_dne(runtmp): # test loading with-lineage file that does not exist - g_res = runtmp.output('empty.gather-tax.csv') + g_res = runtmp.output("empty.gather-tax.csv") with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + LineageDB.load_from_gather_with_lineages(g_res) assert "does not exist" in str(exc.value) def test_check_and_load_gather_lineage_csvs_isdir(runtmp): # test loading a with-lineage file that is actually a directory - g_res = runtmp.output('empty.gather-tax.csv') + g_res = runtmp.output("empty.gather-tax.csv") os.mkdir(g_res) with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + LineageDB.load_from_gather_with_lineages(g_res) assert "is a directory" in str(exc.value) def test_check_and_load_gather_csvs_fail_on_missing(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign - g_res2 = runtmp.output('gA.gather.csv') - g_results = [x.replace("GCF_001881345.1", "gA") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(g_res2, 'w') as fp: + g_res2 = runtmp.output("gA.gather.csv") + g_results = [ + x.replace("GCF_001881345.1", "gA") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(g_results) csvs = [g_res2] # load taxonomy csv - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1) print(tax_assign) # check gather results and missing ids with pytest.raises(ValueError) as exc: - check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True) + check_and_load_gather_csvs( + csvs, tax_assign, fail_on_missing_taxonomy=True, force=True + ) assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc) def test_load_gather_results(): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) - gather_csv = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) + gather_csv = utils.get_test_data("tax/test1.gather.csv") gather_results, header = load_gather_results(gather_csv, tax_assignments=tax_assign) assert len(gather_results) == 1 for query_name, res in gather_results.items(): - assert query_name == 'test1' + assert query_name == "test1" assert len(res.raw_taxresults) == 4 def test_load_gather_results_gzipped(runtmp): - gather_csv = utils.get_test_data('tax/test1.gather.csv') - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) - gather_csv = utils.get_test_data('tax/test1.gather.csv') + gather_csv = utils.get_test_data("tax/test1.gather.csv") + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) + gather_csv = utils.get_test_data("tax/test1.gather.csv") # rewrite gather_csv as gzipped csv - gz_gather = runtmp.output('g.csv.gz') - with 
open(gather_csv, 'rb') as f_in, gzip.open(gz_gather, 'wb') as f_out: + gz_gather = runtmp.output("g.csv.gz") + with open(gather_csv, "rb") as f_in, gzip.open(gz_gather, "wb") as f_out: f_out.writelines(f_in) - #gather_results, header, seen_queries = load_gather_results(gz_gather) + # gather_results, header, seen_queries = load_gather_results(gz_gather) gather_results, header = load_gather_results(gz_gather, tax_assignments=tax_assign) assert len(gather_results) == 1 for query_name, res in gather_results.items(): - assert query_name == 'test1' + assert query_name == "test1" assert len(res.raw_taxresults) == 4 def test_load_gather_results_bad_header(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) - g_csv = utils.get_test_data('tax/test1.gather.csv') - - bad_g_csv = runtmp.output('g.csv') - - #creates bad gather result - bad_g = [x.replace("f_unique_to_query", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) + g_csv = utils.get_test_data("tax/test1.gather.csv") + + bad_g_csv = runtmp.output("g.csv") + + # creates bad gather result + bad_g = [ + x.replace("f_unique_to_query", "nope") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) print("bad_gather_results: \n", bad_g) with pytest.raises(ValueError) as exc: - gather_results, header = load_gather_results(bad_g_csv, tax_assignments=tax_assign) - assert f"'{bad_g_csv}' is missing columns needed for taxonomic summarization" in str(exc.value) + gather_results, header = load_gather_results( + bad_g_csv, tax_assignments=tax_assign + ) + assert ( + f"'{bad_g_csv}' is missing columns needed for taxonomic summarization" + in str(exc.value) + ) def test_load_gather_results_empty(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) - empty_csv = runtmp.output('g.csv') - - #creates empty gather result - with open(empty_csv, 'w') as fp: - fp.write('') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) + empty_csv = runtmp.output("g.csv") + + # creates empty gather result + with open(empty_csv, "w") as fp: + fp.write("") with pytest.raises(ValueError) as exc: - gather_results, header = load_gather_results(empty_csv, tax_assignments=tax_assign) - assert f"Cannot read gather results from '{empty_csv}'. Is file empty?" in str(exc.value) + gather_results, header = load_gather_results( + empty_csv, tax_assignments=tax_assign + ) + assert f"Cannot read gather results from '{empty_csv}'. Is file empty?" 
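`load_gather_results()` reads gzipped CSVs transparently, which this test shows by recompressing the fixture. A condensed sketch; the `tax/...` paths stand in for the repository's test data (normally resolved via `utils.get_test_data()`):

import gzip

from sourmash.tax.tax_utils import MultiLineageDB, load_gather_results

tax_assign = MultiLineageDB.load(["tax/test.taxonomy.csv"],
                                 keep_full_identifiers=False,
                                 keep_identifier_versions=False)
with open("tax/test1.gather.csv", "rb") as f_in, gzip.open("g.csv.gz", "wb") as f_out:
    f_out.writelines(f_in)  # byte-for-byte recompression of the CSV
results, header = load_gather_results("g.csv.gz", tax_assignments=tax_assign)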
in str( + exc.value + ) def test_load_taxonomy_csv(): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") tax_assign = MultiLineageDB.load([taxonomy_csv]) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_LIN(): - taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.LIN-taxonomy.csv") tax_assign = MultiLineageDB.load([taxonomy_csv], lins=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - #assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] + # assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] + assert len(tax_assign) == 6 # should have read 6 rows print(tax_assign.available_ranks) - assert tax_assign.available_ranks == {str(x) for x in range(0,20)} + assert tax_assign.available_ranks == {str(x) for x in range(0, 20)} def test_load_taxonomy_csv_LIN_fail(): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(ValueError) as exc: MultiLineageDB.load([taxonomy_csv], lins=True) - assert f"'lin' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." in str(exc.value) + assert ( + f"'lin' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." + in str(exc.value) + ) def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv') - mimatchLIN_csv = runtmp.output('mmLIN-taxonomy.csv') - with open(mimatchLIN_csv, 'w') as mm: - tax21=[] + taxonomy_csv = utils.get_test_data("tax/test.LIN-taxonomy.csv") + mimatchLIN_csv = runtmp.output("mmLIN-taxonomy.csv") + with open(mimatchLIN_csv, "w") as mm: + tax21 = [] tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] for n, taxline in enumerate(tax): - if n == 2: # add ;0 to a LIN - taxlist = taxline.split(',') - taxlist[1] += ';0' # add 21st position to LIN + if n == 2: # add ;0 to a LIN + taxlist = taxline.split(",") + taxlist[1] += ";0" # add 21st position to LIN tax21.append(",".join(taxlist)) else: tax21.append(taxline) mm.write("\n".join(tax21)) with pytest.raises(ValueError) as exc: MultiLineageDB.load([mimatchLIN_csv], lins=True) - assert "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." in str(exc.value) + assert ( + "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." 
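`lins=True` switches `MultiLineageDB.load()` to the `lin` column: every row must carry the same number of LIN positions (the mismatch test here appends a 21st position to one row to trigger the error), and ranks become stringified position indices. A sketch, with the test-data path as an assumption:

from sourmash.tax.tax_utils import MultiLineageDB

tax_assign = MultiLineageDB.load(["tax/test.LIN-taxonomy.csv"], lins=True)
# this fixture uses 20 LIN positions, so the ranks are "0".."19"
assert tax_assign.available_ranks == {str(x) for x in range(20)}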
+ in str(exc.value) + ) def test_load_taxonomy_csv_gzip(runtmp): # test loading a gzipped taxonomy csv file - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_gz = runtmp.output('tax.csv.gz') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_gz = runtmp.output("tax.csv.gz") - with gzip.open(tax_gz, 'wt') as outfp: - with open(taxonomy_csv, 'rt') as infp: + with gzip.open(tax_gz, "wt") as outfp: + with open(taxonomy_csv) as infp: data = infp.read() outfp.write(data) tax_assign = MultiLineageDB.load([tax_gz]) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_split_id(): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=0, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=0, keep_identifier_versions=False + ) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665'] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345", + "GCF_009494285", + "GCF_013368705", + "GCF_003471795", + "GCF_000017325", + "GCF_000021665", + ] + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_with_ncbi_id(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") upd_csv = runtmp.output("updated_taxonomy.csv") - with open(upd_csv, 'w') as new_tax: + with open(upd_csv, "w") as new_tax: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] ncbi_id = "ncbi_id after_space" fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] @@ -779,14 +1242,22 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp): tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"] + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + "ncbi_id after_space", + ] assert len(tax_assign) == 7 # should have read 7 rows def test_load_taxonomy_csv_split_id_ncbi(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") upd_csv = runtmp.output("updated_taxonomy.csv") - with open(upd_csv, 'w') as new_tax: + with open(upd_csv, "w") as new_tax: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] ncbi_id = "ncbi_id after_space" fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] @@ -794,24 +1265,34 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - 
tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=False, - keep_identifier_versions=False) + tax_assign = MultiLineageDB.load( + [upd_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"] - assert len(tax_assign) == 7 # should have read 7 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345", + "GCF_009494285", + "GCF_013368705", + "GCF_003471795", + "GCF_000017325", + "GCF_000021665", + "ncbi_id", + ] + assert len(tax_assign) == 7 # should have read 7 rows # check for non-sensical args. with pytest.raises(ValueError): - tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=1, - keep_identifier_versions=False) + tax_assign = MultiLineageDB.load( + [upd_csv], keep_full_identifiers=1, keep_identifier_versions=False + ) def test_load_taxonomy_csv_duplicate(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1] + 'FOO') # add first tax_assign again + tax.append(tax[1] + "FOO") # add first tax_assign again print(tax[-1]) dup.write("\n".join(tax)) @@ -823,73 +1304,132 @@ def test_load_taxonomy_csv_duplicate(runtmp): def test_load_taxonomy_csv_duplicate_force(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1]) # add first tax_assign again dup.write("\n".join(tax)) # now force tax_assign = MultiLineageDB.load([duplicated_csv], force=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] def test_format_for_krona_summarization(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True, single_query=True) - kres, header = format_for_krona([q_res], 'superkingdom') - assert header == ['fraction', 'superkingdom'] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + 
"unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, summarize=True, single_query=True + ) + kres, header = format_for_krona([q_res], "superkingdom") + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a'), (0.5, 'unclassified')] - kres, header = format_for_krona([q_res], 'phylum') - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.3, 'a', 'c'), (0.2, 'a', 'b'), (0.5, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a"), (0.5, "unclassified")] + kres, header = format_for_krona([q_res], "phylum") + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [ + (0.3, "a", "c"), + (0.2, "a", "b"), + (0.5, "unclassified", "unclassified"), + ] def test_format_for_krona_classification(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, classify=True, single_query=True) - kres, header = format_for_krona([q_res], 'superkingdom', classification=True) - assert header == ['fraction', 'superkingdom'] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, classify=True, single_query=True + ) + kres, header = format_for_krona([q_res], "superkingdom", classification=True) + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a')]#, (0.5, 'unclassified')] - kres, header = format_for_krona([q_res], 'phylum', classification=True) - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.3, 'a', 'c')]#, (0.7, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a")] # , (0.5, 'unclassified')] + kres, header = format_for_krona([q_res], "phylum", classification=True) + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [(0.3, "a", "c")] # , (0.7, 'unclassified', 'unclassified')] def test_format_for_krona_improper_rank(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True, single_query=True) + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, 
+ "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, summarize=True, single_query=True + ) with pytest.raises(ValueError) as exc: - format_for_krona([q_res], 'NotARank') + format_for_krona([q_res], "NotARank") print(str(exc)) assert "Rank 'NotARank' not present in summarized ranks." in str(exc) @@ -897,33 +1437,57 @@ def test_format_for_krona_improper_rank(): def test_format_for_krona_summarization_two_queries(): """test format for krona with multiple queries (normalize by n_queries)""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}, - {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50}] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + { + "query_name": "queryB", + "name": "gB", + "f_unique_weighted": 0.5, + "f_unique_to_query": 0.5, + "unique_intersect_bp": 50, + }, + ] gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True) - kres, header = format_for_krona(list(gres.values()), 'superkingdom') - assert header == ['fraction', 'superkingdom'] + kres, header = format_for_krona(list(gres.values()), "superkingdom") + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a'), (0.5, 'unclassified')] - kres, header = format_for_krona(list(gres.values()), 'phylum') - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.4, 'a', 'c'), (0.1, 'a', 'b'), (0.5, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a"), (0.5, "unclassified")] + kres, header = format_for_krona(list(gres.values()), "phylum") + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [ + (0.4, "a", "c"), + (0.1, "a", "b"), + (0.5, "unclassified", "unclassified"), + ] def test_write_krona(runtmp): """test two matches, equal f_unique_to_query""" - krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] - header = ['fraction', 'superkingdom', 'phylum', 'class'] - outk= runtmp.output("outkrona.tsv") - with open(outk, 'w') as out_fp: + krona_results = [(0.5, "a", "b", "c"), (0.5, "a", "b", "d")] + header = ["fraction", "superkingdom", "phylum", "class"] + outk = runtmp.output("outkrona.tsv") + with open(outk, "w") as out_fp: write_krona(header, krona_results, out_fp) - kr = [x.strip().split('\t') for x in Path(outk).read_text().splitlines()] + kr = [x.strip().split("\t") for x in Path(outk).read_text().splitlines()] print("krona_results_from_file: \n", kr) assert kr[0] == ["fraction", "superkingdom", "phylum", "class"] assert kr[1] == ["0.5", "a", "b", "c"] @@ -931,65 +1495,73 @@ def test_write_krona(runtmp): def test_write_lineage_sample_frac(runtmp): - outfrac = runtmp.output('outfrac.csv') - sample_names = ['sample1', 'sample2'] - sk_linD = {'a': {'sample1': '0.500' ,'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + outfrac = 
runtmp.output("outfrac.csv") + sample_names = ["sample1", "sample2"] + sk_linD = {"a": {"sample1": "0.500", "sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, sk_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] + assert frac_lines == [["lineage", "sample1", "sample2"], ["a", "0.500", "0.700"]] - phy_linD = {'a;b': {'sample1': '0.500'}, 'a;c': {'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + phy_linD = {"a;b": {"sample1": "0.500"}, "a;c": {"sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, phy_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] + assert frac_lines == [ + ["lineage", "sample1", "sample2"], + ["a;b", "0.500", "0"], + ["a;c", "0", "0.700"], + ] def test_write_lineage_sample_frac_format_lineage(runtmp): - outfrac = runtmp.output('outfrac.csv') - sample_names = ['sample1', 'sample2'] - sk_lineage='a' + outfrac = runtmp.output("outfrac.csv") + sample_names = ["sample1", "sample2"] + sk_lineage = "a" print(sk_lineage) - sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + sk_linD = {sk_lineage: {"sample1": "0.500", "sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, sk_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] + assert frac_lines == [["lineage", "sample1", "sample2"], ["a", "0.500", "0.700"]] - phy_lineage='a;b' + phy_lineage = "a;b" print(phy_lineage) - phy2_lineage = 'a;c' + phy2_lineage = "a;c" print(phy2_lineage) - phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + phy_linD = {phy_lineage: {"sample1": "0.500"}, phy2_lineage: {"sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, phy_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] + assert frac_lines == [ + ["lineage", "sample1", "sample2"], + ["a;b", "0.500", "0"], + ["a;c", "0", "0.700"], + ] def test_tax_multi_load_files(runtmp): # test loading various good and bad files - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + badcsv = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") 
db = MultiLineageDB.load([taxonomy_csv]) assert len(db) == 6 - assert 'strain' not in db.available_ranks + assert "strain" not in db.available_ranks db = MultiLineageDB.load([taxonomy_csv2]) assert len(db) == 6 - assert 'strain' in db.available_ranks - assert db['GCF_001881345.1'][0].rank == 'superkingdom' + assert "strain" in db.available_ranks + assert db["GCF_001881345.1"][0].rank == "superkingdom" # load a string rather than a list with pytest.raises(TypeError): @@ -1001,75 +1573,83 @@ def test_tax_multi_load_files(runtmp): # load a directory with pytest.raises(ValueError): - MultiLineageDB.load([runtmp.output('')]) + MultiLineageDB.load([runtmp.output("")]) # file does not exist with pytest.raises(ValueError): - MultiLineageDB.load([runtmp.output('no-such-file')]) + MultiLineageDB.load([runtmp.output("no-such-file")]) def test_tax_sql_load_new_file(runtmp): # test loading a newer-format sql file with sourmash_internals table - taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db') + taxonomy_db = utils.get_test_data("sqlite/test.taxonomy.db") db = MultiLineageDB.load([taxonomy_db]) print(list(db.keys())) assert len(db) == 6 - assert 'strain' not in db.available_ranks - assert db['GCF_001881345'][0].rank == 'superkingdom' + assert "strain" not in db.available_ranks + assert db["GCF_001881345"][0].rank == "superkingdom" def test_tax_multi_load_files_shadowed(runtmp): # test loading various good and bad files - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') - - db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db], - keep_full_identifiers=False, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + taxonomy_db = utils.get_test_data("tax/test.taxonomy.db") + + db = MultiLineageDB.load( + [taxonomy_csv, taxonomy_csv2, taxonomy_db], + keep_full_identifiers=False, + keep_identifier_versions=False, + ) assert len(db.shadowed_identifiers()) == 6 # we should have everything including strain assert set(RankLineageInfo().taxlist) == set(db.available_ranks) - db = MultiLineageDB.load([taxonomy_csv, taxonomy_db], - keep_full_identifiers=False, - keep_identifier_versions=False) + db = MultiLineageDB.load( + [taxonomy_csv, taxonomy_db], + keep_full_identifiers=False, + keep_identifier_versions=False, + ) assert len(db.shadowed_identifiers()) == 6 assert set(RankLineageInfo().taxlist[:-1]) == set(db.available_ranks) def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): # test save - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") if keep_identifiers and not keep_versions: with pytest.raises(ValueError): - db = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db = MultiLineageDB.load( + [taxonomy_csv], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) return - db = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db = MultiLineageDB.load( + [taxonomy_csv], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) - out_db = runtmp.output('out.db') - out_csv = runtmp.output('out.csv') - out2_csv = runtmp.output('out2.csv') + out_db 
= runtmp.output("out.db") + out_csv = runtmp.output("out.csv") + out2_csv = runtmp.output("out2.csv") # can't save to fp with sql - with open(out_csv, 'wt') as fp: + with open(out_csv, "w") as fp: with pytest.raises(ValueError): - db.save(fp, 'sql') + db.save(fp, "sql") # these should all work... - with open(out_csv, 'wt') as fp: - db.save(fp, 'csv') + with open(out_csv, "w") as fp: + db.save(fp, "csv") - db.save(out2_csv, 'csv') - db.save(out_db, 'sql') + db.save(out2_csv, "csv") + db.save(out_db, "sql") # ...and be equal db1 = db.load([out_db]) @@ -1078,19 +1658,20 @@ def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): def strip_strain(it): for k, v in it: - if v[-1].rank == 'strain': + if v[-1].rank == "strain": v = v[:-1] yield k, v import pprint + db_items = list(strip_strain(db.items())) db1_items = list(strip_strain(db1.items())) db2_items = list(strip_strain(db2.items())) db3_items = list(strip_strain(db3.items())) pprint.pprint(db_items) - print('XXX') + print("XXX") pprint.pprint(list(db1_items)) - print('XXX') + print("XXX") pprint.pprint(list(db2_items)) assert set(db_items) == set(db1_items) @@ -1100,18 +1681,18 @@ def strip_strain(it): def test_lineage_db_csv_load(runtmp): # test LineageDB.load - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') - badcsv2 = utils.get_test_data('tax/test-missing-ranks.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + badcsv = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + badcsv2 = utils.get_test_data("tax/test-missing-ranks.taxonomy.csv") db = LineageDB.load(taxonomy_csv) assert len(db) == 6 - assert 'strain' not in db.available_ranks + assert "strain" not in db.available_ranks db = LineageDB.load(taxonomy_csv2) assert len(db) == 6 - assert 'strain' in db.available_ranks + assert "strain" in db.available_ranks # load the wrong kind of csv with pytest.raises(ValueError): @@ -1123,32 +1704,32 @@ def test_lineage_db_csv_load(runtmp): # load a directory with pytest.raises(ValueError): - LineageDB.load(runtmp.output('')) + LineageDB.load(runtmp.output("")) # file does not exist with pytest.raises(ValueError): - LineageDB.load(runtmp.output('no-such-file')) + LineageDB.load(runtmp.output("no-such-file")) # construct a CSV with bad headers - with open(runtmp.output('xxx.csv'), 'w', newline="") as fp: - fp.write('x,y,z\n') + with open(runtmp.output("xxx.csv"), "w", newline="") as fp: + fp.write("x,y,z\n") with pytest.raises(ValueError): - LineageDB.load(runtmp.output('xxx.csv')) + LineageDB.load(runtmp.output("xxx.csv")) def test_lineage_db_sql_load(runtmp): # test LineageDB_sqlite.load - taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_db = utils.get_test_data("tax/test.taxonomy.db") + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") db = LineageDB_Sqlite.load(taxonomy_db) assert bool(db) assert len(db) == 6 db.available_ranks - assert 'strain' not in db.available_ranks - assert db['GCF_001881345'][0].rank == 'superkingdom' + assert "strain" not in db.available_ranks + assert db["GCF_001881345"][0].rank == "superkingdom" with pytest.raises(KeyError): - db['foo'] + db["foo"] # load any kind of CSV with pytest.raises(ValueError): @@ -1156,57 +1737,63 @@ def 
test_lineage_db_sql_load(runtmp): # load a directory with pytest.raises(ValueError): - LineageDB_Sqlite.load(runtmp.output('')) + LineageDB_Sqlite.load(runtmp.output("")) # file does not exist with pytest.raises(ValueError): - LineageDB_Sqlite.load(runtmp.output('no-such-file')) + LineageDB_Sqlite.load(runtmp.output("no-such-file")) def test_LineagePair(): - lin = LineagePair(rank="rank1", name='name1') + lin = LineagePair(rank="rank1", name="name1") print(lin) - assert lin.rank=="rank1" - assert lin.name =="name1" - assert lin.taxid==None + assert lin.rank == "rank1" + assert lin.name == "name1" + assert lin.taxid is None def test_LineagePair_1(): - lin = LineagePair(rank="rank1", name='name1', taxid=1) - assert lin.rank=="rank1" - assert lin.name =="name1" - assert lin.taxid==1 + lin = LineagePair(rank="rank1", name="name1", taxid=1) + assert lin.rank == "rank1" + assert lin.name == "name1" + assert lin.taxid == 1 print(lin) def test_BaseLineageInfo_init_empty(): - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['', '', ''] # this is a bit odd, but it's what preserves empty ranks... + assert taxinf.zip_lineage() == [ + "", + "", + "", + ] # this is a bit odd, but it's what preserves empty ranks... print(taxinf.filled_lineage) assert taxinf.filled_lineage == () - assert taxinf.lowest_lineage_name == None - assert taxinf.lowest_lineage_taxid == None + assert taxinf.lowest_lineage_name is None + assert taxinf.lowest_lineage_taxid is None assert taxinf.filled_ranks == () - assert taxinf.name_at_rank("A") == None - assert taxinf.lowest_rank == None + assert taxinf.name_at_rank("A") is None + assert taxinf.lowest_rank is None assert taxinf.display_lineage() == "" assert taxinf.display_lineage(null_as_unclassified=True) == "unclassified" def test_BaseLineageInfo_init_lineage_str(): x = "a;b;c" - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c'] + assert taxinf.zip_lineage() == ["a", "b", "c"] print(taxinf.filled_lineage) - assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None), - LineagePair(rank='B', name='b', taxid=None), - LineagePair(rank='C', name='c', taxid=None)) + assert taxinf.filled_lineage == ( + LineagePair(rank="A", name="a", taxid=None), + LineagePair(rank="B", name="b", taxid=None), + LineagePair(rank="C", name="c", taxid=None), + ) assert taxinf.lowest_lineage_name == "c" assert taxinf.lowest_rank == "C" assert taxinf.name_at_rank("A") == "a" @@ -1214,37 +1801,39 @@ def test_BaseLineageInfo_init_lineage_str(): def test_BaseLineageInfo_init_lineage_str_comma_sep(): x = "a,b,c" - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c'] + assert taxinf.zip_lineage() == ["a", "b", "c"] print(taxinf.filled_lineage) assert taxinf.lowest_lineage_name == "c" def test_BaseLineageInfo_init_lineage_tups(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + ranks = ["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b")) taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', '', 'b'] + assert 
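The lineage-class tests in this stretch fix the core invariants: `zip_lineage()` pads to the full rank list, `filled_lineage` keeps only named ranks, and an empty lineage can display as "unclassified". A sketch mirroring the asserts:

from sourmash.tax.tax_utils import BaseLineageInfo

info = BaseLineageInfo(lineage_str="a;b;c", ranks=["A", "B", "C"])
assert info.zip_lineage() == ["a", "b", "c"]
assert info.lowest_rank == "C"

empty = BaseLineageInfo(ranks=["A", "B", "C"])
assert empty.zip_lineage() == ["", "", ""]   # empty ranks are preserved
assert empty.display_lineage(null_as_unclassified=True) == "unclassified"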
taxinf.zip_lineage() == ["a", "", "b"] def test_BaseLineageInfo_init_lca_lineage_tups(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + ranks = ["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b")) taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', '', 'b'] + assert taxinf.zip_lineage() == ["a", "", "b"] def test_BaseLineageInfo_init_no_ranks(): x = "a;b;c" - rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} - lin_tups = (LineagePair(rank="rank2", name='name1'), LineagePair(rank="rank1", name='name1')) + lin_tups = ( + LineagePair(rank="rank2", name="name1"), + LineagePair(rank="rank1", name="name1"), + ) with pytest.raises(TypeError) as exc: BaseLineageInfo(lineage_str=x) print(exc) @@ -1256,9 +1845,8 @@ def test_BaseLineageInfo_init_no_ranks(): def test_BaseLineageInfo_init_with_wrong_ranks(): - ranks=["A", "B", "C"] - lin_tups = [LineagePair(rank="rank1", name='name1')] - linD = {"rank1": "a"} + ranks = ["A", "B", "C"] + lin_tups = [LineagePair(rank="rank1", name="name1")] with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) @@ -1266,7 +1854,7 @@ def test_BaseLineageInfo_init_with_wrong_ranks(): def test_BaseLineageInfo_init_not_lineagepair(): - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] lin_tups = (("rank1", "name1"),) with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) @@ -1276,7 +1864,16 @@ def test_BaseLineageInfo_init_not_lineagepair(): def test_RankLineageInfo_taxlist(): taxinf = RankLineageInfo() - taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + taxranks = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ) assert taxinf.taxlist == taxranks assert taxinf.ascending_taxlist == taxranks[::-1] @@ -1286,14 +1883,14 @@ def test_RankLineageInfo_init_lineage_str(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "c", "", "", "", "", ""] def test_LINLineageInfo_init_empty(): taxinf = LINLineageInfo() assert taxinf.n_lin_positions == 0 - assert taxinf.zip_lineage()== [] - assert taxinf.display_lineage()== "" + assert taxinf.zip_lineage() == [] + assert taxinf.display_lineage() == "" assert taxinf.filled_ranks == () assert taxinf.n_filled_pos == 0 @@ -1304,7 +1901,7 @@ def test_LINLineageInfo_init_n_pos(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 - assert taxinf.zip_lineage()== ['', '', '', '', ''] + assert taxinf.zip_lineage() == ["", "", "", "", ""] assert taxinf.filled_ranks == () assert taxinf.n_filled_pos == 0 @@ -1316,8 +1913,8 @@ def test_LINLineageInfo_init_n_pos_and_lineage_str(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 - assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] - assert taxinf.filled_ranks == ("0","1","2") + assert taxinf.zip_lineage() == ["0", "0", "1", "", ""] + assert taxinf.filled_ranks == ("0", "1", "2") assert taxinf.n_filled_pos == 3 @@ -1327,7 +1924,10 @@ def test_LINLineageInfo_init_n_pos_and_lineage_str_fail(): with pytest.raises(ValueError) as exc: LINLineageInfo(lineage_str=x, n_lin_positions=n_pos) print(str(exc)) - assert "Provided 
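A convention worth noting from the tests above: zip_lineage() keeps unfilled ranks as empty strings so rank positions stay aligned, while filled_lineage drops them entirely. A minimal sketch of that behavior, assuming these classes are importable from sourmash.tax.tax_utils (the import path is an assumption, not part of this patch):

    # Sketch only; the module path is assumed, not confirmed by this patch.
    from sourmash.tax.tax_utils import BaseLineageInfo, LineagePair

    ranks = ["A", "B", "C"]
    partial = BaseLineageInfo(lineage=(LineagePair(rank="A", name="a"),), ranks=ranks)
    # unfilled ranks come back as empty strings, preserving position
    assert partial.zip_lineage() == ["a", "", ""]
    # filled_lineage drops the empty slots entirely
    assert partial.filled_lineage == (LineagePair(rank="A", name="a", taxid=None),)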
def test_LINLineageInfo_init_lineage_str_only():
    x = "0;0;1"
    taxinf = LINLineageInfo(lineage_str=x)
    print(taxinf.lineage)
    print(taxinf.lineage_str)
    assert taxinf.n_lin_positions == 3
-    assert taxinf.zip_lineage()== ['0', '0', '1']
-    assert taxinf.filled_ranks == ("0","1","2")
+    assert taxinf.zip_lineage() == ["0", "0", "1"]
+    assert taxinf.filled_ranks == ("0", "1", "2")
    assert taxinf.n_filled_pos == 3

@@ -1350,12 +1950,15 @@ def test_LINLineageInfo_init_not_lineagepair():
def test_LINLineageInfo_init_lineagepair():
-    lin_tups = (LineagePair("rank1", "name1"), LineagePair("rank2", None),)
+    lin_tups = (
+        LineagePair("rank1", "name1"),
+        LineagePair("rank2", None),
+    )
    taxinf = LINLineageInfo(lineage=lin_tups)
    print(taxinf.lineage)
    assert taxinf.n_lin_positions == 2
-    assert taxinf.zip_lineage()== ["name1", ""]
-    assert taxinf.zip_lineage(truncate_empty=True)== ["name1"]
+    assert taxinf.zip_lineage() == ["name1", ""]
+    assert taxinf.zip_lineage(truncate_empty=True) == ["name1"]
    assert taxinf.filled_ranks == ("rank1",)
    assert taxinf.ranks == ("rank1", "rank2")
    assert taxinf.n_filled_pos == 1

@@ -1363,7 +1966,7 @@ def test_lca_LINLineageInfo_diff_n_pos():
    x = "0;0;1"
-    y = '0'
+    y = "0"
    lin1 = LINLineageInfo(lineage_str=x)
    lin2 = LINLineageInfo(lineage_str=y)
    assert lin1.is_compatible(lin2)

@@ -1376,30 +1979,30 @@ def test_lca_LINLineageInfo_no_lca():
    x = "0;0;1"
-    y = '12;0;1'
+    y = "12;0;1"
    lin1 = LINLineageInfo(lineage_str=x)
    lin2 = LINLineageInfo(lineage_str=y)
    assert lin1.is_compatible(lin2)
    assert lin2.is_compatible(lin1)
    lca_from_lin1 = lin1.find_lca(lin2)
    lca_from_lin2 = lin2.find_lca(lin1)
-    assert lca_from_lin1 == lca_from_lin2 == None
+    assert lca_from_lin1 == lca_from_lin2 is None


def test_lca_RankLineageInfo_no_lca():
    x = "a;b;c"
-    y = 'd;e;f;g'
+    y = "d;e;f;g"
    lin1 = RankLineageInfo(lineage_str=x)
    lin2 = RankLineageInfo(lineage_str=y)
    assert lin1.is_compatible(lin2)
    assert lin2.is_compatible(lin1)
    lca_from_lin1 = lin1.find_lca(lin2)
    lca_from_lin2 = lin2.find_lca(lin1)
-    assert lca_from_lin1 == lca_from_lin2 == None
+    assert lca_from_lin1 == lca_from_lin2 is None


def test_incompatibility_LINLineageInfo_RankLineageInfo():
-    x="a;b;c"
+    x = "a;b;c"
    lin1 = RankLineageInfo(lineage_str=x)
    lin2 = LINLineageInfo(lineage_str=x)
    assert not lin1.is_compatible(lin2)

@@ -1408,64 +2011,75 @@ def test_RankLineageInfo_init_lineage_str_with_ranks_as_list():
    x = "a;b;c"
-    taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
+    taxranks = [
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+        "species",
+    ]
    taxinf = RankLineageInfo(lineage_str=x, ranks=taxranks)
    print(taxinf.lineage)
    print(taxinf.lineage_str)
-    assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '']
+    assert taxinf.zip_lineage() == ["a", "b", "c", "", "", "", ""]


def test_RankLineageInfo_init_lineage_tups():
-    x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b'))
+    x = (
+        LineagePair(rank="superkingdom", name="a"),
+        LineagePair(rank="phylum", name="b"),
+    )
    taxinf = RankLineageInfo(lineage=x)
    print(taxinf.lineage)
    print(taxinf.lineage_str)
-    assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
+    assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""]


def test_RankLineageInfo_init_lineage_dict_fail():
-    ranks=["A", "B", "C"]
-    lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b'))
+    ranks = ["A", "B", "C"]
+    lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b"))
    with pytest.raises(ValueError) as exc:
-        taxinf = RankLineageInfo(ranks=ranks, lineage_dict=lin_tups)
+        RankLineageInfo(ranks=ranks, lineage_dict=lin_tups)
    print(str(exc))
    assert "is not dictionary" in str(exc)


def test_RankLineageInfo_init_lineage_dict():
-    x = {'rank1': 'name1', 'rank2': 'name2'}
+    x = {"rank1": "name1", "rank2": "name2"}
    taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
    print("ranks: ", taxinf.ranks)
    print("lineage: ", taxinf.lineage)
    print("zipped lineage: ", taxinf.zip_lineage())
-    assert taxinf.zip_lineage()== ['name1', 'name2']
+    assert taxinf.zip_lineage() == ["name1", "name2"]


def test_RankLineageInfo_init_lineage_dict_default_ranks():
-    x = {"superkingdom":'a',"phylum":'b'}
+    x = {"superkingdom": "a", "phylum": "b"}
    taxinf = RankLineageInfo(lineage_dict=x)
    print(taxinf.lineage)
    print(taxinf.lineage_str)
-    assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
+    assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""]


def test_RankLineageInfo_init_lineage_dict_withtaxpath():
-    x = {'rank1': 'name1', 'rank2': 'name2', 'taxpath': "1|2"}
+    x = {"rank1": "name1", "rank2": "name2", "taxpath": "1|2"}
    taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
    print("ranks: ", taxinf.ranks)
    print("lineage: ", taxinf.lineage)
    print("zipped lineage: ", taxinf.zip_lineage())
    print("zipped taxids: ", taxinf.zip_taxid())
-    assert taxinf.zip_lineage()== ['name1', 'name2']
-    assert taxinf.zip_taxid()== ['1', '2']
+    assert taxinf.zip_lineage() == ["name1", "name2"]
+    assert taxinf.zip_taxid() == ["1", "2"]
    assert taxinf.lowest_lineage_taxid == "2"
    assert taxinf.lowest_lineage_name == "name2"


def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq():
    x = "a;b;c"
-    ranks=["A", "B", "C"]
+    ranks = ["A", "B", "C"]
    rankD = {"A": "a", "B": "b", "C": "c"}
    lin1 = RankLineageInfo(lineage_str=x, ranks=ranks)
    lin2 = RankLineageInfo(lineage_dict=rankD, ranks=ranks)

@@ -1473,56 +2087,56 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank():
-    x = {'superkingdom': 'name1', 'class': 'name2'}
+    x = {"superkingdom": "name1", "class": "name2"}
    taxinf = RankLineageInfo(lineage_dict=x)
    print("ranks: ", taxinf.ranks)
    print("lineage: ", taxinf.lineage)
    print("zipped lineage: ", taxinf.zip_lineage())
-    assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', '']
-    assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2']
+    assert taxinf.zip_lineage() == ["name1", "", "name2", "", "", "", "", ""]
+    assert taxinf.zip_lineage(truncate_empty=True) == ["name1", "", "name2"]


def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath():
-    x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2'}
+    x = {"superkingdom": "name1", "class": "name2", "taxpath": "1||2"}
    taxinf = RankLineageInfo(lineage_dict=x)
    print("ranks: ", taxinf.ranks)
    print("lineage: ", taxinf.lineage)
    print("zipped lineage: ", taxinf.zip_lineage())
-    assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', '']
-    assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', '']
+    assert taxinf.zip_lineage() == ["name1", "", "name2", "", "", "", "", ""]
+    assert taxinf.zip_taxid() == ["1", "", "2", "", "", "", "", ""]
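For the lineage_dict form, the tests above establish that a "taxpath" entry supplies NCBI taxids in rank order, retrievable via zip_taxid() as strings that stay positionally aligned with the names. A small sketch under the same import-path assumption as before:

    # Sketch only; the module path is assumed, not confirmed by this patch.
    from sourmash.tax.tax_utils import RankLineageInfo

    x = {"rank1": "name1", "rank2": "name2", "taxpath": "1|2"}
    taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
    assert taxinf.zip_taxid() == ["1", "2"]  # taxids come back as strings
    assert taxinf.lowest_lineage_taxid == "2"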
"name2", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["1", "", "2", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): # If there's no name, we don't report the taxpath, because lineage is not "filled". # Is this desired behavior? - x = {'superkingdom': 'name1', 'taxpath': '1||2'} + x = {"superkingdom": "name1", "taxpath": "1||2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', '', '', '', '', '', '', ''] - assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["name1", "", "", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["1", "", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): # If there's no name, we don't report the taxpath, because lineage is not "filled". # Is this desired behavior? - x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} + x = {"superkingdom": "name1", "phylum": "name2", "class": "name3", "taxpath": "|2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) print("zipped taxids: ", taxinf.zip_taxid()) - assert taxinf.zip_lineage()== ['name1', 'name2', 'name3', '', '', '', '', ''] - assert taxinf.zip_taxid()== ['', '2', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["name1", "name2", "name3", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["", "2", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): - x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} + x = {"superkingdom": "name1", "class": "name2", "taxpath": "1||2||||||||||"} with pytest.raises(ValueError) as exc: RankLineageInfo(lineage_dict=x) print(str(exc)) - assert f"Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) + assert "Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): @@ -1540,7 +2154,7 @@ def test_RankLineageInfo_init_lineage_str_1_truncate(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b", "c"] def test_RankLineageInfo_init_lineage_str_2(): @@ -1548,7 +2162,7 @@ def test_RankLineageInfo_init_lineage_str_2(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', '', 'c' '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "c" "", "", "", "", ""] def test_RankLineageInfo_init_lineage_str_2_truncate(): @@ -1556,72 +2170,92 @@ def test_RankLineageInfo_init_lineage_str_2_truncate(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b", "", "c"] def test_RankLineageInfo_init_lineage_with_incorrect_rank(): - x = [ LineagePair('superkingdom', 'a'), LineagePair("NotARank", ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair("NotARank", ""), + LineagePair("class", "c"), + ] with pytest.raises(ValueError) as exc: RankLineageInfo(lineage=x) print(str(exc)) - assert 
f"Rank 'NotARank' not present in " in str(exc) + assert "Rank 'NotARank' not present in " in str(exc) def test_zip_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) print("ranks: ", taxinf.ranks) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage() == ['a', 'b', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""] def test_zip_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) print("ranks: ", taxinf.ranks) print("zipped lineage: ", taxinf.zip_lineage(truncate_empty=True)) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', 'b'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b"] def test_zip_lineage_3(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage() == ['a', '', 'c', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "", "c", "", "", "", "", ""] def test_zip_lineage_3_truncate(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "", "c"] def test_zip_lineage_4(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ] + x = [LineagePair("superkingdom", "a"), LineagePair("class", "c")] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "", "c"] def test_display_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) assert taxinf.display_lineage() == "a;b" def test_display_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) assert taxinf.display_lineage() == "a;;c" def test_display_taxid_1(): - x = [ LineagePair('superkingdom', 'a', 1), LineagePair('phylum', 'b', 2) ] + x = [LineagePair("superkingdom", "a", 1), LineagePair("phylum", "b", 2)] taxinf = RankLineageInfo(lineage=x) print(taxinf) assert taxinf.display_taxid() == "1;2" def test_display_taxid_2(): - x = [ LineagePair('superkingdom', 'name1', 1), LineagePair(None, ''), LineagePair ('class', 'name2',2) ] + x = [ + LineagePair("superkingdom", "name1", 1), + LineagePair(None, ""), + LineagePair("class", "name2", 2), + ] taxinf = RankLineageInfo(lineage=x) print(taxinf) assert taxinf.display_taxid() == "1;;2" @@ -1629,54 +2263,53 @@ def test_display_taxid_2(): def test_is_lineage_match_1(): # basic behavior: match at order and above, but not at family or below. 
@@ -1629,54 +2263,53 @@ def test_is_lineage_match_1():
    # basic behavior: match at order and above, but not at family or below.
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    print(lin1.lineage)
    assert lin1.is_compatible(lin2)
-    assert lin1.is_lineage_match(lin2, 'superkingdom')
-    assert lin2.is_lineage_match(lin1, 'superkingdom')
-    assert lin1.is_lineage_match(lin2, 'phylum')
-    assert lin2.is_lineage_match(lin1, 'phylum')
-    assert lin1.is_lineage_match(lin2, 'class')
-    assert lin2.is_lineage_match(lin1, 'class')
-    assert lin1.is_lineage_match(lin2, 'order')
-    assert lin2.is_lineage_match(lin1, 'order')
-
-    assert not lin1.is_lineage_match(lin2, 'family')
-    assert not lin2.is_lineage_match(lin1, 'family')
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert lin1.is_lineage_match(lin2, "superkingdom")
+    assert lin2.is_lineage_match(lin1, "superkingdom")
+    assert lin1.is_lineage_match(lin2, "phylum")
+    assert lin2.is_lineage_match(lin1, "phylum")
+    assert lin1.is_lineage_match(lin2, "class")
+    assert lin2.is_lineage_match(lin1, "class")
+    assert lin1.is_lineage_match(lin2, "order")
+    assert lin2.is_lineage_match(lin1, "order")
+
+    assert not lin1.is_lineage_match(lin2, "family")
+    assert not lin2.is_lineage_match(lin1, "family")
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")

    lca_from_lin1 = lin1.find_lca(lin2)
    print(lca_from_lin1.display_lineage())
    lca_from_lin2 = lin2.find_lca(lin1)
    assert lca_from_lin1 == lca_from_lin2
    assert lca_from_lin1.display_lineage() == "d__a;p__b;c__c;o__d"

-
def test_is_lineage_match_2():
    # match at family, and above, levels; no genus or species to match
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    assert lin1.is_compatible(lin2)
-    assert lin1.is_lineage_match(lin2, 'superkingdom')
-    assert lin2.is_lineage_match(lin1, 'superkingdom')
-    assert lin1.is_lineage_match(lin2, 'phylum')
-    assert lin2.is_lineage_match(lin1, 'phylum')
-    assert lin1.is_lineage_match(lin2, 'class')
-    assert lin2.is_lineage_match(lin1, 'class')
-    assert lin1.is_lineage_match(lin2, 'order')
-    assert lin2.is_lineage_match(lin1, 'order')
-    assert lin1.is_lineage_match(lin2, 'family')
-    assert lin2.is_lineage_match(lin1, 'family')
-
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert lin1.is_lineage_match(lin2, "superkingdom")
+    assert lin2.is_lineage_match(lin1, "superkingdom")
+    assert lin1.is_lineage_match(lin2, "phylum")
+    assert lin2.is_lineage_match(lin1, "phylum")
+    assert lin1.is_lineage_match(lin2, "class")
+    assert lin2.is_lineage_match(lin1, "class")
+    assert lin1.is_lineage_match(lin2, "order")
+    assert lin2.is_lineage_match(lin1, "order")
+    assert lin1.is_lineage_match(lin2, "family")
+    assert lin2.is_lineage_match(lin1, "family")
+
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")

    lca_from_lin1 = lin1.find_lca(lin2)
    print(lca_from_lin1.display_lineage())

@@ -1688,70 +2321,79 @@ def test_is_lineage_match_2():
def test_is_lineage_match_3():
    # one lineage is empty
    lin1 = RankLineageInfo()
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    assert lin1.is_compatible(lin2)
-    assert not lin1.is_lineage_match(lin2, 'superkingdom')
-    assert not lin2.is_lineage_match(lin1, 'superkingdom')
-    assert not lin1.is_lineage_match(lin2, 'phylum')
-    assert not lin2.is_lineage_match(lin1, 'phylum')
-    assert not lin1.is_lineage_match(lin2, 'class')
-    assert not lin2.is_lineage_match(lin1, 'class')
-    assert not lin1.is_lineage_match(lin2, 'order')
-    assert not lin2.is_lineage_match(lin1, 'order')
-    assert not lin1.is_lineage_match(lin2, 'family')
-    assert not lin2.is_lineage_match(lin1, 'family')
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert not lin1.is_lineage_match(lin2, "superkingdom")
+    assert not lin2.is_lineage_match(lin1, "superkingdom")
+    assert not lin1.is_lineage_match(lin2, "phylum")
+    assert not lin2.is_lineage_match(lin1, "phylum")
+    assert not lin1.is_lineage_match(lin2, "class")
+    assert not lin2.is_lineage_match(lin1, "class")
+    assert not lin1.is_lineage_match(lin2, "order")
+    assert not lin2.is_lineage_match(lin1, "order")
+    assert not lin1.is_lineage_match(lin2, "family")
+    assert not lin2.is_lineage_match(lin1, "family")
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")


def test_is_lineage_match_incorrect_ranks():
-    #test comparison with incompatible ranks
-    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e', ranks=taxranks[::-1])
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    # test comparison with incompatible ranks
+    taxranks = (
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+        "species",
+        "strain",
+    )
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e", ranks=taxranks[::-1])
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    print(lin1.lineage)
    assert not lin1.is_compatible(lin2)
    with pytest.raises(ValueError) as exc:
-        lin1.is_lineage_match(lin2, 'superkingdom')
+        lin1.is_lineage_match(lin2, "superkingdom")
    print(str(exc))
-    assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc)
+    assert "Cannot compare lineages from taxonomies with different ranks." in str(exc)
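Taken together, these tests show that is_lineage_match() answers "same lineage down to this rank?", while find_lca() returns the deepest shared prefix, or None when nothing matches. A sketch of both, under the same import-path assumption:

    # Sketch only; the module path is assumed, not confirmed by this patch.
    from sourmash.tax.tax_utils import RankLineageInfo

    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e")
    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    assert lin1.is_lineage_match(lin2, "order")  # shared down to order
    assert lin1.find_lca(lin2).display_lineage() == "d__a;p__b;c__c;o__d"
    # fully disjoint lineages have no LCA at all
    assert RankLineageInfo(lineage_str="a;b").find_lca(RankLineageInfo(lineage_str="x;y")) is None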
def test_is_lineage_match_improper_rank():
-    #test comparison with incompatible ranks
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    # test comparison with incompatible ranks
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    print(lin1.lineage)
    assert lin1.is_compatible(lin2)
    with pytest.raises(ValueError) as exc:
-        lin1.is_lineage_match(lin2, 'NotARank')
+        lin1.is_lineage_match(lin2, "NotARank")
    print(str(exc))
    assert "Desired Rank 'NotARank' not available for this lineage" in str(exc)


def test_pop_to_rank_1():
    # basic behavior - pop to order?
-    lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d')
-    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    print(lin1)
-    popped = lin2.pop_to_rank('order')
+    popped = lin2.pop_to_rank("order")
    print(popped)
    assert popped == lin1


def test_pop_to_rank_2():
    # what if we're already above rank?
-    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
-    print(lin2.pop_to_rank('species'))
-    assert lin2.pop_to_rank('species') == lin2
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin2.pop_to_rank("species"))
+    assert lin2.pop_to_rank("species") == lin2


def test_pop_to_rank_rank_not_avail():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    with pytest.raises(ValueError) as exc:
        lin1.pop_to_rank("NotARank")
    print(str(exc))

@@ -1759,15 +2401,17 @@ def test_lineage_at_rank_norank():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    with pytest.raises(TypeError) as exc:
        lin1.lineage_at_rank()
    print(str(exc))
-    assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(exc)
+    assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(
+        exc
+    )


def test_lineage_at_rank_rank_not_avail():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
    with pytest.raises(ValueError) as exc:
        lin1.lineage_at_rank("NotARank")
    print(str(exc))

@@ -1775,27 +2419,33 @@ def test_lineage_at_rank_1():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    print(lin1.lineage_at_rank('superkingdom'))
-
-    assert lin1.lineage_at_rank('superkingdom') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),)
-    print(lin1.lineage_at_rank('class'))
-    assert lin1.lineage_at_rank('class') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
-                                             LineagePair(rank='phylum', name='p__b', taxid=None),
-                                             LineagePair(rank='class', name='c__c', taxid=None))
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin1.lineage_at_rank("superkingdom"))
+
+    assert lin1.lineage_at_rank("superkingdom") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+    )
+    print(lin1.lineage_at_rank("class"))
+    assert lin1.lineage_at_rank("class") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+        LineagePair(rank="phylum", name="p__b", taxid=None),
+        LineagePair(rank="class", name="c__c", taxid=None),
+    )


def test_lineage_at_rank_below_rank():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    print(lin1.lineage_at_rank('superkingdom'))
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin1.lineage_at_rank("superkingdom"))
    # if rank is not provided, we only return the filled lineage, to follow original pop_to_rank behavior.
-    print(lin1.lineage_at_rank('genus'))
-    assert lin1.lineage_at_rank('genus') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
-                                             LineagePair(rank='phylum', name='p__b', taxid=None),
-                                             LineagePair(rank='class', name='c__c', taxid=None),
-                                             LineagePair(rank='order', name='o__d', taxid=None),
-                                             LineagePair(rank='family', name='f__f', taxid=None))
+    print(lin1.lineage_at_rank("genus"))
+    assert lin1.lineage_at_rank("genus") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+        LineagePair(rank="phylum", name="p__b", taxid=None),
+        LineagePair(rank="class", name="c__c", taxid=None),
+        LineagePair(rank="order", name="o__d", taxid=None),
+        LineagePair(rank="family", name="f__f", taxid=None),
+    )


def test_TaxResult_get_match_lineage_1():

@@ -1825,13 +2475,15 @@ def test_TaxResult_get_match_lineage_skip_ident():
    gA = {"name": "gA.1 name"}
    taxres = make_TaxResult(gA)
-    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=["gA"])
    print("skipped_ident?: ", taxres.skipped_ident)
    print("missed_ident?: ", taxres.missed_ident)
    assert taxres.skipped_ident == True
    assert taxres.lineageInfo == RankLineageInfo()
    assert taxres.lineageInfo.display_lineage() == ""
-    assert taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    assert (
+        taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    )

@@ -1840,14 +2492,16 @@ def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
    gA = {"name": "gA.1 name"}
    taxres = make_TaxResult(gA)
-    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'])
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=["gB"])
    print("skipped_ident?: ", taxres.skipped_ident)
    print("missed_ident?: ", taxres.missed_ident)
    assert taxres.skipped_ident == False
    assert taxres.missed_ident == True
    assert taxres.lineageInfo == RankLineageInfo()
    assert taxres.lineageInfo.display_lineage() == ""
-    assert taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    assert (
+        taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    )

@@ -1857,7 +2511,9 @@ def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
    gA = {"name": "gA.1 name"}
    taxres = make_TaxResult(gA)
    with pytest.raises(ValueError) as exc:
-        taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'], fail_on_missing_taxonomy=True)
+        taxres.get_match_lineage(
+            tax_assignments=taxD, skip_idents=["gB"], fail_on_missing_taxonomy=True
+        )
    print(str(exc))
    assert "Error: ident 'gA' is not in the taxonomy database." in str(exc)
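The TaxResult tests above pin down the lookup contract: a skipped ident and a missing ident both leave lineageInfo empty, and only fail_on_missing_taxonomy=True turns a miss into a ValueError. A rough sketch reusing the helpers local to this test module (make_mini_taxonomy, make_TaxResult):

    # Sketch only; make_mini_taxonomy/make_TaxResult are test-module helpers.
    taxD = make_mini_taxonomy([("gA", "a;b;c")])
    taxres = make_TaxResult({"name": "gA.1 name"})
    # a skipped ident attaches no lineage, but raises no error either
    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=["gA"])
    assert taxres.skipped_ident
    assert taxres.lineageInfo == RankLineageInfo()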
@@ -1881,7 +2537,16 @@ def test_QueryTaxResult():
    assert q_res.skipped_idents == set()
    assert q_res.missed_idents == set()
    assert q_res.summarized_lineage_results == {}
-    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    taxranks = (
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+        "species",
+        "strain",
+    )
    assert q_res.ranks == taxranks
    assert q_res.ascending_ranks == taxranks[::-1]

@@ -1891,7 +2556,7 @@ def test_QueryTaxResult_add_incompatible():
    tax_info = [("gA", "a;b;c")]
    taxD = make_mini_taxonomy(tax_info=tax_info)
    taxres = make_TaxResult(taxD=taxD)
-    taxres2 = make_TaxResult({'query_name': 'q2'}, taxD=taxD)
+    taxres2 = make_TaxResult({"query_name": "q2"}, taxD=taxD)
    # initialize
    q_res = QueryTaxResult(taxres.query_info)
    # check that new querytaxres is compatible with taxres and not taxres2

@@ -1906,22 +2571,25 @@ def test_QueryTaxResult_add_without_tax_info():
    "initialize and add a taxresult with missed ident"
-    taxres = make_TaxResult() # do not add taxonomic info
+    taxres = make_TaxResult()  # do not add taxonomic info
    # initialize
    q_res = QueryTaxResult(taxres.query_info)
    print("attempted to add lineage info?: ", taxres.match_lineage_attempted)
    with pytest.raises(ValueError) as exc:
        q_res.add_taxresult(taxres)
    print(str(exc))
-    assert "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." in str(exc)
-
-
+    assert (
+        "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first."
+        in str(exc)
+    )
+
+
def test_QueryTaxResult_add_skipped_ident():
    "initialize and add a taxresult with skipped ident"
    gA_tax = ("gA", "a;b;c")
    taxD = make_mini_taxonomy([gA_tax])
-    taxres = make_TaxResult(taxD=taxD, skip_idents = ['gA'])
-# taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    taxres = make_TaxResult(taxD=taxD, skip_idents=["gA"])
+    # taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
    # initialize
    q_res = QueryTaxResult(taxres.query_info)
    q_res.add_taxresult(taxres)

@@ -1953,16 +2621,16 @@ def test_QueryTaxResult_track_missed_and_skipped():
    taxD = make_mini_taxonomy(tax_info=tax_info)
    # make results
    taxres = make_TaxResult()
-    taxres2 = make_TaxResult({"name": 'gB'}) # skipped
-    taxres3 = make_TaxResult({"name": 'gB'}) # skipped
-    taxres4 = make_TaxResult({"name": 'gC'}) # skipped
-    taxres5 = make_TaxResult({"name": 'gD'}) # missed
-    taxres6 = make_TaxResult({"name": 'gE'}) # missed
+    taxres2 = make_TaxResult({"name": "gB"})  # skipped
+    taxres3 = make_TaxResult({"name": "gB"})  # skipped
+    taxres4 = make_TaxResult({"name": "gC"})  # skipped
+    taxres5 = make_TaxResult({"name": "gD"})  # missed
+    taxres6 = make_TaxResult({"name": "gE"})  # missed
    # initialize
    q_res = QueryTaxResult(taxres.query_info)
    # add taxonomic info to taxres, add to q_res
    for n, tr in enumerate([taxres, taxres2, taxres3, taxres4, taxres5, taxres6]):
-        tr.get_match_lineage(tax_assignments=taxD, skip_idents=['gB', 'gC'])
+        tr.get_match_lineage(tax_assignments=taxD, skip_idents=["gB", "gC"])
        print("num: ", n)
        print("skipped?: ", tr.skipped_ident)
        print("missed?: ", tr.missed_ident)

@@ -1972,18 +2640,27 @@
    print(q_res.n_missed)
    assert q_res.n_missed == 2
    assert q_res.n_skipped == 3
-    assert 'gB' in q_res.skipped_idents
+    assert "gB" in q_res.skipped_idents
    assert len(q_res.skipped_idents) == 2
-    assert 'gD' in q_res.missed_idents
+    assert "gD" in q_res.missed_idents
    assert q_res.summarized_lineage_results == {}


def test_QueryTaxResult_track_missed_and_skipped_using_fn():
    "make sure missed and skipped idents are being tracked. Same as above but use helper fn."
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}, {"name": 'gB'}, {"name": 'gC'}, {"name": 'gD'}, {"name": 'gE'}]
-    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB', 'gC'])
-    # should have 6 results for default query 'q1'
+    gather_results = [
+        {},
+        {"name": "gB"},
+        {"name": "gB"},
+        {"name": "gC"},
+        {"name": "gD"},
+        {"name": "gE"},
+    ]
+    gres = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, skip_idents=["gB", "gC"]
+    )
+    # should have 6 results for default query 'q1'
    print(gres.keys())
    q_res = next(iter(gres.values()))
    assert len(q_res.raw_taxresults) == 6

@@ -1991,237 +2668,411 @@
    print(q_res.n_missed)
    assert q_res.n_missed == 2
    assert q_res.n_skipped == 3
-    assert 'gB' in q_res.skipped_idents
+    assert "gB" in q_res.skipped_idents
    assert len(q_res.skipped_idents) == 2
-    assert 'gD' in q_res.missed_idents
+    assert "gD" in q_res.missed_idents
    assert q_res.summarized_lineage_results == {}
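These QueryTaxResult tests encode a two-step protocol: attach a lineage to each TaxResult first, then add it to the query-level accumulator. A rough sketch, again leaning on the module-local helpers (the default ident "gA" is an assumption inferred from how the helpers are used above):

    # Sketch only; helper defaults are assumed from the surrounding tests.
    taxD = make_mini_taxonomy([("gA", "a;b;c")])
    taxres = make_TaxResult()  # assumed to default to ident 'gA'
    taxres.get_match_lineage(tax_assignments=taxD)
    q_res = QueryTaxResult(taxres.query_info)
    q_res.add_taxresult(taxres)  # would raise ValueError without the lineage step
    assert len(q_res.raw_taxresults) == 1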
q_res.sum_uniq_to_query["phylum"] == { + RankLineageInfo(lineage_str="a;b"): approx(0.2) + } + assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 40} + assert q_res.sum_uniq_weighted["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.2), + RankLineageInfo(lineage_str="a;b;d"): approx(0.2), + } + assert q_res.sum_uniq_to_query["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.1), + RankLineageInfo(lineage_str="a;b;d"): approx(0.1), + } + assert q_res.sum_uniq_bp["class"] == { + RankLineageInfo(lineage_str="a;b;c"): 20, + RankLineageInfo(lineage_str="a;b;d"): 20, + } def test_QueryTaxResult_summarize_up_ranks_2(): "summarize up ranks: different values" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [ + {}, + { + "name": "gB", + "f_unique_weighted": 0.1, + "f_unique_to_query": 0.05, + "unique_intersect_bp": 10, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) # now summarize up the ranks q_res.summarize_up_ranks() assert len(q_res.raw_taxresults) == 2 print(q_res.sum_uniq_weighted.values()) - print(q_res.sum_uniq_weighted['superkingdom']) - assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)} - assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.15)} - assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 30} - assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)} - assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)} - assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30} - assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2), - RankLineageInfo(lineage_str="a;b;d"): approx(0.1)} - assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1), - RankLineageInfo(lineage_str="a;b;d"): approx(0.05)} - assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20, - RankLineageInfo(lineage_str="a;b;d"): 10} + print(q_res.sum_uniq_weighted["superkingdom"]) + assert q_res.sum_uniq_weighted["superkingdom"] == { + RankLineageInfo(lineage_str="a"): approx(0.3) + } + assert q_res.sum_uniq_to_query["superkingdom"] == { + RankLineageInfo(lineage_str="a"): approx(0.15) + } + assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 30} + assert q_res.sum_uniq_weighted["phylum"] == { + RankLineageInfo(lineage_str="a;b"): approx(0.3) + } + assert q_res.sum_uniq_to_query["phylum"] == { + RankLineageInfo(lineage_str="a;b"): approx(0.15) + } + assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 30} + assert q_res.sum_uniq_weighted["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.2), + RankLineageInfo(lineage_str="a;b;d"): approx(0.1), + } + assert q_res.sum_uniq_to_query["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.1), + RankLineageInfo(lineage_str="a;b;d"): approx(0.05), + } + assert q_res.sum_uniq_bp["class"] == { + RankLineageInfo(lineage_str="a;b;c"): 20, + RankLineageInfo(lineage_str="a;b;d"): 10, + } def test_QueryTaxResult_summarize_up_ranks_missing_lineage(): "basic functionality: 
summarize up ranks" taxD = make_mini_taxonomy([("gA", "a;b;c")]) - gather_results = [{}, {"name": 'gB'}] + gather_results = [{}, {"name": "gB"}] gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD) assert len(gres.keys()) == 1 q_res = next(iter(gres.values())) # now summarize up the ranks q_res.summarize_up_ranks() assert len(q_res.raw_taxresults) == 2 - #print(q_res.sum_uniq_weighted.values()) - print(q_res.sum_uniq_weighted['superkingdom']) - assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)} - assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)} - assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20} - assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)} - assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)} - assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20} - assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)} - assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)} - assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20} + # print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_weighted["superkingdom"]) + assert q_res.sum_uniq_weighted["superkingdom"] == { + RankLineageInfo(lineage_str="a"): approx(0.2) + } + assert q_res.sum_uniq_to_query["superkingdom"] == { + RankLineageInfo(lineage_str="a"): approx(0.1) + } + assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 20} + assert q_res.sum_uniq_weighted["phylum"] == { + RankLineageInfo(lineage_str="a;b"): approx(0.2) + } + assert q_res.sum_uniq_to_query["phylum"] == { + RankLineageInfo(lineage_str="a;b"): approx(0.1) + } + assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 20} + assert q_res.sum_uniq_weighted["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.2) + } + assert q_res.sum_uniq_to_query["class"] == { + RankLineageInfo(lineage_str="a;b;c"): approx(0.1) + } + assert q_res.sum_uniq_bp["class"] == {RankLineageInfo(lineage_str="a;b;c"): 20} def test_QueryTaxResult_summarize_up_ranks_skipped_lineage(): "basic functionality: summarize up ranks" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB']) + gather_results = [{}, {"name": "gB"}] + gres = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, skip_idents=["gB"] + ) assert len(gres.keys()) == 1 q_res = next(iter(gres.values())) # now summarize up the ranks q_res.summarize_up_ranks() assert len(q_res.raw_taxresults) == 2 - assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom'] - #print(q_res.sum_uniq_weighted.values()) - print(q_res.sum_uniq_weighted['superkingdom']) - assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)} - assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)} - assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20} - assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)} - assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)} - assert q_res.sum_uniq_bp['phylum'] == 
def test_QueryTaxResult_summarize_up_ranks_skipped_lineage():
    "basic functionality: summarize up ranks"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB'])
+    gather_results = [{}, {"name": "gB"}]
+    gres = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, skip_idents=["gB"]
+    )
    assert len(gres.keys()) == 1
    q_res = next(iter(gres.values()))
    # now summarize up the ranks
    q_res.summarize_up_ranks()
    assert len(q_res.raw_taxresults) == 2
-    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
-    #print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['superkingdom'])
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)}
-    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20}
+    assert list(q_res.sum_uniq_weighted.keys()) == ["class", "phylum", "superkingdom"]
+    # print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted["superkingdom"])
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 20}
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 20}
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["class"] == {RankLineageInfo(lineage_str="a;b;c"): 20}


def test_QueryTaxResult_summarize_up_ranks_perfect_match():
    "summarize up ranks: different values"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{'f_unique_to_query': 1.0}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{"f_unique_to_query": 1.0}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
    q_res.summarize_up_ranks()
    assert len(q_res.raw_taxresults) == 1
    print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_to_query['superkingdom'])
-    assert list(q_res.sum_uniq_to_query['superkingdom'].values()) == [1.0]
-    assert 'gA' in q_res.perfect_match
+    print(q_res.sum_uniq_to_query["superkingdom"])
+    assert list(q_res.sum_uniq_to_query["superkingdom"].values()) == [1.0]
+    assert "gA" in q_res.perfect_match


def test_QueryTaxResult_summarize_up_ranks_already_summarized():
    "summarize up ranks: error, already summarized"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{'f_unique_to_query': 1.0}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{"f_unique_to_query": 1.0}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
    q_res.summarize_up_ranks()
    with pytest.raises(ValueError) as exc:
        q_res.summarize_up_ranks()
    print(str(exc))
    assert "Error: already summarized" in str(exc)
-
+

def test_QueryTaxResult_summarize_up_ranks_already_summarized_force():
    "summarize up ranks: already summarized but force"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
    q_res.summarize_up_ranks()
    q_res.summarize_up_ranks(force_resummarize=True)
-    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
+    assert list(q_res.sum_uniq_weighted.keys()) == ["class", "phylum", "superkingdom"]

-    #check that all results are still good
+    # check that all results are still good
    assert len(q_res.raw_taxresults) == 2
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.05)}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20,
-                                          RankLineageInfo(lineage_str="a;b;d"): 10}
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.3)
+    }
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.3)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.15)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 30}
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.05),
+    }
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.1),
+    }
+    assert q_res.sum_uniq_bp["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): 20,
+        RankLineageInfo(lineage_str="a;b;d"): 10,
+    }
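summarize_up_ranks() is deliberately one-shot, per the tests above: rerunning it requires force_resummarize=True, and single_rank restricts the summary to one level. A short sketch with the module-local helpers:

    # Sketch only; helper functions are local to this test module.
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
    q_res = make_QueryTaxResults(
        gather_info=[{}, {"name": "gB"}], taxD=taxD, single_query=True
    )
    q_res.summarize_up_ranks(single_rank="phylum")
    assert q_res.summarized_ranks == ["phylum"]  # only one rank summarized
    q_res.summarize_up_ranks(force_resummarize=True)  # rerunning needs force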
def test_QueryTaxResult_summarize_up_ranks_single_rank():
    "summarize up ranks: different values"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
-    q_res.summarize_up_ranks(single_rank='phylum')
+    q_res.summarize_up_ranks(single_rank="phylum")
    assert len(q_res.raw_taxresults) == 2
-    assert list(q_res.sum_uniq_weighted.keys()) == ['phylum']
+    assert list(q_res.sum_uniq_weighted.keys()) == ["phylum"]
    print(q_res.sum_uniq_weighted.keys())
    print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['phylum'])
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
-    assert list(q_res.sum_uniq_to_query['phylum'].values()) == [approx(0.15)]
-    assert list(q_res.sum_uniq_bp['phylum'].values()) == [30]
-    assert q_res.summarized_ranks == ['phylum']
+    print(q_res.sum_uniq_weighted["phylum"])
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.3)
+    }
+    assert list(q_res.sum_uniq_to_query["phylum"].values()) == [approx(0.15)]
+    assert list(q_res.sum_uniq_bp["phylum"].values()) == [30]
+    assert q_res.summarized_ranks == ["phylum"]
+

def test_QueryTaxResult_summarize_up_ranks_single_rank_not_available():
    "summarize up ranks: different values"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
    with pytest.raises(ValueError) as exc:
-        q_res.summarize_up_ranks(single_rank='NotARank')
+        q_res.summarize_up_ranks(single_rank="NotARank")
    print(str(exc))
-    assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc)
+    assert (
+        "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)"
+        in str(exc)
+    )


def test_QueryTaxResult_summarize_up_ranks_single_rank_not_filled():
    "summarize up ranks: different values"
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
    # now summarize up the ranks
    with pytest.raises(ValueError) as exc:
-        q_res.summarize_up_ranks(single_rank='species')
+        q_res.summarize_up_ranks(single_rank="species")
    print(str(exc))
-    assert "Error: rank 'species' was not available for any matching lineages." in str(exc)
+    assert "Error: rank 'species' was not available for any matching lineages." in str(
+        exc
+    )
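build_summarized_result(), exercised next, turns the per-rank sums into ordered SummarizedGatherResult records, closing each rank with an unclassified remainder (empty lineage, no ANI estimate). A sketch with the module-local helpers:

    # Sketch only; helper functions are local to this test module.
    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
    q_res = make_QueryTaxResults(
        gather_info=[{}, {"name": "gB"}], taxD=taxD, single_query=True
    )
    q_res.build_summarized_result()
    sk = q_res.summarized_lineage_results["superkingdom"]
    assert sk[-1].lineage == RankLineageInfo()  # unclassified remainder comes last
    assert sk[-1].query_ani_at_rank is None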
rel=1e-2), + ), + SummarizedGatherResult( + rank="class", + fraction=0.1, + f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a;b;d"), + bp_match_at_rank=20, + query_ani_at_rank=approx(0.93, rel=1e-2), + ), + SummarizedGatherResult( + rank="class", + fraction=0.8, + f_weighted_at_rank=0.6, + lineage=RankLineageInfo(), + bp_match_at_rank=60, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["class"] == cl + + assert q_res.total_f_weighted["phylum"] == approx(0.4) + assert q_res.total_f_classified["class"] == approx(0.2) + assert q_res.total_bp_classified["superkingdom"] == 40 def test_QueryTaxResult_build_summarized_result_2(): @@ -2231,19 +3082,39 @@ def test_QueryTaxResult_build_summarized_result_2(): gB_tax = ("gB", "a;c") taxD = make_mini_taxonomy([gA_tax, gB_tax]) # make gather results - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}, - {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.5, + "f_unique_to_query": 0.5, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.4, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + { + "query_name": "queryB", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + ] gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD) - + for query_name, q_res in gres.items(): - q_res.build_summarized_result() # summarize and build result - sk = q_res.summarized_lineage_results['superkingdom'] - phy = q_res.summarized_lineage_results['phylum'] + q_res.build_summarized_result() # summarize and build result + sk = q_res.summarized_lineage_results["superkingdom"] + phy = q_res.summarized_lineage_results["phylum"] assert len(sk) == 2 assert sk[0].lineage == RankLineageInfo(lineage_str="a") print(phy) - if query_name == 'queryA': + if query_name == "queryA": # check superkingdom results assert sk[0].fraction == approx(0.8) assert sk[0].f_weighted_at_rank == approx(0.9) @@ -2257,16 +3128,16 @@ def test_QueryTaxResult_build_summarized_result_2(): assert phy[0].fraction == approx(0.5) assert phy[0].f_weighted_at_rank == approx(0.5) assert phy[0].bp_match_at_rank == 50 - assert phy[0].lineage == RankLineageInfo(lineage_str="a;b") + assert phy[0].lineage == RankLineageInfo(lineage_str="a;b") assert phy[1].fraction == approx(0.3) assert phy[1].f_weighted_at_rank == approx(0.4) assert phy[1].bp_match_at_rank == 30 - assert phy[1].lineage == RankLineageInfo(lineage_str="a;c") + assert phy[1].lineage == RankLineageInfo(lineage_str="a;c") assert phy[2].fraction == approx(0.2) assert phy[2].f_weighted_at_rank == approx(0.1) assert phy[2].bp_match_at_rank == 20 assert phy[2].lineage == RankLineageInfo() - if query_name == 'queryB': + if query_name == "queryB": # check superkingdom results assert sk[0].fraction == approx(0.3) assert sk[0].f_weighted_at_rank == approx(0.3) @@ -2280,7 +3151,7 @@ def test_QueryTaxResult_build_summarized_result_2(): assert phy[0].fraction == approx(0.3) assert phy[0].f_weighted_at_rank == approx(0.3) assert phy[0].bp_match_at_rank == 30 - assert phy[0].lineage == RankLineageInfo(lineage_str="a;c") + assert phy[0].lineage == 
RankLineageInfo(lineage_str="a;c") assert phy[1].fraction == approx(0.7) assert phy[1].f_weighted_at_rank == approx(0.7) assert phy[1].bp_match_at_rank == 70 @@ -2290,91 +3161,183 @@ def test_QueryTaxResult_build_summarized_result_2(): def test_QueryTaxResult_build_summarized_result_missing_lineage(): "build summarized_result with missing lineage" taxD = make_mini_taxonomy([("gA", "a;b;c")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) q_res.build_summarized_result() print(q_res.summarized_lineage_results.keys()) - print(q_res.summarized_lineage_results['superkingdom']) - - sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2, - lineage=RankLineageInfo(lineage_str="a"), - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8, - bp_match_at_rank=80, query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['superkingdom'] == sk - print(q_res.summarized_lineage_results['phylum']) - phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, f_weighted_at_rank=0.2, - lineage=RankLineageInfo(lineage_str="a;b"), - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8, - bp_match_at_rank=80, query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['phylum'] == phy - print(q_res.summarized_lineage_results['class']) - cl = [SummarizedGatherResult(rank='class', fraction=0.1, lineage= RankLineageInfo(lineage_str="a;b;c"), - f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='class', fraction=0.9, lineage=RankLineageInfo(), f_weighted_at_rank=0.8, - bp_match_at_rank=80, query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['class'] == cl - - assert q_res.total_f_weighted['phylum'] == approx(0.2) - assert q_res.total_f_classified['class'] == approx(0.1) - assert q_res.total_bp_classified['superkingdom'] == 20 + print(q_res.summarized_lineage_results["superkingdom"]) + + sk = [ + SummarizedGatherResult( + rank="superkingdom", + fraction=0.1, + f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a"), + bp_match_at_rank=20, + query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="superkingdom", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["superkingdom"] == sk + print(q_res.summarized_lineage_results["phylum"]) + phy = [ + SummarizedGatherResult( + rank="phylum", + fraction=0.1, + f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + bp_match_at_rank=20, + query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="phylum", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["phylum"] == phy + print(q_res.summarized_lineage_results["class"]) + cl = [ + SummarizedGatherResult( + rank="class", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), + f_weighted_at_rank=0.2, + bp_match_at_rank=20, + 
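# --- editor's aside: illustrative sketch, not part of the patch ---
# build_summarized_result(), exercised by the two tests above, rolls each
# genome match's fraction up the lineage: at every rank, matches sharing a
# lineage prefix are summed, and the remainder of the query is reported as
# unclassified. A self-contained toy version (summarize_at_rank is a
# hypothetical helper, not sourmash code):

from collections import defaultdict

def summarize_at_rank(matches, depth):
    """Sum f_unique_to_query over lineage prefixes truncated to depth."""
    summed = defaultdict(float)
    for lineage, frac in matches:
        prefix = ";".join(lineage.split(";")[:depth])
        summed[prefix] += frac
    summed["unclassified"] = 1.0 - sum(summed.values())
    return dict(summed)

# Mirroring test_QueryTaxResult_build_summarized_result_1, where gA (a;b;c)
# and gB (a;b;d) each cover 0.1 of the query:
matches = [("a;b;c", 0.1), ("a;b;d", 0.1)]
print(summarize_at_rank(matches, 1))  # {'a': 0.2, 'unclassified': 0.8}
print(summarize_at_rank(matches, 3))  # {'a;b;c': 0.1, 'a;b;d': 0.1,
                                      #  'unclassified': 0.8}
# --- end aside ---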
query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="class", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["class"] == cl + + assert q_res.total_f_weighted["phylum"] == approx(0.2) + assert q_res.total_f_classified["class"] == approx(0.1) + assert q_res.total_bp_classified["superkingdom"] == 20 def test_QueryTaxResult_build_summarized_result_skipped_lineage(): "build summarized_result with skipped lineage" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=['gB']) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=["gB"] + ) q_res.build_summarized_result() print(q_res.summarized_lineage_results.keys()) - print(q_res.summarized_lineage_results['superkingdom']) - - sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2, - lineage=RankLineageInfo(lineage_str="a"), - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8, - bp_match_at_rank=80, query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['superkingdom'] == sk - print(q_res.summarized_lineage_results['phylum']) - phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=RankLineageInfo(), f_weighted_at_rank=0.8, bp_match_at_rank=80, - query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['phylum'] == phy - print(q_res.summarized_lineage_results['class']) - cl = [SummarizedGatherResult(rank='class', fraction=0.1,lineage=RankLineageInfo(lineage_str="a;b;c"), - f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), - SummarizedGatherResult(rank='class', fraction=0.9, lineage=RankLineageInfo(), f_weighted_at_rank=0.8, bp_match_at_rank=80, - query_ani_at_rank=None)] - assert q_res.summarized_lineage_results['class'] == cl - - assert q_res.total_f_weighted['phylum'] == approx(0.2) - assert q_res.total_f_classified['class'] == approx(0.1) - assert q_res.total_bp_classified['superkingdom'] == 20 + print(q_res.summarized_lineage_results["superkingdom"]) + + sk = [ + SummarizedGatherResult( + rank="superkingdom", + fraction=0.1, + f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a"), + bp_match_at_rank=20, + query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="superkingdom", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["superkingdom"] == sk + print(q_res.summarized_lineage_results["phylum"]) + phy = [ + SummarizedGatherResult( + rank="phylum", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.2, + bp_match_at_rank=20, + query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="phylum", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert 
q_res.summarized_lineage_results["phylum"] == phy + print(q_res.summarized_lineage_results["class"]) + cl = [ + SummarizedGatherResult( + rank="class", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), + f_weighted_at_rank=0.2, + bp_match_at_rank=20, + query_ani_at_rank=approx(0.928, rel=1e-2), + ), + SummarizedGatherResult( + rank="class", + fraction=0.9, + lineage=RankLineageInfo(), + f_weighted_at_rank=0.8, + bp_match_at_rank=80, + query_ani_at_rank=None, + ), + ] + assert q_res.summarized_lineage_results["class"] == cl + + assert q_res.total_f_weighted["phylum"] == approx(0.2) + assert q_res.total_f_classified["class"] == approx(0.1) + assert q_res.total_bp_classified["superkingdom"] == 20 def test_QueryTaxResult_build_summarized_result_over100percent(): "summarize up ranks: different values" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB','f_unique_to_query': 0.95}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB", "f_unique_to_query": 0.95}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) # now summarize up the ranks assert len(q_res.raw_taxresults) == 2 with pytest.raises(ValueError) as exc: q_res.build_summarized_result() print(str(exc)) - assert "Summarized fraction is > 100% of the query! This should not be possible" in str(exc) + assert ( + "Summarized fraction is > 100% of the query! This should not be possible" + in str(exc) + ) def test_build_summarized_result_rank_fail_not_available_resummarize(): "build classification result" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.summarize_up_ranks('superkingdom') + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.summarize_up_ranks("superkingdom") with pytest.raises(ValueError) as exc: - q_res.build_summarized_result(single_rank='order') + q_res.build_summarized_result(single_rank="order") print(str(exc)) assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) @@ -2386,15 +3349,31 @@ def test_aggregate_by_lineage_at_rank(): gB_tax = ("gB", "a;c") taxD = make_mini_taxonomy([gA_tax, gB_tax]) # make gather results - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.4,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - summarized, all_queries = aggregate_by_lineage_at_rank([q_res], rank='phylum', by_query=False) + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.5, + "f_unique_to_query": 0.4, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) + summarized, all_queries = aggregate_by_lineage_at_rank( + [q_res], rank="phylum", by_query=False + ) print(summarized) - assert summarized == {'a;b': 0.4, - 'a;c': 0.3, - 'unclassified': approx(0.3, rel=1e-2)} - assert 
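# --- editor's aside: illustrative sketch, not part of the patch ---
# The over-100% test above works because the two matches' f_unique_to_query
# values (0.1 from the helper's defaults, 0.95 for gB) sum to 1.05. gather
# assigns each query hash to at most one match, so a summed fraction above
# 1.0 can only mean corrupt input, and summarization refuses it. A sketch
# of that guard (check_total_fraction is hypothetical, not sourmash code):

def check_total_fraction(fractions, tol=1e-9):
    total = sum(fractions)
    if total > 1.0 + tol:
        raise ValueError(
            "Summarized fraction is > 100% of the query! "
            "This should not be possible."
        )
    return total

# check_total_fraction([0.1, 0.95])  # -> ValueError
# --- end aside ---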
all_queries == ['queryA'] + assert summarized == {"a;b": 0.4, "a;c": 0.3, "unclassified": approx(0.3, rel=1e-2)} + assert all_queries == ["queryA"] def test_aggregate_by_lineage_at_rank_not_available(): @@ -2404,11 +3383,27 @@ def test_aggregate_by_lineage_at_rank_not_available(): gB_tax = ("gB", "a;c") taxD = make_mini_taxonomy([gA_tax, gB_tax]) # make gather results - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.4,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.5, + "f_unique_to_query": 0.4, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: - aggregate_by_lineage_at_rank([q_res], rank='species', by_query=False) + aggregate_by_lineage_at_rank([q_res], rank="species", by_query=False) print(str(exc)) assert "Error: rank 'species' not available for aggregation." in str(exc) @@ -2420,49 +3415,85 @@ def test_aggregate_by_lineage_at_rank_by_query(): gB_tax = ("gB", "a;c") taxD = make_mini_taxonomy([gA_tax, gB_tax]) # make gather results - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}, - {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.4,'unique_intersect_bp': 30}] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + { + "query_name": "queryB", + "name": "gB", + "f_unique_weighted": 0.4, + "f_unique_to_query": 0.4, + "unique_intersect_bp": 30, + }, + ] gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True) # check by query - summarized, all_queries = aggregate_by_lineage_at_rank(gres.values(), rank='superkingdom', by_query=True) + summarized, all_queries = aggregate_by_lineage_at_rank( + gres.values(), rank="superkingdom", by_query=True + ) print(summarized) - assert summarized == {"a": {'queryA': 0.5, 'queryB': 0.4}, - "unclassified": {'queryA': 0.5, 'queryB': 0.6}} - #assert summarized == {'a': {'queryA': approx(0.1, rel=1e-2), 'queryB': 0.7}} - assert all_queries == ['queryA', 'queryB'] - summarized, all_queries = aggregate_by_lineage_at_rank(gres.values(), rank='phylum', by_query=True) + assert summarized == { + "a": {"queryA": 0.5, "queryB": 0.4}, + "unclassified": {"queryA": 0.5, "queryB": 0.6}, + } + # assert summarized == {'a': {'queryA': approx(0.1, rel=1e-2), 'queryB': 0.7}} + assert all_queries == ["queryA", "queryB"] + summarized, all_queries = aggregate_by_lineage_at_rank( + gres.values(), rank="phylum", by_query=True + ) print(summarized) - assert summarized == {'a;c': {'queryA': 0.3, 'queryB': 0.4}, - 'a;b': {'queryA': 0.2}, - "unclassified": {'queryA': 0.5, 
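# --- editor's aside: illustrative sketch, not part of the patch ---
# aggregate_by_lineage_at_rank(), exercised in the tests here, pivots
# per-query summaries into {lineage: fraction}, or with by_query=True into
# {lineage: {query_name: fraction}} plus the list of query names. A toy
# version over (query, lineage, fraction) triples (hypothetical, not the
# sourmash implementation):

def aggregate(rows, by_query=False):
    agg = {}
    queries = []
    for query, lineage, frac in rows:
        if query not in queries:
            queries.append(query)
        if by_query:
            agg.setdefault(lineage, {})[query] = frac
        else:
            agg[lineage] = agg.get(lineage, 0.0) + frac
    return agg, queries

rows = [("queryA", "a;b", 0.2), ("queryA", "a;c", 0.3),
        ("queryB", "a;c", 0.4), ("queryA", "unclassified", 0.5),
        ("queryB", "unclassified", 0.6)]
summarized, all_queries = aggregate(rows, by_query=True)
# summarized == {'a;b': {'queryA': 0.2},
#                'a;c': {'queryA': 0.3, 'queryB': 0.4},
#                'unclassified': {'queryA': 0.5, 'queryB': 0.6}}
# all_queries == ['queryA', 'queryB']
# --- end aside ---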
'queryB': 0.6}} - + assert summarized == { + "a;c": {"queryA": 0.3, "queryB": 0.4}, + "a;b": {"queryA": 0.2}, + "unclassified": {"queryA": 0.5, "queryB": 0.6}, + } + def test_build_classification_result_containment_threshold_fail(): "classification result: improper containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) with pytest.raises(ValueError) as exc: q_res.build_classification_result(containment_threshold=1.2) print(str(exc)) - assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str(exc) + assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str( + exc + ) with pytest.raises(ValueError) as exc: - q_res.build_classification_result(containment_threshold=-.1) + q_res.build_classification_result(containment_threshold=-0.1) print(str(exc)) - assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc) + assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str( + exc + ) def test_build_classification_result_containment_threshold(): "basic functionality: build classification result using containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) q_res.build_classification_result(containment_threshold=0.1) print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 @@ -2471,8 +3502,8 @@ def test_build_classification_result_containment_threshold(): q_res.build_classification_result(containment_threshold=0.2) print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'phylum' + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "phylum" assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b") assert q_res.classification_result.f_weighted_at_rank == 0.4 assert q_res.classification_result.fraction == 0.2 @@ -2481,8 +3512,8 @@ def test_build_classification_result_containment_threshold(): q_res.build_classification_result(containment_threshold=1.0) print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'below_threshold' - assert q_res.classification_result.rank == 'superkingdom' + assert q_res.classification_result.status == "below_threshold" + assert q_res.classification_result.rank == "superkingdom" assert q_res.classification_result.fraction == 0.2 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a") assert q_res.classification_result.f_weighted_at_rank == 0.4 @@ -2493,23 +3524,25 @@ def 
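# --- editor's aside: illustrative sketch, not part of the patch ---
# The three containment-threshold calls above pin down the classification
# walk: starting at the most specific summarized rank, report the first
# lineage whose containment meets the threshold; if none qualifies, fall
# back to the most general rank with status 'below_threshold'. Standalone
# sketch (classify is a hypothetical stand-in, not the sourmash code):

def classify(per_rank, threshold):
    """per_rank: [(rank, lineage, containment)], most -> least specific."""
    if not 0 <= threshold <= 1:
        raise ValueError(
            "Containment threshold must be between 0 and 1 "
            f"(input value: {threshold})."
        )
    for rank, lineage, containment in per_rank:
        if containment >= threshold:
            return "match", rank, lineage, containment
    rank, lineage, containment = per_rank[-1]  # most general rank
    return "below_threshold", rank, lineage, containment

per_rank = [("class", "a;b;c", 0.1), ("phylum", "a;b", 0.2),
            ("superkingdom", "a", 0.2)]
print(classify(per_rank, 0.1))  # ('match', 'class', 'a;b;c', 0.1)
print(classify(per_rank, 0.2))  # ('match', 'phylum', 'a;b', 0.2)
print(classify(per_rank, 1.0))  # ('below_threshold', 'superkingdom', 'a', 0.2)
# --- end aside ---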
test_build_classification_result_containment_threshold(): def test_build_classification_result_ani_threshold(): "basic functionality: build classification result" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) - q_res.build_classification_result(ani_threshold=.92) + q_res.build_classification_result(ani_threshold=0.92) print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 assert q_res.classification_result.bp_match_at_rank == 20 assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) - q_res.build_classification_result(ani_threshold=0.94) # should classify at phylum + q_res.build_classification_result(ani_threshold=0.94) # should classify at phylum print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'phylum' + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "phylum" assert q_res.classification_result.fraction == 0.2 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b") assert q_res.classification_result.f_weighted_at_rank == 0.4 @@ -2519,8 +3552,8 @@ def test_build_classification_result_ani_threshold(): # superk result, but doesn't meet ANI threshold q_res.build_classification_result(ani_threshold=0.96) print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'below_threshold' - assert q_res.classification_result.rank == 'superkingdom' + assert q_res.classification_result.status == "below_threshold" + assert q_res.classification_result.rank == "superkingdom" assert q_res.classification_result.fraction == 0.2 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a") assert q_res.classification_result.f_weighted_at_rank == 0.4 @@ -2531,14 +3564,16 @@ def test_build_classification_result_ani_threshold(): def test_build_classification_result_ani_threshold_fail(): "classification result: improper ANI threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) with pytest.raises(ValueError) as exc: q_res.build_classification_result(ani_threshold=1.2) print(str(exc)) assert "ANI threshold must be between 0 and 1 (input value: 1.2)." in str(exc) with pytest.raises(ValueError) as exc: - q_res.build_classification_result(ani_threshold=-.1) + q_res.build_classification_result(ani_threshold=-0.1) print(str(exc)) assert "ANI threshold must be between 0 and 1 (input value: -0.1)." 
in str(exc) @@ -2546,22 +3581,28 @@ def test_build_classification_result_ani_threshold_fail(): def test_build_classification_result_rank_fail_not_filled(): "classification result: rank not available (wasn't filled in tax lineage matches)" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) with pytest.raises(ValueError) as exc: - q_res.build_classification_result(rank='order') + q_res.build_classification_result(rank="order") print(str(exc)) - assert "Error: rank 'order' was not available for any matching lineages." in str(exc) + assert "Error: rank 'order' was not available for any matching lineages." in str( + exc + ) def test_build_classification_result_rank_fail_not_available_resummarize(): "classification result: rank not available (wasn't summarized)" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.summarize_up_ranks('superkingdom') + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.summarize_up_ranks("superkingdom") with pytest.raises(ValueError) as exc: - q_res.build_classification_result(rank='order') + q_res.build_classification_result(rank="order") print(str(exc)) assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) @@ -2569,33 +3610,40 @@ def test_build_classification_result_rank_fail_not_available_resummarize(): def test_build_classification_result_rank_fail_not_available(): "classification result: rank not available" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) with pytest.raises(ValueError) as exc: - q_res.build_classification_result(rank='NotARank') + q_res.build_classification_result(rank="NotARank") print(str(exc)) - assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc) + assert ( + "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" + in str(exc) + ) def test_build_classification_result_rank_containment_threshold(): "classification result - rank and containment threshold (default)" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) - q_res.build_classification_result(rank='class') + q_res.build_classification_result(rank="class") print("classif: ", q_res.classification_result) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert 
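# --- editor's aside: illustrative sketch, not part of the patch ---
# The ANI-threshold variant tested above walks the same ranks but compares
# query_ani_at_rank instead of containment; the class-level ANI (~0.928)
# sits between the 0.92 and 0.94 thresholds, which is what flips the result
# from class to phylum. Sketch (classify_by_ani is hypothetical, not the
# sourmash implementation):

def classify_by_ani(per_rank, ani_threshold):
    """per_rank: [(rank, lineage, ani-or-None)], most -> least specific."""
    if not 0 <= ani_threshold <= 1:
        raise ValueError(
            f"ANI threshold must be between 0 and 1 (input value: {ani_threshold})."
        )
    for rank, lineage, ani in per_rank:
        if ani is not None and ani >= ani_threshold:
            return "match", rank, lineage
    rank, lineage, _ = per_rank[-1]
    return "below_threshold", rank, lineage

per_rank = [("class", "a;b;c", 0.928), ("phylum", "a;b", 0.949),
            ("superkingdom", "a", 0.949)]
print(classify_by_ani(per_rank, 0.92))  # ('match', 'class', 'a;b;c')
print(classify_by_ani(per_rank, 0.94))  # ('match', 'phylum', 'a;b')
print(classify_by_ani(per_rank, 0.96))  # ('below_threshold', 'superkingdom', 'a')
# --- end aside ---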
q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 assert q_res.classification_result.bp_match_at_rank == 20 assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) - q_res.build_classification_result(rank='class', containment_threshold=0.4) - assert q_res.classification_result.status == 'below_threshold' - assert q_res.classification_result.rank == 'class' + q_res.build_classification_result(rank="class", containment_threshold=0.4) + assert q_res.classification_result.status == "below_threshold" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 @@ -2606,21 +3654,23 @@ def test_build_classification_result_rank_containment_threshold(): def test_build_classification_result_rank_ani_threshold(): "classification result with rank and ANI threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - - q_res.build_classification_result(rank='class', ani_threshold=0.92) - assert q_res.classification_result.status == 'match' - assert q_res.classification_result.rank == 'class' + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + + q_res.build_classification_result(rank="class", ani_threshold=0.92) + assert q_res.classification_result.status == "match" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 assert q_res.classification_result.bp_match_at_rank == 20 assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) - q_res.build_classification_result(rank='class', ani_threshold=0.95) - assert q_res.classification_result.status == 'below_threshold' - assert q_res.classification_result.rank == 'class' + q_res.build_classification_result(rank="class", ani_threshold=0.95) + assert q_res.classification_result.status == "below_threshold" + assert q_res.classification_result.rank == "class" assert q_res.classification_result.fraction == 0.1 assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") assert q_res.classification_result.f_weighted_at_rank == 0.2 @@ -2631,55 +3681,63 @@ def test_build_classification_result_rank_ani_threshold(): def test_krona_classified(): "basic functionality: build classification result using containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) q_res.build_classification_result() - assert q_res.krona_classified == None - q_res.build_classification_result(rank='phylum')#, force_resummarize=True) + assert q_res.krona_classified is None + q_res.build_classification_result(rank="phylum") # , force_resummarize=True) print(q_res.krona_classified) - assert q_res.krona_classified == (0.2, 'a', 'b') - assert 
q_res.krona_unclassified == (0.8, 'unclassified', 'unclassified') - q_res.build_classification_result(rank='superkingdom') + assert q_res.krona_classified == (0.2, "a", "b") + assert q_res.krona_unclassified == (0.8, "unclassified", "unclassified") + q_res.build_classification_result(rank="superkingdom") print(q_res.krona_classified) - assert q_res.krona_classified == (0.2, 'a') - assert q_res.krona_unclassified == (0.8, 'unclassified') + assert q_res.krona_classified == (0.2, "a") + assert q_res.krona_unclassified == (0.8, "unclassified") # make sure this goes back to None if we reclassify without rank q_res.build_classification_result() - assert q_res.krona_classified == None - assert q_res.krona_unclassified == None + assert q_res.krona_classified is None + assert q_res.krona_unclassified is None assert q_res.krona_header == [] def test_make_krona_header_basic(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] + gather_results = [{}, {"name": "gB"}] phy_header = ["fraction", "superkingdom", "phylum"] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.build_classification_result(rank='phylum') + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.build_classification_result(rank="phylum") print(q_res.krona_classified) print(q_res.krona_header) assert q_res.krona_header == phy_header - hd = q_res.make_krona_header('phylum') + hd = q_res.make_krona_header("phylum") print("header: ", hd) assert hd == phy_header def test_make_krona_header_basic_1(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] + gather_results = [{}, {"name": "gB"}] class_header = ["fraction", "superkingdom", "phylum", "class"] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.build_classification_result(rank='class') + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.build_classification_result(rank="class") assert q_res.krona_header == class_header - hd = q_res.make_krona_header(min_rank='class') + hd = q_res.make_krona_header(min_rank="class") print("header: ", hd) assert hd == class_header def test_make_krona_header_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_krona_header("order") assert "Rank 'order' not present in summarized ranks." 
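# --- editor's aside: illustrative sketch, not part of the patch ---
# The krona assertions above show the output shape: a fraction followed by
# one lineage name per rank down to the classification rank, a matching
# 'unclassified' row, and a ["fraction", <ranks...>] header. Sketch
# (krona_rows/krona_header are hypothetical helpers, not sourmash code):

RANK_ORDER = ["superkingdom", "phylum", "class", "order",
              "family", "genus", "species", "strain"]

def krona_rows(fraction, lineage, rank):
    depth = RANK_ORDER.index(rank) + 1
    names = lineage.split(";")[:depth]
    classified = (fraction, *names)
    unclassified = (round(1.0 - fraction, 10), *(["unclassified"] * depth))
    return classified, unclassified

def krona_header(rank):
    return ["fraction"] + RANK_ORDER[: RANK_ORDER.index(rank) + 1]

print(krona_rows(0.2, "a;b;c", "phylum"))
# ((0.2, 'a', 'b'), (0.8, 'unclassified', 'unclassified'))
print(krona_header("phylum"))  # ['fraction', 'superkingdom', 'phylum']
# --- end aside ---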
in str(exc.value) @@ -2690,305 +3748,740 @@ def test_make_krona_header_fail(): def test_make_human_summary(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - hs = q_res.make_human_summary(display_rank = "superkingdom") + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) + hs = q_res.make_human_summary(display_rank="superkingdom") print(hs) - assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': "0"}, - {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "superkingdom", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "60.0%", + "bp_match_at_rank": "60", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_human_summary_2(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - hs = q_res.make_human_summary(display_rank = "phylum") + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) + hs = q_res.make_human_summary(display_rank="phylum") print(hs) - assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': "0"}, - {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "phylum", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "60.0%", + "bp_match_at_rank": "60", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_human_summary_classification(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, 
single_query=True, classify=True, classify_rank="superkingdom") - hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + classify=True, + classify_rank="superkingdom", + ) + hs = q_res.make_human_summary(display_rank="superkingdom", classification=True) print(hs) - assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", - 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + } + ] def test_make_human_summary_classification_2(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum") - hs = q_res.make_human_summary(display_rank = "phylum", classification=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + classify=True, + classify_rank="phylum", + ) + hs = q_res.make_human_summary(display_rank="phylum", classification=True) print(hs) - assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", - 'query_ani_at_rank': '94.9%', 'status': 'match', - 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + } + ] def test_make_full_summary(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, fs = q_res.make_full_summary() - assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', - 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + assert header == [ + "query_name", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + "total_weighted_hashes", + ] print(fs) - assert fs == [{'rank': 'superkingdom', 'fraction': '0.2', 'lineage': 'a', 'f_weighted_at_rank': '0.4', - 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'superkingdom', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': - '0.6', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, - 'query_name': 
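# --- editor's aside: illustrative sketch, not part of the patch ---
# The human-summary expectations above fix the display formatting: fraction
# to three decimals, f_weighted and ANI as one-decimal percentages, and a
# dash placeholder when ANI is unavailable. Sketch of one row (human_row is
# hypothetical; the real dicts also carry query metadata fields):

def human_row(rank, lineage, fraction, f_weighted, bp, ani):
    return {
        "rank": rank,
        "lineage": lineage or "unclassified",
        "fraction": f"{fraction:.3f}",
        "f_weighted_at_rank": f"{f_weighted * 100:.1f}%",
        "bp_match_at_rank": str(bp),
        "query_ani_at_rank": f"{ani * 100:.1f}%" if ani is not None else "- ",
    }

print(human_row("superkingdom", "a", 0.2, 0.4, 40, 0.949))
# {'rank': 'superkingdom', 'lineage': 'a', 'fraction': '0.200',
#  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': '40',
#  'query_ani_at_rank': '94.9%'}
# --- end aside ---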
'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.2', 'lineage': 'a;b', 'f_weighted_at_rank': '0.4', - 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;d','f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] - + assert fs == [ + { + "rank": "superkingdom", + "fraction": "0.2", + "lineage": "a", + "f_weighted_at_rank": "0.4", + "bp_match_at_rank": "40", + "query_ani_at_rank": approx(0.949, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.2", + "lineage": "a;b", + "f_weighted_at_rank": "0.4", + "bp_match_at_rank": "40", + "query_ani_at_rank": approx(0.949, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;d", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] + header, fs = q_res.make_full_summary(limit_float=True) - assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', - 
'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + assert header == [ + "query_name", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + "total_weighted_hashes", + ] print(fs) - assert fs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', 'f_weighted_at_rank': '0.400', - 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': - '0.600', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '0.400', - 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;d','f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] + assert fs == [ + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "0.400", + "bp_match_at_rank": "40", + "query_ani_at_rank": "0.949", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "0.400", + "bp_match_at_rank": "40", + "query_ani_at_rank": "0.949", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.100", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + 
"fraction": "0.100", + "lineage": "a;b;d", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_full_summary_summarization_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=False + ) with pytest.raises(ValueError) as exc: q_res.make_full_summary() print(str(exc)) - assert 'not summarized yet' in str(exc) + assert "not summarized yet" in str(exc) def test_make_full_summary_classification(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, classify=True + ) header, fs = q_res.make_full_summary(classification=True) - assert header == ["query_name", "status", "rank", "fraction", "lineage", - "query_md5", "query_filename", "f_weighted_at_rank", - "bp_match_at_rank", "query_ani_at_rank"] + assert header == [ + "query_name", + "status", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + ] print(fs) - assert fs == [{'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), - 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn'}] + assert fs == [ + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + } + ] + - def test_make_full_summary_classification_limit_float(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, classify=True + ) header, fs = q_res.make_full_summary(classification=True, limit_float=True) - assert header == ["query_name", "status", "rank", "fraction", "lineage", - "query_md5", "query_filename", "f_weighted_at_rank", - "bp_match_at_rank", "query_ani_at_rank"] + assert header == [ + "query_name", + "status", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + ] print(fs) - assert fs == [{'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", - 'status': 
'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn'}] + assert fs == [ + { + "rank": "class", + "fraction": "0.100", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + } + ] def test_make_full_summary_classification_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_full_summary(classification=True) print(str(exc)) - assert 'not classified yet' in str(exc) + assert "not classified yet" in str(exc) def test_make_kreport_results(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;c;d;e;f;g")]) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, krepD = q_res.make_kreport_results() print(krepD) - assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': None}, - {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', - 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': None}, - {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': None}] + assert krepD == [ + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "D", + "sci_name": "a", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "60", + "percent_containment": "60.00", + "num_bp_contained": "60", + "sci_name": "unclassified", + "rank_code": "U", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "P", + "sci_name": "b", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "C", + "sci_name": "c", + "ncbi_taxid": None, + }, + { + 
"num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "O", + "sci_name": "d", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "F", + "sci_name": "e", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "G", + "sci_name": "f", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "20", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "S", + "sci_name": "g", + "ncbi_taxid": None, + }, + ] def test_make_kreport_results_with_taxids(): - taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + taxD = make_mini_taxonomy_with_taxids( + [("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")] + ) print(taxD) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, krepD = q_res.make_kreport_results() print(krepD) - assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': '1'}, - {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', - 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': '2'}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': '3'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': '4'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': '5'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': '6'}, - {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': '7'}] + assert krepD == [ + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "D", + "sci_name": "a", + "ncbi_taxid": "1", + }, + { + "num_bp_assigned": "60", + "percent_containment": "60.00", + "num_bp_contained": "60", + "sci_name": "unclassified", + "rank_code": "U", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "P", + "sci_name": "b", + "ncbi_taxid": "2", + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "C", + "sci_name": "c", + "ncbi_taxid": "3", + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "O", + "sci_name": "d", + "ncbi_taxid": "4", + }, + { + 
"num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "F", + "sci_name": "e", + "ncbi_taxid": "5", + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "G", + "sci_name": "f", + "ncbi_taxid": "6", + }, + { + "num_bp_assigned": "20", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "S", + "sci_name": "g", + "ncbi_taxid": "7", + }, + ] def test_make_kreport_results_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=False + ) with pytest.raises(ValueError) as exc: q_res.make_kreport_results() print(str(exc)) - assert 'not summarized yet' in str(exc) + assert "not summarized yet" in str(exc) def test_make_kreport_results_fail_pre_v450(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_kreport_results() print(str(exc)) - assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) + assert ( + "cannot produce 'kreport' format from gather results before sourmash v4.5.0" + in str(exc) + ) def test_make_cami_results_with_taxids(): - taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + taxD = make_mini_taxonomy_with_taxids( + [("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")] + ) print(taxD) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, camires = q_res.make_cami_bioboxes() print(camires) - assert camires == [['1', 'superkingdom', '1', 'a', '40.00'], - ['2', 'phylum', '1|2', 'a|b', '40.00'], - ['3', 'class', '1|2|3', 'a|b|c', '40.00'], - ['4', 'order', '1|2|3|4', 'a|b|c|d', '20.00'], - ['5', 'family', '1|2|3|4|5', 'a|b|c|d|e', '20.00'], - ['6', 'genus', '1|2|3|4|5|6', 'a|b|c|d|e|f', '20.00'], - ['7', 'species', '1|2|3|4|5|6|7', 'a|b|c|d|e|f|g', '20.00']] + assert camires == [ + ["1", "superkingdom", "1", "a", "40.00"], + ["2", "phylum", "1|2", "a|b", "40.00"], + ["3", "class", "1|2|3", "a|b|c", "40.00"], + ["4", "order", "1|2|3|4", "a|b|c|d", "20.00"], + ["5", "family", "1|2|3|4|5", "a|b|c|d|e", "20.00"], + ["6", "genus", "1|2|3|4|5|6", "a|b|c|d|e|f", "20.00"], + ["7", "species", "1|2|3|4|5|6|7", "a|b|c|d|e|f|g", "20.00"], + ] def test_make_lingroup_results(): - taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) + taxD = 
make_mini_taxonomy( + [("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True + ) print(taxD) - lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + lingroupD = {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} print(lingroupD) - gather_results = [{"total_weighted_hashes":100}, - {"name": 'gB', "total_weighted_hashes":100}, - {"name": 'gC', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + {"name": "gC", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + summarize=True, + LIN=True, + ) print(q_res.summarized_lineage_results) - header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) + header, lgD = q_res.make_lingroup_results(LINgroupsD=lingroupD) print(header) - assert header == ['name', 'lin', 'percent_containment', 'num_bp_contained'] + assert header == ["name", "lin", "percent_containment", "num_bp_contained"] # order may change, just check that each lg entry is present in list of results - lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', - 'lin': '1', 'name': 'lg1'} - lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', - 'lin': '1;0', 'name': 'lg2'} - lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', - 'lin': '1;1', 'name': 'lg3'} + lg1 = { + "percent_containment": "60.00", + "num_bp_contained": "60", + "lin": "1", + "name": "lg1", + } + lg2 = { + "percent_containment": "40.00", + "num_bp_contained": "40", + "lin": "1;0", + "name": "lg2", + } + lg3 = { + "percent_containment": "20.00", + "num_bp_contained": "20", + "lin": "1;1", + "name": "lg3", + } assert lg1 in lgD assert lg2 in lgD assert lg3 in lgD def test_make_lingroup_results_fail_pre_v450(): - taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) - lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + taxD = make_mini_taxonomy( + [("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True + ) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + summarize=True, + LIN=True, + ) + lingroupD = {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} with pytest.raises(ValueError) as exc: q_res.make_lingroup_results(lingroupD) print(str(exc)) - assert "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" in str(exc) + assert ( + "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" + in str(exc) + ) def test_read_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') - out.write('1,lg1\n') - out.write('1;0,lg2\n') - out.write('1;1,lg3\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") + out.write("1,lg1\n") + out.write("1;0,lg2\n") + out.write("1;1,lg3\n") lgD = read_lingroups(lg_file) - assert lgD == {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + assert lgD == {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} + def test_read_lingroups_empty_file(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: + with open(lg_file, "w") as out: out.write("") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) @@ 
-2998,8 +4491,8 @@ def test_read_lingroups_empty_file(runtmp): def test_read_lingroups_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) @@ -3008,8 +4501,8 @@ def test_read_lingroups_only_header(runtmp): def test_read_lingroups_bad_header(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('LINgroup_pfx,LINgroup_nm\n') + with open(lg_file, "w") as out: + out.write("LINgroup_pfx,LINgroup_nm\n") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) @@ -3021,8 +4514,10 @@ def test_LineageTree_init(): lin1 = RankLineageInfo(lineage_str=x) print(lin1) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } + def test_LineageTree_init_mult(): x = "a;b" @@ -3031,10 +4526,14 @@ def test_LineageTree_init_mult(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1, lin2]) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_LineageTree_init_and_add_lineage(): @@ -3044,13 +4543,18 @@ def test_LineageTree_init_and_add_lineage(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } tree.add_lineage(lin2) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_LineageTree_init_and_add_lineages(): @@ -3060,13 +4564,18 @@ def test_LineageTree_init_and_add_lineages(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } tree.add_lineages([lin2]) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_build_tree_RankLineageInfo(): @@ -3074,8 +4583,9 @@ def test_build_tree_RankLineageInfo(): lin1 = RankLineageInfo(lineage_str=x) print(lin1) tree = 
LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } def test_build_tree_LINLineageInfo(): @@ -3083,8 +4593,7 @@ def test_build_tree_LINLineageInfo(): lin1 = LINLineageInfo(lineage_str=x) print(lin1) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('0', '0'): - { LineagePair('1', '3') : {}} } + assert tree.tree == {LineagePair("0", "0"): {LineagePair("1", "3"): {}}} def test_build_tree_2(): @@ -3094,68 +4603,96 @@ def test_build_tree_2(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) print(lin2) - tree = LineageTree([lin1,lin2]) + tree = LineageTree([lin1, lin2]) - assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, - LineagePair('phylum', 'c') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): { + LineagePair("phylum", "b"): {}, + LineagePair("phylum", "c"): {}, + } + } def test_build_tree_2_LineagePairs(): # build tree from LineagePairs - tree = LineageTree([[LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b')], - [LineagePair('superkingdom', 'a'), LineagePair('phylum', 'c')], - ]) + tree = LineageTree( + [ + [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")], + [LineagePair("superkingdom", "a"), LineagePair("phylum", "c")], + ] + ) - assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, - LineagePair('phylum', 'c') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): { + LineagePair("phylum", "b"): {}, + LineagePair("phylum", "c"): {}, + } + } def test_build_tree_3(): # empty phylum name - x='a;' + x = "a;" lin1 = RankLineageInfo(lineage_str=x) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): {} } + assert tree.tree == {LineagePair("superkingdom", "a"): {}} def test_build_tree_3_LineagePairs(): # empty phylum name: LineagePair input - lin1 = (LineagePair('superkingdom', "a", '3'), - LineagePair('phylum', '', ''),) + lin1 = ( + LineagePair("superkingdom", "a", "3"), + LineagePair("phylum", "", ""), + ) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a', '3'): {} } + assert tree.tree == {LineagePair("superkingdom", "a", "3"): {}} def test_build_tree_5(): with pytest.raises(ValueError): - tree = LineageTree([]) + LineageTree([]) def test_build_tree_5b(): with pytest.raises(ValueError): - tree = LineageTree("") + LineageTree("") def test_build_tree_iterable(): with pytest.raises(ValueError) as exc: - tree = LineageTree(RankLineageInfo()) - assert "Must pass in an iterable containing LineagePair or LineageInfo objects" in str(exc) + LineageTree(RankLineageInfo()) + assert ( + "Must pass in an iterable containing LineagePair or LineageInfo objects" + in str(exc) + ) def test_find_lca(): - x='a;b' + x = "a;b" lin1 = RankLineageInfo(lineage_str=x) tree = LineageTree([lin1]) lca = tree.find_lca() - assert lca == ((LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b'),), 0) + assert lca == ( + ( + LineagePair("superkingdom", "a"), + LineagePair("phylum", "b"), + ), + 0, + ) def test_find_lca_LineagePairs(): - tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + tree = LineageTree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) lca = tree.find_lca() - assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) + assert lca == ( + ( + 
LineagePair("rank1", "name1"), + LineagePair("rank2", "name2"), + ), + 0, + ) def test_find_lca_2(): @@ -3167,7 +4704,7 @@ def test_find_lca_2(): tree = LineageTree([lin1, lin2]) lca = tree.find_lca() - assert lca == ((LineagePair('superkingdom', 'a'),), 2) + assert lca == ((LineagePair("superkingdom", "a"),), 2) def test_find_lca_LIN(): @@ -3179,17 +4716,20 @@ def test_find_lca_LIN(): tree = LineageTree([lin1, lin2]) lca = tree.find_lca() - assert lca == ((LineagePair('0', '5'),), 2) + assert lca == ((LineagePair("0", "5"),), 2) print(lca) def test_find_lca_2_LineagePairs(): - tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = LineageTree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) lca = tree.find_lca() - assert lca == ((LineagePair('rank1', 'name1'),), 2) + assert lca == ((LineagePair("rank1", "name1"),), 2) def test_find_lca_3(): @@ -3198,7 +4738,7 @@ def test_find_lca_3(): tree = LineageTree([lin1, lin2]) lca, reason = tree.find_lca() - assert lca == lin1.filled_lineage # find most specific leaf node + assert lca == lin1.filled_lineage # find most specific leaf node print(lca) @@ -3214,12 +4754,17 @@ def test_build_tree_with_initial(): lca = tree.find_lca() print(lca) - assert lca == ((LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None)), 2) + assert lca == ( + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + ), + 2, + ) tree.add_lineages([lin3]) lca2 = tree.find_lca() print(lca2) - assert lca2 == ((LineagePair('superkingdom', 'a'),), 2) + assert lca2 == ((LineagePair("superkingdom", "a"),), 2) def test_LineageTree_find_ordered_paths(): @@ -3234,14 +4779,22 @@ def test_LineageTree_find_ordered_paths(): paths = tree.ordered_paths() print(paths) - assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='e', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='c', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='d', taxid=None))] + assert paths == [ + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="e", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="c", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="d", taxid=None), + ), + ] def test_LineageTree_find_ordered_paths_include_internal(): @@ -3257,14 +4810,24 @@ def test_LineageTree_find_ordered_paths_include_internal(): print(paths) - assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None),), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='e', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', 
name='c', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='d', taxid=None))] + assert paths == [ + (LineagePair(rank="superkingdom", name="a", taxid=None),), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="e", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="c", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="d", taxid=None), + ), + ] diff --git a/tests/test_test_framework.py b/tests/test_test_framework.py index abf7e2c93a..85bb3e1020 100644 --- a/tests/test_test_framework.py +++ b/tests/test_test_framework.py @@ -5,4 +5,4 @@ def test_failed_sourmash_exception(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('') + runtmp.sourmash("") diff --git a/tox.ini b/tox.ini index 0e5602628c..1806e48778 100644 --- a/tox.ini +++ b/tox.ini @@ -1,28 +1,34 @@ [tox] -env_list = - py311, - py312, - py310, - coverage, - docs, - package_description - fix_lint, - hypothesis, - khmer, - khmer_master -min_version = 3.27 isolated_build = true skip_missing_interpreters = true +env_list = + py311, + py312, + py310, + coverage, + docs, + package_description + fix_lint, + hypothesis, + khmer, + khmer_master +min_version = 3.27 [testenv] description = run the tests with pytest under {basepython} +deps = + pip>=19.3.1 +extras = + storage + test +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + {posargs:doc tests} package = wheel -wheel_build_env = .pkg -set_env = - PIP_DISABLE_VERSION_CHECK = 1 - COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} - VIRTUALENV_NO_DOWNLOAD = 1 - PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010 pass_env = TOXENV CURL_CA_BUNDLE @@ -38,140 +44,160 @@ pass_env = PYTHONTRACEMALLOC LIBCLANG_PATH BINDGEN_EXTRA_CLANG_ARGS -deps = - pip >= 19.3.1 -extras = - test - storage -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - {posargs:doc tests} + NIX_LD +set_env = + PIP_DISABLE_VERSION_CHECK = 1 + COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} + VIRTUALENV_NO_DOWNLOAD = 1 + PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010 +wheel_build_env = .pkg [testenv:.pkg] pass_env = - LIBCLANG_PATH - BINDGEN_EXTRA_CLANG_ARGS + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS [testenv:pypy3] deps = - pip >= 19.3.1 - psutil <= 5.6.7 + pip>=19.3.1 + psutil<=5.6.7 [testenv:hypothesis] -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - --run-hypothesis \ - --hypothesis-show-statistics \ - --hypothesis-profile ci \ - {posargs:.} +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + --run-hypothesis \ + --hypothesis-show-statistics \ + 
--hypothesis-profile ci \ + {posargs:.} [testenv:khmer] basepython = python3.10 deps = - khmer -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - -k test_nodegraph \ - {posargs:.} + khmer +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + -k test_nodegraph \ + {posargs:.} [testenv:khmer_master] basepython = python3.10 deps = - -e git+https://github.com/dib-lab/khmer.git\#egg=khmer -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - -k test_nodegraph \ - {posargs:.} + -e +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + -k test_nodegraph \ + {posargs:.} [testenv:asv] description = run asv for benchmarking (compare current commit with latest) deps = - asv==0.5.1 - virtualenv + asv==0.5.1 + virtualenv changedir = {toxinidir} commands = - asv machine --yes - asv continuous latest HEAD {posargs} + asv machine --yes + asv continuous latest HEAD {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.10 -extras = doc +extras = + doc +commands = + sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -bhtml {posargs} + python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' allowlist_externals = pandoc -pass_env = HOME change_dir = {toxinidir} -#commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -W -bhtml {posargs} -commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -bhtml {posargs} - python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' +pass_env = HOME [testenv:package_description] description = check that the long description is valid basepython = python3.10 -deps = twine >= 1.12.1 - # TODO installing readme-renderer[md] should not be necessary - readme-renderer[md] >= 24.0 - pip >= 19.1 skip_install = true -change_dir = {toxinidir} +deps = + pip>=19.1 + readme-renderer[md]>=24 + twine>=1.12.1 extras = -commands = pip wheel -w {envtmpdir}/build --no-deps . - twine check {envtmpdir}/build/* +commands = + pip wheel -w {envtmpdir}/build --no-deps . 
+ twine check {envtmpdir}/build/* +change_dir = {toxinidir} [testenv:mypy] description = run mypy checker basepython = python3.10 +deps = + mypy +commands = + mypy src/sourmash pass_env = {[testenv]pass_env} - # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error - PROGRAMDATA -deps = mypy -commands = mypy src/sourmash + PROGRAMDATA [testenv:fix_lint] description = format the code base to adhere to our styles, and complain about what we cannot do automatically basepython = python3.10 +skip_install = true +deps = + pre-commit>=2 +extras = + lint +commands = + pre-commit run --all-files --show-diff-on-failure {posargs} + python -c 'import pathlib; print("hint: run \{\} install to add checks as pre-commit hook".format(pathlib.Path(r"{envdir}") / "bin" / "pre-commit"))' pass_env = {[testenv]pass_env} - # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error - PROGRAMDATA - PRE_COMMIT_HOME -extras = lint -deps = pre-commit>=2 -skip_install = True -commands = pre-commit run --all-files --show-diff-on-failure {posargs} - python -c 'import pathlib; print("hint: run \{\} install to add checks as pre-commit hook".format(pathlib.Path(r"{envdir}") / "bin" / "pre-commit"))' + PROGRAMDATA + PRE_COMMIT_HOME [testenv:coverage] description = [run locally after tests]: combine coverage data and create report; - generates a diff coverage against origin/latest (can be changed by setting DIFF_AGAINST env var) -deps = {[testenv]deps} - coverage >= 5.0.1 - diff_cover -skip_install = True + generates a diff coverage against origin/latest (can be changed by setting DIFF_AGAINST env var) +skip_install = true +deps = + {[testenv]deps} + coverage>=5.0.1 + diff_cover +parallel_show_output = true +commands = + coverage combine + coverage report -i -m + coverage xml -i -o {toxworkdir}/coverage.xml + coverage html -i -d {toxworkdir}/htmlcov + diff-cover --compare-branch {env:DIFF_AGAINST:origin/latest} {toxworkdir}/coverage.xml +depends = py312, py311, py310, pypy3 pass_env = {[testenv]pass_env} - DIFF_AGAINST + DIFF_AGAINST set_env = COVERAGE_FILE={toxworkdir}/.coverage -commands = coverage combine - coverage report -i -m - coverage xml -i -o {toxworkdir}/coverage.xml - coverage html -i -d {toxworkdir}/htmlcov - diff-cover --compare-branch {env:DIFF_AGAINST:origin/latest} {toxworkdir}/coverage.xml -depends = py312, py311, py310, pypy3 -parallel_show_output = True [testenv:X] description = print the positional arguments passed in with echo -commands = echo {posargs} +commands = + echo {posargs} + +[testenv:dev] +description = dev environment with all deps at {envdir} +usedevelop = true +deps = + {[testenv]deps} +extras = + doc + storage + test +commands = + python -m pip list --format=columns + python -c "print(r'{envpython}')" [coverage:run] branch = true @@ -190,20 +216,20 @@ exclude_lines = [coverage:paths] source = src/sourmash/ - tests/ - */.tox/*/lib/python*/site-packages/sourmash - */.tox/pypy*/site-packages/sourmash - */.tox\*\Lib\site-packages\sourmash - */src/sourmash - *\src\sourmash - */tests - *\tests + tests/ + */.tox/*/lib/python*/site-packages/sourmash + */.tox/pypy*/site-packages/sourmash + */.tox\*\Lib\site-packages\sourmash + */src/sourmash + *\src\sourmash + */tests + *\tests [gh-actions] python = - 3.10: py310, docs, package_description, coverage - 3.11: py311, coverage - 3.12: py312, coverage + 3.10: py310, docs, package_description, coverage + 3.11: py311, coverage + 
3.12: py312, coverage [flake8] max-complexity = 22 @@ -212,14 +238,3 @@ ignore = E203, W503, C901, E402, B011 [pep8] max-line-length = 99 - -[testenv:dev] -description = dev environment with all deps at {envdir} -extras = - test - storage - doc -deps = {[testenv]deps} -usedevelop = True -commands = python -m pip list --format=columns - python -c "print(r'{envpython}')" diff --git a/utils/cardinality_estimate_confidence.py b/utils/cardinality_estimate_confidence.py index 1f8471fbeb..85c6e5cc75 100644 --- a/utils/cardinality_estimate_confidence.py +++ b/utils/cardinality_estimate_confidence.py @@ -13,7 +13,7 @@ def set_size_chernoff(set_size, scale, relative_error=0.05): @param relative_error: the desired relative error (defaults to 5%) @return: float (the upper bound probability) """ - upper_bound = 1 - 2 * np.exp(- relative_error**2*set_size/(scale * 3)) + upper_bound = 1 - 2 * np.exp(-(relative_error**2) * set_size / (scale * 3)) return upper_bound @@ -28,7 +28,9 @@ def get_set_size(scale, num_sketches): return int(np.floor(scale * num_sketches)) -def set_size_estimate_is_accurate(scale, num_sketches, relative_error=0.05, confidence=0.95): +def set_size_estimate_is_accurate( + scale, num_sketches, relative_error=0.05, confidence=0.95 +): set_size = get_set_size(scale, num_sketches) probability = set_size_chernoff(set_size, scale, relative_error) if probability >= confidence: @@ -38,48 +40,96 @@ def set_size_estimate_is_accurate(scale, num_sketches, relative_error=0.05, conf def test_set_size_chernoff(): - eps = 10**(-6) + eps = 10 ** (-6) rel_error = 0.01 set_size = 1000000 - s = 1/0.1 # I'm used to using a scale value between 0 and 1 + s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 value_from_mathematica = 0.928652 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) rel_error = 0.05 set_size = 10000 s = 1 value_from_mathematica = 0.999519 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) rel_error = 0.001 set_size = 10 - s = 1/.01 + s = 1 / 0.01 value_from_mathematica = -1 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) def test_set_size_estimate_is_accurate(): - eps = 10 ** (-6) + 10 ** (-6) rel_error = 0.05 set_size = 1000000 s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 num_sketches = set_size / s # idealized case confidence = 0.95 - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is True + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is True + ) confidence = set_size_chernoff(set_size=set_size, scale=s, relative_error=rel_error) - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is True + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is True + ) # Horrible values - assert set_size_estimate_is_accurate(scale=10000, num_sketches=num_sketches, relative_error=0, confidence=1) is False + assert ( + 
set_size_estimate_is_accurate( + scale=10000, num_sketches=num_sketches, relative_error=0, confidence=1 + ) + is False + ) # Less horrible, but still bad values confidence = set_size_chernoff(set_size=set_size, scale=s, relative_error=rel_error) - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence*2) is False + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence * 2, + ) + is False + ) # one where the confidence is negative - rel_error = .001 + rel_error = 0.001 set_size = 10 s = 100 - num_sketches = set_size/s - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is False - assert set_size_estimate_is_accurate(scale=s, num_sketches=0, relative_error=rel_error, confidence=confidence) is False + num_sketches = set_size / s + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is False + ) + assert ( + set_size_estimate_is_accurate( + scale=s, num_sketches=0, relative_error=rel_error, confidence=confidence + ) + is False + ) def run_tests(): @@ -87,7 +137,7 @@ def run_tests(): test_set_size_estimate_is_accurate() -if __name__ == '__main__': +if __name__ == "__main__": print("Running tests") run_tests() print("Tests completed successfully") diff --git a/utils/check-tree.py b/utils/check-tree.py index 12fc0190de..639e376e3b 100644 --- a/utils/check-tree.py +++ b/utils/check-tree.py @@ -7,12 +7,12 @@ import sourmash from sourmash.sbtmh import search_minhashes -THRESHOLD=0.08 +THRESHOLD = 0.08 def main(): p = argparse.ArgumentParser() - p.add_argument('sbt') + p.add_argument("sbt") args = p.parse_args() db = sourmash.sbtmh.load_sbt_index(args.sbt) @@ -21,11 +21,11 @@ def main(): for leaf in db.leaves(): query = leaf.data matches = db.find(search_minhashes, query, threshold) - matches = list([ x.data for x in matches ]) + matches = list([x.data for x in matches]) if query not in matches: print(query) assert 0 - -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/utils/compute-dna-mh-another-way.py b/utils/compute-dna-mh-another-way.py index aad7198092..c197298d75 100755 --- a/utils/compute-dna-mh-another-way.py +++ b/utils/compute-dna-mh-another-way.py @@ -7,7 +7,9 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
@@ -26,21 +28,24 @@ def reverse(s): def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) -revcomp = reverse(complement((record.sequence))) +print("loaded", record.name, file=sys.stderr) +revcomp = reverse(complement(record.sequence)) mh = sourmash.MinHash(ksize=K, n=500, is_protein=False) @@ -69,5 +74,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s])) diff --git a/utils/compute-input-prot-another-way.py b/utils/compute-input-prot-another-way.py index 5c1202eaee..7dec10d849 100755 --- a/utils/compute-input-prot-another-way.py +++ b/utils/compute-input-prot-another-way.py @@ -7,25 +7,77 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -dna_to_aa={'TTT':'F','TTC':'F', 'TTA':'L','TTG':'L', - 'TCT':'S','TCC':'S','TCA':'S','TCG':'S', - 'TAT':'Y','TAC':'Y', 'TAA':'*','TAG':'*','TGA':'*', - 'TGT':'C','TGC':'C', 'TGG':'W', - 'CTT':'L','CTC':'L','CTA':'L','CTG':'L', - 'CCT':'P','CCC':'P','CCA':'P','CCG':'P', - 'CAT':'H','CAC':'H', 'CAA':'Q','CAG':'Q', - 'CGT':'R','CGC':'R','CGA':'R','CGG':'R', - 'ATT':'I','ATC':'I','ATA':'I', 'ATG':'M', - 'ACT':'T','ACC':'T','ACA':'T','ACG':'T', - 'AAT':'N','AAC':'N', 'AAA':'K','AAG':'K', - 'AGT':'S','AGC':'S', 'AGA':'R','AGG':'R', - 'GTT':'V','GTC':'V','GTA':'V','GTG':'V', - 'GCT':'A','GCC':'A','GCA':'A','GCG':'A', - 'GAT':'D','GAC':'D', 'GAA':'E','GAG':'E', - 'GGT':'G','GGC':'G','GGA':'G','GGG':'G'} - - -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +dna_to_aa = { + "TTT": "F", + "TTC": "F", + "TTA": "L", + "TTG": "L", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "TAT": "Y", + "TAC": "Y", + "TAA": "*", + "TAG": "*", + "TGA": "*", + "TGT": "C", + "TGC": "C", + "TGG": "W", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CAT": "H", + "CAC": "H", + "CAA": "Q", + "CAG": "Q", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATG": "M", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "AAT": "N", + "AAC": "N", + "AAA": "K", + "AAG": "K", + "AGT": "S", + "AGC": "S", + "AGA": "R", + "AGG": "R", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GAT": "D", + "GAC": "D", + "GAA": "E", + "GAG": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", +} + + +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
@@ -44,7 +96,7 @@ def reverse(s): def peptides(seq, start): for i in range(start, len(seq), 3): - yield dna_to_aa.get(seq[i:i+3], "X") + yield dna_to_aa.get(seq[i : i + 3], "X") def translate(seq): @@ -52,27 +104,31 @@ def translate(seq): pep = peptides(seq, i) yield "".join(pep) - revcomp = reverse(complement((seq))) + revcomp = reverse(complement(seq)) for i in range(3): pep = peptides(revcomp, i) yield "".join(pep) + def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) +print("loaded", record.name, file=sys.stderr) mh = sourmash.MinHash(ksize=K, n=500, is_protein=True) prot_ksize = int(K / 3) @@ -86,5 +142,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s])) diff --git a/utils/compute-prot-mh-another-way.py b/utils/compute-prot-mh-another-way.py index 6295204f3b..e859268d05 100755 --- a/utils/compute-prot-mh-another-way.py +++ b/utils/compute-prot-mh-another-way.py @@ -7,25 +7,77 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -dna_to_aa={'TTT':'F','TTC':'F', 'TTA':'L','TTG':'L', - 'TCT':'S','TCC':'S','TCA':'S','TCG':'S', - 'TAT':'Y','TAC':'Y', 'TAA':'*','TAG':'*','TGA':'*', - 'TGT':'C','TGC':'C', 'TGG':'W', - 'CTT':'L','CTC':'L','CTA':'L','CTG':'L', - 'CCT':'P','CCC':'P','CCA':'P','CCG':'P', - 'CAT':'H','CAC':'H', 'CAA':'Q','CAG':'Q', - 'CGT':'R','CGC':'R','CGA':'R','CGG':'R', - 'ATT':'I','ATC':'I','ATA':'I', 'ATG':'M', - 'ACT':'T','ACC':'T','ACA':'T','ACG':'T', - 'AAT':'N','AAC':'N', 'AAA':'K','AAG':'K', - 'AGT':'S','AGC':'S', 'AGA':'R','AGG':'R', - 'GTT':'V','GTC':'V','GTA':'V','GTG':'V', - 'GCT':'A','GCC':'A','GCA':'A','GCG':'A', - 'GAT':'D','GAC':'D', 'GAA':'E','GAG':'E', - 'GGT':'G','GGC':'G','GGA':'G','GGG':'G'} - - -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +dna_to_aa = { + "TTT": "F", + "TTC": "F", + "TTA": "L", + "TTG": "L", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "TAT": "Y", + "TAC": "Y", + "TAA": "*", + "TAG": "*", + "TGA": "*", + "TGT": "C", + "TGC": "C", + "TGG": "W", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CAT": "H", + "CAC": "H", + "CAA": "Q", + "CAG": "Q", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATG": "M", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "AAT": "N", + "AAC": "N", + "AAA": "K", + "AAG": "K", + "AGT": "S", + "AGC": "S", + "AGA": "R", + "AGG": "R", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GAT": "D", + "GAC": "D", + "GAA": "E", + "GAG": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", +} + + +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
@@ -44,7 +96,7 @@ def reverse(s): def peptides(seq, start): for i in range(start, len(seq), 3): - yield dna_to_aa.get(seq[i:i+3], "X") + yield dna_to_aa.get(seq[i : i + 3], "X") def translate(seq): @@ -52,27 +104,31 @@ def translate(seq): pep = peptides(seq, i) yield "".join(pep) - revcomp = reverse(complement((seq))) + revcomp = reverse(complement(seq)) for i in range(3): pep = peptides(revcomp, i) yield "".join(pep) + def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) +print("loaded", record.name, file=sys.stderr) mh = sourmash.MinHash(ksize=K, n=500, is_protein=True) prot_ksize = int(K / 3) @@ -87,5 +143,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s])) From 9f36e2f8c0ef89879924d3c2d44eacdbf6ee795a Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Mon, 5 Feb 2024 10:57:56 -0800 Subject: [PATCH 08/14] MRG: core: add scaled selection to manifest; add helper functions for collection and sig/sketch usage (#2948) This PR adds: ## New functions: - `Collection::sig_from_record` > when we iterate through the sketches, we have both `idx` and `record` available. I thought it would make sense to just use record directly, rather than re-getting record from index. - `Signature::minhash` > if there is one minhash sketch available within the sig that matches selection params, return it - `Signature::get_sketch` > if there is one sketch (of any type) available within the sig that matches selection params, return it. Note that since this returns the sketch enum, it still requires checking MinHash type afterwards. @luizirber is there a way to return any of the sketches directly from the same function (like minhash function, above, but more flexible?). - `Manifest::From<&PathBuf>` > build a `manifest` directly from a pathlist file. Added and tested, but lmk if you think we should just build a list of paths separately. I wanted this for branchwater, but am not actually using it since neither the paths or PathBuf loading code allow missing/failed paths. ## New selection functionality - added `scaled` and `num` selection to manifest. For scaled, if sketch is compatible (equal scaled or can be downsampled), keep it during manifest selection. Otherwise, discard. Tests added for each new function/added code. 
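For illustration, here is a minimal usage sketch combining the new pieces (adapted from the tests added in this PR; the `scaled` value is arbitrary, module paths assume the crate layout shown in this diff, and error handling is collapsed to `unwrap()`):

```rust
use sourmash::collection::Collection;
use sourmash::prelude::Select;
use sourmash::selection::Selection;
use sourmash::signature::Signature;

fn downsample_all(sigs: Vec<Signature>) {
    // keep only sketches at scaled=2000, or that can be downsampled to it
    let mut selection = Selection::default();
    selection.set_scaled(2000);
    let coll = Collection::from_sigs(sigs)
        .unwrap()
        .select(&selection)
        .unwrap();

    for (_idx, record) in coll.iter() {
        // load the signature directly from the record we already have;
        // select() again here so the sketch is actually downsampled
        let sig = coll
            .sig_from_record(record)
            .unwrap()
            .select(&selection)
            .unwrap();
        // exactly one matching MinHash sketch -> returned directly
        let mh = sig.minhash().unwrap();
        assert_eq!(mh.scaled(), 2000);
    }
}
```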
Co-authored-by: Luiz Irber --- src/core/src/collection.rs | 219 +++++++++++++++++++++++++++++++++++++ src/core/src/manifest.rs | 113 ++++++++++++++++++- src/core/src/signature.rs | 141 ++++++++++++++++++++++-- 3 files changed, 465 insertions(+), 8 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index c00b2fd288..8cc6129cf4 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -180,6 +180,14 @@ impl Collection { assert_eq!(sig.signatures.len(), 1); Ok(sig) } + + pub fn sig_from_record(&self, record: &Record) -> Result { + let match_path = record.internal_location().as_str(); + let selection = Selection::from_record(record)?; + let sig = self.storage.load_sig(match_path)?.select(&selection)?; + assert_eq!(sig.signatures.len(), 1); + Ok(sig) + } } impl Select for Collection { @@ -188,3 +196,214 @@ impl Select for Collection { Ok(self) } } + +#[cfg(test)] +mod test { + use camino::Utf8PathBuf as PathBuf; + use std::fs::File; + use std::io::BufReader; + + use super::Collection; + + use crate::encodings::HashFunctions; + use crate::prelude::Select; + use crate::selection::Selection; + use crate::signature::Signature; + + #[test] + fn sigstore_selection_with_downsample() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47+63-multisig.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + // create Selection object + let mut selection = Selection::default(); + selection.set_scaled(2000); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // count collection length + assert_eq!(cl.len(), 6); + for (idx, _rec) in cl.iter() { + // need to pass select again here so we actually downsample + let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap(); + let this_mh = this_sig.minhash().unwrap(); + assert_eq!(this_mh.scaled(), 2000); + } + } + + #[test] + fn sigstore_selection_with_downsample_too_low() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47+63-multisig.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + // create Selection object + let mut selection = Selection::default(); + selection.set_scaled(500); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // no sigs should remain + assert_eq!(cl.len(), 0); + } + + #[test] + fn sigstore_selection_scaled_handle_num_sig() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // four num=500 sigs + filename.push("../../tests/test-data/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + assert_eq!(sigs.len(), 4); + // create Selection object + let mut selection = Selection::default(); + selection.set_scaled(1000); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // no sigs should remain + assert_eq!(cl.len(), 0); + } + + #[test] + fn sigstore_selection_num() 
{ + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // four num=500 sigs + filename.push("../../tests/test-data/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let sigs_copy = sigs.clone(); + assert_eq!(sigs.len(), 4); + // create Selection object + let mut selection = Selection::default(); + selection.set_num(500); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // all sigs should remain + assert_eq!(cl.len(), 4); + //now select diff num and none should remain + selection.set_num(100); + let cl2 = Collection::from_sigs(sigs_copy) + .unwrap() + .select(&selection) + .unwrap(); + assert_eq!(cl2.len(), 0); + } + + #[test] + fn sigstore_selection_num_handle_scaled_sig() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // four num=500 sigs + filename.push("../../tests/test-data/47+63-multisig.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + assert_eq!(sigs.len(), 6); + // create Selection object + let mut selection = Selection::default(); + selection.set_num(500); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // no sigs should remain + assert_eq!(cl.len(), 0); + } + + #[test] + fn sigstore_sig_from_record() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47+63-multisig.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + // create Selection object + let mut selection = Selection::default(); + selection.set_scaled(2000); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // no sigs should remain + assert_eq!(cl.len(), 6); + for (_idx, rec) in cl.iter() { + // need to pass select again here so we actually downsample + let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap(); + let this_mh = this_sig.minhash().unwrap(); + assert_eq!(this_mh.scaled(), 2000); + } + } + + #[test] + fn sigstore_selection_moltype_zip() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/prot/hp.zip"); + // create Selection object + let mut selection = Selection::default(); + selection.set_scaled(100); + selection.set_moltype(HashFunctions::Murmur64Hp); + // load sigs into collection + select compatible signatures + let cl = Collection::from_zipfile(&filename) + .unwrap() + .select(&selection) + .unwrap(); + // count collection length + assert_eq!(cl.len(), 2); + for (idx, _rec) in cl.iter() { + // need to pass select again here so we actually downsample + let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap(); + let this_mh = this_sig.minhash().unwrap(); + assert_eq!(this_mh.scaled(), 100); + } + } + + #[test] + fn sigstore_selection_moltype_sig() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename + 
.push("../../tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + // create Selection object + let mut selection = Selection::default(); + selection.set_moltype(HashFunctions::Murmur64Hp); + // load sigs into collection + select compatible signatures + let cl = Collection::from_sigs(sigs) + .unwrap() + .select(&selection) + .unwrap(); + // count collection length + assert_eq!(cl.len(), 1); + for (idx, _rec) in cl.iter() { + // need to pass select again here so we actually downsample + let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap(); + let this_mh = this_sig.minhash().unwrap(); + assert_eq!(this_mh.scaled(), 100); + } + } +} diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index 5bad8ec81b..a7ebfdfc96 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -1,5 +1,6 @@ use std::convert::TryInto; -use std::io::{Read, Write}; +use std::fs::File; +use std::io::{BufRead, BufReader, Read, Write}; use std::ops::Deref; use camino::Utf8PathBuf as PathBuf; @@ -200,6 +201,17 @@ impl Select for Manifest { } else { valid }; + valid = if let Some(scaled) = selection.scaled() { + // num sigs have row.scaled = 0, don't include them + valid && row.scaled != 0 && row.scaled <= scaled as u64 + } else { + valid + }; + valid = if let Some(num) = selection.num() { + valid && row.num == num + } else { + valid + }; valid }); @@ -270,6 +282,21 @@ impl From<&[PathBuf]> for Manifest { } } +impl From<&PathBuf> for Manifest { + fn from(pathlist: &PathBuf) -> Self { + let file = File::open(pathlist).unwrap_or_else(|_| panic!("Failed to open {:?}", pathlist)); + let reader = BufReader::new(file); + + let paths: Vec = reader + .lines() + .map(|line| line.unwrap_or_else(|_| panic!("Failed to read line from {:?}", pathlist))) + .map(PathBuf::from) + .collect(); + + paths.as_slice().into() + } +} + impl Deref for Manifest { type Target = Vec; @@ -277,3 +304,87 @@ impl Deref for Manifest { &self.records } } + +#[cfg(test)] +mod test { + use camino::Utf8PathBuf as PathBuf; + use std::fs::File; + use std::io::Write; + use tempfile::TempDir; + + use super::Manifest; + + #[test] + fn manifest_from_pathlist() { + let temp_dir = TempDir::new().unwrap(); + let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf()) + .expect("Path should be valid UTF-8"); + let mut filename = utf8_output.join("sig-pathlist.txt"); + //convert to camino utf8pathbuf + filename = PathBuf::from(filename); + // build sig filenames + let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let test_sigs = vec![ + "../../tests/test-data/47.fa.sig", + "../../tests/test-data/63.fa.sig", + ]; + + let full_paths: Vec<_> = test_sigs + .into_iter() + .map(|sig| base_path.join(sig)) + .collect(); + + // write a file in test directory with a filename on each line + let mut pathfile = File::create(&filename).unwrap(); + for sigfile in &full_paths { + writeln!(pathfile, "{}", sigfile).unwrap(); + } + + // load into manifest + let manifest = Manifest::from(&filename); + assert_eq!(manifest.len(), 2); + } + + #[test] + #[should_panic(expected = "Failed to open \"no-exist\"")] + fn manifest_from_pathlist_nonexistent_file() { + let filename = PathBuf::from("no-exist"); + let _manifest = Manifest::from(&filename); + } + + #[test] + #[should_panic] + fn manifest_from_pathlist_badfile() { + let temp_dir = 
TempDir::new().unwrap(); + let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf()) + .expect("Path should be valid UTF-8"); + let mut filename = utf8_output.join("sig-pathlist.txt"); + //convert to camino utf8pathbuf + filename = PathBuf::from(filename); + + let mut pathfile = File::create(&filename).unwrap(); + write!(pathfile, "Valid line\n").unwrap(); + pathfile.write_all(&[0xED, 0xA0, 0x80]).unwrap(); // invalid UTF-8 + + // load into manifest + let _manifest = Manifest::from(&filename); + } + + #[test] + #[should_panic] + fn manifest_from_paths_badpath() { + let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let test_sigs = vec![ + PathBuf::from("no-exist"), + PathBuf::from("../../tests/test-data/63.fa.sig"), + ]; + + let full_paths: Vec = test_sigs + .into_iter() + .map(|sig| base_path.join(sig)) + .collect(); + + // load into manifest + let _manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice + } +} diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index a75eb6c3f8..381d45c643 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -19,6 +19,7 @@ use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; use crate::selection::{Select, Selection}; +use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; @@ -534,6 +535,39 @@ impl Signature { None } + // return single corresponding sketch + pub fn get_sketch(&self) -> Option<&Sketch> { + if self.signatures.len() != 1 { + if self.signatures.len() > 1 { + todo!("Multiple sketches found! Please run select first."); + } + return None; + } + self.signatures.iter().find(|sk| { + matches!( + sk, + Sketch::MinHash(_) | Sketch::LargeMinHash(_) | Sketch::HyperLogLog(_) + ) + }) + } + + // return minhash directly + pub fn minhash(&self) -> Option<&KmerMinHash> { + if self.signatures.len() != 1 { + if self.signatures.len() > 1 { + todo!("Multiple sketches found! Please run select first."); + } + return None; + } + self.signatures.iter().find_map(|sk| { + if let Sketch::MinHash(mh) = sk { + Some(mh) + } else { + None + } + }) + } + pub fn from_path>(path: P) -> Result, Error> { let mut reader = io::BufReader::new(File::open(path)?); Signature::from_reader(&mut reader) @@ -772,13 +806,16 @@ impl Select for Signature { valid }; // keep compatible scaled if applicable - if let Some(sel_scaled) = selection.scaled() { - valid = if let Sketch::MinHash(mh) = s { - valid && mh.scaled() <= sel_scaled as u64 - } else { - valid - }; - } + valid = if let Some(sel_scaled) = selection.scaled() { + match s { + Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled as u64, + // TODO: test LargeMinHash + // Sketch::LargeMinHash(lmh) => valid && lmh.scaled() <= sel_scaled as u64, + _ => valid, // other sketch types or invalid cases + } + } else { + valid // if selection.scaled() is None, keep prior valid + }; /* valid = if let Some(abund) = selection.abund() { valid && *s.with_abundance() == abund @@ -798,6 +835,7 @@ impl Select for Signature { // downsample the retained sketches if needed. 
if let Some(sel_scaled) = selection.scaled() { for sketch in self.signatures.iter_mut() { + // TODO: also account for LargeMinHash if let Sketch::MinHash(mh) = sketch { if (mh.scaled() as u32) < sel_scaled { *sketch = Sketch::MinHash(mh.downsample_scaled(sel_scaled as u64)?); @@ -1002,6 +1040,95 @@ mod test { } } + #[test] + fn load_minhash_from_signature() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47.fa.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + assert_eq!(sigs.len(), 1); + + let sig = sigs.get(0).unwrap(); + let mh = sig.minhash().unwrap(); + assert_eq!(mh.scaled(), 1000); + } + + #[test] + fn load_single_sketch_from_signature() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47.fa.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + assert_eq!(sigs.len(), 1); + + let sig = sigs.get(0).unwrap(); + let mhdirect = sig.minhash().unwrap(); + let sketch = sig.get_sketch().unwrap(); + if let Sketch::MinHash(mh) = sketch { + assert_eq!(mh.scaled(), 1000); + assert_eq!(mhdirect, mh); // should be the same + } else { + // error + assert!(false); + } + } + + #[test] + #[should_panic] + fn get_sketch_multisketch_panic() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47.fa.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + assert_eq!(sigs.len(), 1); + + let sig = sigs.get(0).unwrap(); + let mut mhdirect = sig.minhash().unwrap().clone(); + // change slightly and push into new_sig + mhdirect.add_sequence(b"ATGGA", false).unwrap(); + let new_sketch = Sketch::MinHash(mhdirect.clone()); + let mut new_sig = sig.clone(); + new_sig.push(new_sketch); + // check there are now two sketches in new_sig + assert_eq!(new_sig.signatures.len(), 2); + + let _ = new_sig.get_sketch(); + } + + #[test] + #[should_panic] + fn load_minhash_multisketch_panic() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47.fa.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + assert_eq!(sigs.len(), 1); + + let sig = sigs.get(0).unwrap(); + let mut mhdirect = sig.minhash().unwrap().clone(); + // change slightly and push into new_sig + mhdirect.add_sequence(b"ATGGA", false).unwrap(); + let new_sketch = Sketch::MinHash(mhdirect.clone()); + let mut new_sig = sig.clone(); + new_sig.push(new_sketch); + // check there are now two sketches in new_sig + assert_eq!(new_sig.signatures.len(), 2); + + let _ = new_sig.minhash(); + } + #[test] fn selection_with_downsample() { let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); From a128ee369d417253e1ff8bd37faf7ece4d13cee4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 14:23:17 -0800 Subject: [PATCH 09/14] Bump rkyv from 0.7.43 to 0.7.44 (#2978) Bumps [rkyv](https://github.com/rkyv/rkyv) from 0.7.43 to 0.7.44.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- src/core/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2883cf4420..84c77f798e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1322,9 +1322,9 @@ checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" -version = "0.7.43" +version = "0.7.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527a97cdfef66f65998b5f3b637c26f5a5ec09cc52a3f9932313ac645f4190f5" +checksum = "5cba464629b3394fc4dbc6f940ff8f5b4ff5c7aef40f29166fd4ad12acbc99c0" dependencies = [ "bitvec", "bytecheck", @@ -1340,9 +1340,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.43" +version = "0.7.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5c462a1328c8e67e4d6dbad1eb0355dd43e8ab432c6e227a43657f16ade5033" +checksum = "a7dddfff8de25e6f62b9d64e6e432bf1c6736c57d20323e15ee10435fbda7c65" dependencies = [ "proc-macro2", "quote", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 53032b8500..16154dc561 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -49,7 +49,7 @@ once_cell = "1.18.0" ouroboros = "0.18.3" piz = "0.5.0" primal-check = "0.3.1" -rkyv = { version = "0.7.43", optional = true } +rkyv = { version = "0.7.44", optional = true } roaring = "0.10.2" rayon = { version = "1.8.1", optional = true } serde = { version = "1.0.196", features = ["derive"] } From c6831fdcf86e84641e024cd33b33b83a5f60ec2a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 5 Feb 2024 18:04:32 -0800 Subject: [PATCH 10/14] MRG: add label output & input options to `compare` and `plot`, for better customization (#2598) Adds `sourmash compare --labels-to` and `sourmash plot --labels-from` to support better label customization. Fixes https://github.com/sourmash-bio/sourmash/issues/2452 Fixes https://github.com/sourmash-bio/sourmash/issues/2915 ## `sourmash compare --labels-to` This command will generate a 'labels-to' file. Running: ``` sourmash compare tests/test-data/demo/*.sig -o compare-demo \ --labels-to compare-demo-labels.csv ``` will produce a file, `compare-demo-labels.csv`, that looks like this: ```csv sort_order,md5,label,name,filename,signature_file 1,60f7e23c24a8d94791cc7a8680c493f9,SRR2060939_1.fastq.gz,,SRR2060939_1.fastq.gz,../tests/test-data/demo/SRR2060939_1.sig 2,4e94e60265e04f0763142e20b52c0da1,SRR2060939_2.fastq.gz,,SRR2060939_2.fastq.gz,../tests/test-data/demo/SRR2060939_2.sig 3,f71e78178af9e45e6f1d87a0c53c465c,SRR2241509_1.fastq.gz,,SRR2241509_1.fastq.gz,../tests/test-data/demo/SRR2241509_1.sig 4,6d6e87e1154e95b279e5e7db414bc37b,SRR2255622_1.fastq.gz,,SRR2255622_1.fastq.gz,../tests/test-data/demo/SRR2255622_1.sig 5,0107d767a345eff67ecdaed2ee5cd7ba,SRR453566_1.fastq.gz,,SRR453566_1.fastq.gz,../tests/test-data/demo/SRR453566_1.sig 6,f0c834bc306651d2b9321fb21d3e8d8f,SRR453569_1.fastq.gz,,SRR453569_1.fastq.gz,../tests/test-data/demo/SRR453569_1.sig 7,b59473c94ff2889eca5d7165936e64b3,SRR453570_1.fastq.gz,,SRR453570_1.fastq.gz,../tests/test-data/demo/SRR453570_1.sig ``` The `label` column in this file can be edited to suit the user's needs; the index column is `sort_order`, and all other columns can be ignored or deleted or updated without consequence. ## `sourmash plot --labels-from` This command will load labels from a file.
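Under the hood, `plot --labels-from` keeps only the `sort_order` and `label` columns: rows are sorted by `sort_order` interpreted as an integer, and the sorted `label` values become the plot labels. Here is a minimal sketch of that loading logic, using the stdlib `csv` module in place of sourmash's `FileInputCSV` wrapper:

```python
# Minimal sketch of the --labels-from loading logic: collect
# (sort_order, label) pairs, sort numerically by sort_order, and
# keep just the labels. All other columns are ignored.
import csv


def load_labels(labelfilename):
    pairs = []
    with open(labelfilename, newline="") as fp:
        for row in csv.DictReader(fp):
            pairs.append((int(row["sort_order"]), row["label"]))
    pairs.sort()
    return [label for _, label in pairs]
```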
Running: ``` sourmash plot --labels-from compare-demo-new-labels.csv compare-demo ``` uses the `label` column from the CSV as labels, in the order specified by the `sort_order` column (interpreted as integers and sorted from lowest to highest). All other columns are ignored. ## Example in a Jupyter Notebook Some example code for updating the labels is available here: https://github.com/sourmash-bio/sourmash/blob/compare_labels/doc/plotting-compare.ipynb ## TODO - [x] add test for `args.labeltext and args.labels_from` check - [x] check the notebook update ## Future: - [ ] Consider switching to `LinearIndex` in the signature loading code, as that would let us maintain the location in the code without the current machinations. Also worth thinking about enabling lazy loading, which some future `Index`-code based modification might support. - [ ] consider if and how to validate --labels-from CSV file... --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/command-line.md | 2 +- src/sourmash/cli/compare.py | 5 + src/sourmash/cli/plot.py | 5 + src/sourmash/commands.py | 81 ++++++++++--- tests/test-data/compare/labels_from-test.csv | 5 + tests/test_sourmash.py | 116 ++++++++++++++++++- 6 files changed, 194 insertions(+), 20 deletions(-) create mode 100644 tests/test-data/compare/labels_from-test.csv diff --git a/doc/command-line.md b/doc/command-line.md index 4697797e1b..14e875a9d6 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -224,7 +224,6 @@ sourmash compare [ ... ] Options: * `--output ` -- save the output matrix to this file, as a numpy binary matrix. -* `--csv ` -- save the output matrix to this file in CSV format. * `--distance-matrix` -- create and output a distance matrix, instead of a similarity matrix. * `--ksize ` -- do the comparisons at this k-mer size. * `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)` @@ -233,6 +232,7 @@ Options: * `--ignore-abundance` -- ignore abundances in signatures. * `--picklist ::` -- select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures) * `--csv ` -- save the output matrix in CSV format. +* `--labels-to ` -- create a CSV file (spreadsheet) that can be passed in to `sourmash plot` with `--labels-from` in order to customize the labels. **Note:** compare by default produces a symmetric similarity matrix that can be used for clustering in downstream tasks.
With `--containment`, diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index 74da5bd837..45844aaa1d 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -94,6 +94,11 @@ def subparser(subparsers): metavar="F", help="write matrix to specified file in CSV format (with column " "headers)", ) + subparser.add_argument( + "--labels-to", + "--labels-save", + help="a CSV file containing label information", + ) subparser.add_argument( "-p", "--processes", diff --git a/src/sourmash/cli/plot.py b/src/sourmash/cli/plot.py index 718a5c8528..dd4726c365 100644 --- a/src/sourmash/cli/plot.py +++ b/src/sourmash/cli/plot.py @@ -72,6 +72,11 @@ def subparser(subparsers): help="write clustered matrix and labels out in CSV format (with column" " headers) to this file", ) + subparser.add_argument( + "--labels-from", + "--labels-load", + help="a CSV file containing label information to use on plot; implies --labels", + ) def main(args): diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index e2d1a09a50..920693f9df 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -71,13 +71,19 @@ def compare(args): notify( f"\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}" ) - siglist.extend(loaded) - # track ksizes/moltypes + # add to siglist; track ksizes/moltypes + s = None for s in loaded: + siglist.append((s, filename)) ksizes.add(s.minhash.ksize) moltypes.add(sourmash_args.get_moltype(s)) + if s is None: + notify( + f"\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}" + ) + # error out while loading if we have more than one ksize/moltype if len(ksizes) > 1 or len(moltypes) > 1: break @@ -105,7 +111,7 @@ def compare(args): # check to make sure they're potentially compatible - either using # scaled, or not. - scaled_sigs = [s.minhash.scaled for s in siglist] + scaled_sigs = [s.minhash.scaled for (s, _) in siglist] is_scaled = all(scaled_sigs) is_scaled_2 = any(scaled_sigs) @@ -145,16 +151,20 @@ def compare(args): # notify about implicit --ignore-abundance: if is_containment or return_ani: - track_abundances = any(s.minhash.track_abundance for s in siglist) + track_abundances = any(s.minhash.track_abundance for s, _ in siglist) if track_abundances: notify( "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances." ) + # CTB: note, up to this point, we could do everything with manifests + # w/o actually loading any signatures. I'm not sure the manifest + # API allows it tho. + # if using scaled sketches or --scaled, downsample to common max scaled. 
printed_scaled_msg = False if is_scaled: - max_scaled = max(s.minhash.scaled for s in siglist) + max_scaled = max(s.minhash.scaled for s, _ in siglist) if args.scaled: args.scaled = int(args.scaled) @@ -166,7 +176,7 @@ def compare(args): notify(f"WARNING: continuing with scaled value of {max_scaled}.") new_siglist = [] - for s in siglist: + for s, filename in siglist: if not size_may_be_inaccurate and not s.minhash.size_is_accurate(): size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: @@ -177,9 +187,9 @@ def compare(args): printed_scaled_msg = True with s.update() as s: s.minhash = s.minhash.downsample(scaled=max_scaled) - new_siglist.append(s) + new_siglist.append((s, filename)) else: - new_siglist.append(s) + new_siglist.append((s, filename)) siglist = new_siglist elif args.scaled is not None: error("ERROR: cannot specify --scaled with non-scaled signatures.") @@ -196,16 +206,20 @@ def compare(args): # do all-by-all calculation - labeltext = [str(item) for item in siglist] + labeltext = [str(ss) for ss, _ in siglist] + sigsonly = [ss for ss, _ in siglist] if args.containment: - similarity = compare_serial_containment(siglist, return_ani=return_ani) + similarity = compare_serial_containment(sigsonly, return_ani=return_ani) elif args.max_containment: - similarity = compare_serial_max_containment(siglist, return_ani=return_ani) + similarity = compare_serial_max_containment(sigsonly, return_ani=return_ani) elif args.avg_containment: - similarity = compare_serial_avg_containment(siglist, return_ani=return_ani) + similarity = compare_serial_avg_containment(sigsonly, return_ani=return_ani) else: similarity = compare_all_pairs( - siglist, args.ignore_abundance, n_jobs=args.processes, return_ani=return_ani + sigsonly, + args.ignore_abundance, + n_jobs=args.processes, + return_ani=return_ani, ) # if distance matrix desired, switch to 1-similarity @@ -215,7 +229,7 @@ def compare(args): matrix = similarity if len(siglist) < 30: - for i, ss in enumerate(siglist): + for i, (ss, filename) in enumerate(siglist): # for small matrices, pretty-print some output name_num = f"{i}-{str(ss)}" if len(name_num) > 20: @@ -246,6 +260,25 @@ def compare(args): with open(args.output, "wb") as fp: numpy.save(fp, matrix) + # output labels information via --labels-to? + if args.labels_to: + labeloutname = args.labels_to + notify(f"saving labels to: {labeloutname}") + with sourmash_args.FileOutputCSV(labeloutname) as fp: + w = csv.writer(fp) + w.writerow( + ["sort_order", "md5", "label", "name", "filename", "signature_file"] + ) + + for n, (ss, location) in enumerate(siglist): + md5 = ss.md5sum() + sigfile = location + label = str(ss) + name = ss.name + filename = ss.filename + + w.writerow([str(n + 1), md5, label, name, filename, sigfile]) + # output CSV? 
if args.csv: with FileOutputCSV(args.csv) as csv_fp: @@ -289,7 +322,10 @@ def plot(args): notify("...got {} x {} matrix.", *D.shape) # see sourmash#2790 for details :) - if args.labeltext or args.labels: + if args.labeltext or args.labels or args.labels_from: + if args.labeltext and args.labels_from: + notify("ERROR: cannot supply both --labeltext and --labels-from") + sys.exit(-1) display_labels = True args.labels = True # override => labels always true elif args.labels is None and not args.indices: @@ -303,13 +339,24 @@ def plot(args): else: display_labels = False - if args.labels: + if args.labels_from: + labelfilename = args.labels_from + notify(f"loading labels from CSV file '{labelfilename}'") + + labeltext = [] + with sourmash_args.FileInputCSV(labelfilename) as r: + for row in r: + order, label = row["sort_order"], row["label"] + labeltext.append((int(order), label)) + labeltext.sort() + labeltext = [t[1] for t in labeltext] + elif args.labels: if args.labeltext: labelfilename = args.labeltext else: labelfilename = D_filename + ".labels.txt" - notify(f"loading labels from {labelfilename}") + notify(f"loading labels from text file '{labelfilename}'") with open(labelfilename) as f: labeltext = [x.strip() for x in f] diff --git a/tests/test-data/compare/labels_from-test.csv b/tests/test-data/compare/labels_from-test.csv new file mode 100644 index 0000000000..902c045e60 --- /dev/null +++ b/tests/test-data/compare/labels_from-test.csv @@ -0,0 +1,5 @@ +sort_order,md5,label,name,filename,signature_file +4,8a619747693c045afde376263841806b,genome-s10+s11-CHANGED,genome-s10+s11,-,/Users/t/dev/sourmash/tests/test-data/genome-s10+s11.sig +3,ff511252a80bb9a7dbb0acf62626e123,genome-s12-CHANGED,genome-s12,genome-s12.fa.gz,/Users/t/dev/sourmash/tests/test-data/genome-s12.fa.gz.sig +2,1437d8eae64bad9bdc8d13e1daa0a43e,genome-s11-CHANGED,genome-s11,genome-s11.fa.gz,/Users/t/dev/sourmash/tests/test-data/genome-s11.fa.gz.sig +1,4cb3290263eba24548f5bef38bcaefc9,genome-s10-CHANGED,genome-s10,genome-s10.fa.gz,/Users/t/dev/sourmash/tests/test-data/genome-s10.fa.gz.sig \ No newline at end of file diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 7aaac0446e..ed8cc80b45 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -151,6 +151,7 @@ def test_compare_serial(runtmp): testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) + assert len(testsigs) == 4 c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--dna", *testsigs) @@ -1252,7 +1253,7 @@ def test_plot_override_labeltext(runtmp): print(runtmp.last_result.out) - assert "loading labels from new.labels.txt" in runtmp.last_result.err + assert "loading labels from text file 'new.labels.txt'" in runtmp.last_result.err expected = """\ 0\ta @@ -1291,7 +1292,7 @@ def test_plot_override_labeltext_fail(runtmp): print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert "loading labels from new.labels.txt" in runtmp.last_result.err + assert "loading labels from text file 'new.labels.txt'" in runtmp.last_result.err assert "3 labels != matrix size, exiting" in runtmp.last_result.err @@ -1406,6 +1407,117 @@ def test_plot_subsample_2(runtmp): assert expected in runtmp.last_result.out +def test_compare_and_plot_labels_from_to(runtmp): + # test doing compare --labels-to and plot --labels-from. 
+ testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + labels_csv = runtmp.output("label.csv") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + "--labels-to", + labels_csv, + ) + + runtmp.sourmash("plot", "cmp", "--labels-from", labels_csv) + + print(runtmp.last_result.out) + + assert "loading labels from CSV file" in runtmp.last_result.err + + expected = """\ +0\tgenome-s10 +1\tgenome-s11 +2\tgenome-s12 +3\tgenome-s10+s11""" + assert expected in runtmp.last_result.out + + +def test_compare_and_plot_labels_from_changed(runtmp): + # test 'plot --labels-from' with changed labels + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + labels_csv = utils.get_test_data("compare/labels_from-test.csv") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--labels-from", labels_csv) + + print(runtmp.last_result.out) + + assert "loading labels from CSV file" in runtmp.last_result.err + + expected = """\ +0\tgenome-s10-CHANGED +1\tgenome-s11-CHANGED +2\tgenome-s12-CHANGED +3\tgenome-s10+s11-CHANGED""" + assert expected in runtmp.last_result.out + + +def test_compare_and_plot_labels_from_error(runtmp): + # 'plot --labels-from ... --labeltext ...' should fail + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + labels_csv = utils.get_test_data("compare/labels_from-test.csv") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "plot", + "cmp", + "--labels-from", + labels_csv, + "--labeltext", + labels_csv, + fail_ok=True, + ) + + err = runtmp.last_result.err + assert "ERROR: cannot supply both --labeltext and --labels-from" in err + + @utils.in_tempdir def test_search_query_sig_does_not_exist(c): testdata1 = utils.get_test_data("short.fa") From 732fc37aec8454c4491d4e80508a7baa5dbae45b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 03:03:24 +0000 Subject: [PATCH 11/14] Bump tempfile from 3.9.0 to 3.10.0 (#2979) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [tempfile](https://github.com/Stebalien/tempfile) from 3.9.0 to 3.10.0.
Changelog, sourced from tempfile's changelog:

3.10.0

- Drop `redox_syscall` dependency, we now use rustix for Redox.
- Add `Builder::permissions` for setting the permissions on temporary files and directories (thanks to @Byron).
- Update rustix to 0.38.31.
- Update fastrand to 2.0.1.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 28 +++++++++------------------- src/core/Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84c77f798e..6913ea4907 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -548,9 +548,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "finch" @@ -772,9 +772,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.151" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libloading" @@ -1275,15 +1275,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex" version = "1.5.6" @@ -1392,9 +1383,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.1", "errno", @@ -1587,14 +1578,13 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", - "rustix 0.38.28", + "rustix 0.38.31", "windows-sys 0.52.0", ] diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 16154dc561..6a65d1b859 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -64,7 +64,7 @@ criterion = "0.5.1" needletail = { version = "0.5.1", default-features = false } proptest = { version = "1.4.0", default-features = false, features = ["std"]} rand = "0.8.2" -tempfile = "3.9.0" +tempfile = "3.10.0" [[bench]] name = "compute" From 427712cc50c2a14dd44d2863f2b2f85228bc2441 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:54:41 +0000 Subject: [PATCH 12/14] Bump pypa/cibuildwheel from 2.16.4 to 2.16.5 (#2981) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.16.4 to 2.16.5.
Release notes, sourced from pypa/cibuildwheel's releases:

v2.16.5 (30 January 2024)

- 🐛 Fix an incompatibility with the GitHub Action and new GitHub Runner images for Windows that bundle Powershell 7.3+ (#1741)
- 🛠 Preliminary support for new macos-14 arm64 runners (#1743)
Commits:

- ce3fb78 Bump version: v2.16.5
- 5b0b458 fix: download pipx for action, allow support for M1 (#1743)
- a7ea5fb Merge pull request #1739 from henryiii/henryiii/chore/checkschemas
- bc55e8b Merge pull request #1741 from jborean93/pwsh-7.4
- c753cd2 Add support for PowerShell 7.4 in GHA
- 07bd78c chore: check schemas
- d7db575 docs: add keyvi as an example that combines cibuildwheel with the ccache acti...
- 7154e18 [Bot] Update dependencies (#1738)

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index faf0ccd8af..841c0a03b9 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -46,7 +46,7 @@ jobs: python-version: '3.10' - name: Build wheels - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 env: CIBW_ENVIRONMENT_MACOS: ${{ matrix.macos_target }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} From e5cdc36b8eb9975134912cd929d170a4b503a0b3 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Thu, 8 Feb 2024 15:58:13 -0800 Subject: [PATCH 13/14] MRG: re-establish `tax` gather reading flexibility (#2986) A while back, I introduced `GatherRow` to handle checking for required gather columns for us. However, it ended up being overly restrictive -- any extra columns cause `gather_csv` reading to fail. Here, I add a filtration step that lets us ignore unspecified columns entirely before reading a GatherRow. Initializing the GatherRow after this filtration continues to handle the checks for all required columns while restoring flexibility. As a consequence, we can actually delete all the `non-essential` names in `GatherRow` and avoid carrying them around (saving some memory) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/sourmash/tax/tax_utils.py | 51 +++++++++++++++++------------------ tests/test_tax_utils.py | 19 ++++++++++++- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 55b30a540e..d1827c2aad 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -6,7 +6,7 @@ from collections import abc, defaultdict from itertools import zip_longest from typing import NamedTuple -from dataclasses import dataclass, field, replace, asdict +from dataclasses import dataclass, field, replace, asdict, fields import gzip from sourmash import sqlite_utils, sourmash_args @@ -742,7 +742,10 @@ def load_gather_results( for n, row in enumerate(r): # try reading each gather row into a TaxResult try: - gatherRow = GatherRow(**row) + filt_row = filter_row( + row, GatherRow + ) # filter row first to allow extra (unused) columns in csv + gatherRow = GatherRow(**filt_row) except TypeError as exc: raise ValueError( f"'{gather_csv}' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." @@ -1675,6 +1678,20 @@ def load(cls, locations, **kwargs): return tax_assign +def filter_row(row, dataclass_type): + """ + Filter the row to only include keys that exist in the dataclass fields. + This allows extra columns to be passed in with the gather csv while still + taking advantage of the checks for required columns that come with dataclass + initialization. 
+ """ + valid_keys = {field.name for field in fields(dataclass_type)} + # 'match_name' and 'name' should be interchangeable (sourmash 4.x) + if "match_name" in row.keys() and "name" not in row.keys(): + row["name"] = row.pop("match_name") + return {k: v for k, v in row.items() if k in valid_keys} + + @dataclass class GatherRow: """ @@ -1689,7 +1706,8 @@ class GatherRow: with sourmash_args.FileInputCSV(gather_csv) as r: for row in enumerate(r): - gatherRow = GatherRow(**row) + filt_row = filter_row(row, GatherRow) # filter first to allow extra columns + gatherRow = GatherRow(**filt_row) """ # essential columns @@ -1706,32 +1724,10 @@ class GatherRow: ksize: int scaled: int - # non-essential - intersect_bp: int = None - f_orig_query: float = None - f_match: float = None - average_abund: float = None - median_abund: float = None - std_abund: float = None - filename: str = None - md5: str = None - f_match_orig: float = None - gather_result_rank: str = None - moltype: str = None + # non-essential, but used if available query_n_hashes: int = None - query_abundance: int = None - query_containment_ani: float = None - match_containment_ani: float = None - average_containment_ani: float = None - max_containment_ani: float = None - potential_false_negative: bool = None - n_unique_weighted_found: int = None sum_weighted_found: int = None total_weighted_hashes: int = None - query_containment_ani_low: float = None - query_containment_ani_high: float = None - match_containment_ani_low: float = None - match_containment_ani_high: float = None @dataclass @@ -1854,7 +1850,8 @@ class TaxResult(BaseTaxResult): with sourmash_args.FileInputCSV(gather_csv) as r: for row in enumerate(r): - gatherRow = GatherRow(**row) + filt_row = filter_row(row, GatherRow) # this filters any extra columns + gatherRow = GatherRow(**filt_row) # this checks for required columns and raises TypeError for any missing # initialize TaxResult tax_res = TaxResult(raw=gatherRow) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a362984532..bd0060b65a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -37,6 +37,7 @@ LineageDB, LineageDB_Sqlite, MultiLineageDB, + filter_row, ) @@ -93,7 +94,8 @@ def make_GatherRow(gather_dict=None, exclude_cols=[]): gatherD.update(gather_dict) for col in exclude_cols: gatherD.pop(col) - gatherRaw = GatherRow(**gatherD) + fgatherD = filter_row(gatherD, GatherRow) + gatherRaw = GatherRow(**fgatherD) return gatherRaw @@ -807,6 +809,21 @@ def test_GatherRow_old_gather(): assert "__init__() missing 1 required positional argument: 'query_bp'" in str(exc) +def test_GatherRow_match_name_not_name(): + # gather contains match_name but not name column + gA = {"match_name": "gA.1 name"} + grow = make_GatherRow(gA, exclude_cols=["name"]) + print(grow) + assert grow.name == "gA.1 name" + + +def test_GatherRow_extra_cols(): + # gather contains extra columns + gA = {"not-a-col": "nope"} + grow = make_GatherRow(gA) + assert isinstance(grow, GatherRow) + + def test_get_ident_default(): ident = "GCF_001881345.1" n_id = get_ident(ident) From 712724866254e34cf23c6a24e7d20600f0259dd3 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Thu, 8 Feb 2024 16:53:00 -0800 Subject: [PATCH 14/14] MRG: docs: add branchwater reference; make FAQ more visible (#2984) - fixes #2983 - adds links to branchwater plugin view draft docs: https://sourmash--2984.org.readthedocs.build/en/2984/ --------- Co-authored-by: C. 
Titus Brown --- doc/faq.md | 3 +++ doc/index.md | 8 +++++++- doc/sidebar.md | 5 ++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/faq.md b/doc/faq.md index df17c56726..d8d9da0622 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -302,3 +302,6 @@ lot of new k-mers that pop up with a longer k-mer size (e.g. because of included variation)? These questions must be answered by experimentation and may be data-set specific. +## Can sourmash run with multiple threads? + +sourmash is currently single-threaded, but the [branchwater plugin for sourmash](https://github.com/sourmash-bio/sourmash_plugin_branchwater) provides faster and lower-memory multithreaded implementations of several important sourmash features: sketching, searching, and gather (metagenome decomposition). It does so by implementing higher-level functions in Rust on top of the core Rust library of sourmash. As a result it provides some of the same functionality as sourmash, but 10-100x faster and in 10x lower memory. Note that this code is functional and tested, but does not have all of the features of sourmash. Code and features will be integrated back into sourmash as they mature. diff --git a/doc/index.md b/doc/index.md index 20e5d4f9ff..6b3e0e21f7 100644 --- a/doc/index.md +++ b/doc/index.md @@ -58,6 +58,8 @@ sourmash is inspired by [mash](https://mash.readthedocs.io), and supports most mash analyses. sourmash also implements an expanded set of functionality for metagenome and taxonomic analysis. +While sourmash is currently single-threaded, the [branchwater plugin for sourmash](https://github.com/sourmash-bio/sourmash_plugin_branchwater) provides faster and lower-memory multithreaded implementations of several important sourmash features: sketching, searching, and gather (metagenome decomposition). It does so by implementing higher-level functions in Rust on top of the core Rust library of sourmash. As a result it provides some of the same functionality as sourmash, but 10-100x faster and in 10x lower memory. Note that this code is functional and tested, but does not have all of the features of sourmash. Code and features will be integrated back into sourmash as they mature. + sourmash development was initiated with a grant from the Moore Foundation under the Data Driven Discovery program, and has been supported by further funding from the NIH and NSF. Please see @@ -94,6 +96,10 @@ X and Linux. They require about 5 GB of disk space and 5 GB of RAM. * [A short guide to using sourmash output with R](other-languages.md). +## Frequently Asked Questions + +* [Frequently asked questions](faq.md) + ### How sourmash works under the hood * [An introduction to k-mers for genome comparison and analysis](kmers-and-minhash.ipynb) @@ -128,6 +134,6 @@ hidden: true sidebar command-line -api-example databases +api-example ``` diff --git a/doc/sidebar.md b/doc/sidebar.md index 5e81538fba..f4e6e8170c 100644 --- a/doc/sidebar.md +++ b/doc/sidebar.md @@ -25,6 +25,10 @@ X and Linux. They require about 5 GB of disk space and 5 GB of RAM. * [A short guide to using sourmash output with R](other-languages.md). +## Frequently Asked Questions + +* [Frequently asked questions](faq.md) + ## How sourmash works under the hood * [An introduction to k-mers for genome comparison and analysis](kmers-and-minhash.ipynb)
* [Publications about sourmash](publications.md) * [A guide to the internal design and structure of sourmash](sourmash-internals.md) * [Funding acknowledgements](funding.md) -* [Frequently asked questions](faq.md) ## Developing and extending sourmash
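As an illustration of the FAQ entry added above: once the branchwater plugin is installed, its multithreaded implementations are exposed as `sourmash scripts` subcommands. The sketch below is illustrative only, not part of this patch; the package name matches the repository linked above, but the `fastgather` subcommand and its arguments are assumptions to verify against the plugin's own documentation.

```python
# Hedged sketch: drive the branchwater plugin's multithreaded gather from
# Python. Assumes `pip install sourmash_plugin_branchwater` has been run
# and that the plugin provides a `fastgather` subcommand under
# `sourmash scripts`; check the plugin's docs for current command names
# and options before relying on this.
import subprocess

subprocess.run(
    [
        "sourmash", "scripts", "fastgather",
        "metagenome.sig",   # hypothetical query sketch
        "database.zip",     # hypothetical collection to search against
        "-o", "gather-results.csv",
    ],
    check=True,
)
```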